diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 41fd633bc..cb69a70ac 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -72,12 +72,6 @@ references_i ## number of unique http references from same host to referenced url references_internal_i -## ids of unique http references from same host to referenced url -#references_internal_id_sxt - -## urls of unique http references from same host to referenced url -#references_internal_url_sxt - ## number of unique http references from external hosts references_external_i diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index aef6e5523..3e26b6ca7 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -42,6 +41,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; +import net.yacy.cora.storage.HandleSet; import net.yacy.crawler.HarvestProcess; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; @@ -51,6 +51,8 @@ import net.yacy.kelondro.logging.Log; import net.yacy.peers.graphics.WebStructureGraph.StructureEntry; import net.yacy.search.Switchboard; import net.yacy.search.index.Fulltext; +import net.yacy.search.index.Segment.ReferenceReport; +import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -274,8 +276,6 @@ public class HostBrowser { CollectionSchema.clickdepth_i.getSolrFieldName(), CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(), - CollectionSchema.references_internal_id_sxt.getSolrFieldName(), - CollectionSchema.references_internal_url_sxt.getSolrFieldName(), CollectionSchema.references_external_i.getSolrFieldName(), CollectionSchema.references_exthosts_i.getSolrFieldName(), CollectionSchema.cr_host_chance_d.getSolrFieldName(), @@ -289,13 +289,15 @@ public class HostBrowser { Map infoCache = new HashMap(); int hostsize = 0; final List deleteIDs = new ArrayList(); - long timeout = System.currentTimeMillis() + TIMEOUT; + long timeoutList = System.currentTimeMillis() + TIMEOUT; + long timeoutReferences = System.currentTimeMillis() + 3000; + ReferenceReportCache rrCache = sb.index.getReferenceReportCache(); while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); FailType error = errortype == null ? null : FailType.valueOf(errortype); String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), doc)); + infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences)); if (u.startsWith(path)) { if (delete) { deleteIDs.add(ids); @@ -315,7 +317,7 @@ public class HostBrowser { if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); } - // collect outboundlinks to browse to the outbound + // collect referrer links links = URIMetadataNode.getLinks(doc, false); while (links.hasNext()) { u = links.next(); @@ -332,7 +334,7 @@ public class HostBrowser { } catch (MalformedURLException e) {} } } - if (System.currentTimeMillis() > timeout) break; + if (System.currentTimeMillis() > timeoutList) break; } if (deleteIDs.size() > 0) sb.remove(deleteIDs); @@ -511,17 +513,13 @@ public class HostBrowser { public Integer cr_n; public Double cr_c; public int clickdepth, references, references_internal, references_external, references_exthosts; - public List references_internal_urls; - private final Fulltext fulltext; - public InfoCacheEntry(final Fulltext fulltext, final SolrDocument doc) { - this.fulltext = fulltext; + public List references_internal_urls, references_external_urls; + public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) { this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName()); this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName()); Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName()); Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); - Collection rc_internal_id = doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName()); - Collection rc_internal_url = doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName()); Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue(); @@ -529,21 +527,52 @@ public class HostBrowser { this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue(); // calculate the url reference list this.references_internal_urls = new ArrayList(); - if (rc_internal_url != null) { - for (Object o: rc_internal_url) references_internal_urls.add((String) o); - } else if (rc_internal_id != null) { - for (Object o: rc_internal_id) { - DigestURI u = fulltext.getURL(ASCII.getBytes((String) o)); - if (u != null) references_internal_urls.add(u.toNormalform(true)); + this.references_external_urls = new ArrayList(); + if (fetchReferences) { + // get the references from the citation index + try { + ReferenceReport rr = rrCache.getReferenceReport(ASCII.getBytes(urlhash), false); + List internalIDs = new ArrayList(); + List externalIDs = new ArrayList(); + HandleSet iids = rr.getInternallIDs(); + for (byte[] b: iids) internalIDs.add(ASCII.String(b)); + HandleSet eids = rr.getExternalIDs(); + for (byte[] b: eids) externalIDs.add(ASCII.String(b)); + // get all urls from the index and store them here + for (String id: internalIDs) { + if (id.equals(urlhash)) continue; // no self-references + DigestURI u = fulltext.getURL(ASCII.getBytes(id)); + if (u != null) references_internal_urls.add(u.toNormalform(true)); + } + for (String id: externalIDs) { + if (id.equals(urlhash)) continue; // no self-references + DigestURI u = fulltext.getURL(ASCII.getBytes(id)); + if (u != null) references_external_urls.add(u.toNormalform(true)); + } + } catch (IOException e) { } + } this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue(); this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue(); } public String toString() { - StringBuilder sb = new StringBuilder(); - for (String s: references_internal_urls) sb.append("info"); - if (sb.length() == 0 && !fulltext.getDefaultConfiguration().contains(CollectionSchema.references_internal_id_sxt)) sb.append("info"); + StringBuilder sbi = new StringBuilder(); + int c = 0; + for (String s: references_internal_urls) { + sbi.append("info"); + c++; + if (c % 80 == 0) sbi.append("
"); + } + if (sbi.length() > 0) sbi.insert(0, "
internal referrer:
"); + StringBuilder sbe = new StringBuilder(); + c = 0; + for (String s: references_external_urls) { + sbe.append("info"); + c++; + if (c % 80 == 0) sbe.append("
"); + } + if (sbe.length() > 0) sbe.insert(0, "
external referrer:
"); return (this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : @@ -551,7 +580,7 @@ public class HostBrowser { (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") + (this.cr_n != null ? ", crn=" + this.cr_n : "") + (this.references >= 0 ? - ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + (sb.length() > 0 ? " " + sb.toString() + "" : "") : + ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : ""); } } diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 399562c5d..27c4211d8 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -24,7 +24,6 @@ import java.io.File; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; -import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -38,7 +37,6 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.storage.Configuration; import net.yacy.cora.storage.HandleSet; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; @@ -94,15 +92,12 @@ public class SchemaConfiguration extends Configuration implements Serializable { return false; } - public boolean postprocessing_references(Fulltext fulltext, ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { + public boolean postprocessing_references(ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { if (!(this.contains(CollectionSchema.references_i) || this.contains(CollectionSchema.references_internal_i) || - this.contains(CollectionSchema.references_internal_id_sxt) || this.contains(CollectionSchema.references_internal_url_sxt) || this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false; Integer all_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer internal_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); - Collection internal_ids_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName()); - Collection internal_urls_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName()); Integer external_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer exthosts_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); Integer hostextc_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName()); @@ -111,14 +106,6 @@ public class SchemaConfiguration extends Configuration implements Serializable { List internalIDs = new ArrayList(); HandleSet iids = rr.getInternallIDs(); for (byte[] b: iids) internalIDs.add(ASCII.String(b)); - List internalURLs = new ArrayList(); - if (this.contains(CollectionSchema.references_internal_url_sxt)) { - // get all urls from the index and store them here - for (String id: internalIDs) { - DigestURI u = fulltext.getURL(ASCII.getBytes(id)); - if (u != null) internalURLs.add(u.toNormalform(true)); - } - } boolean change = false; int all = rr.getExternalCount() + rr.getInternalCount(); @@ -132,16 +119,6 @@ public class SchemaConfiguration extends Configuration implements Serializable { sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), rr.getInternalCount()); change = true; } - if (this.contains(CollectionSchema.references_internal_id_sxt) && - (internal_ids_old == null || internal_ids_old.size() != internalIDs.size())) { - sid.setField(CollectionSchema.references_internal_id_sxt.getSolrFieldName(), internalIDs); - change = true; - } - if (this.contains(CollectionSchema.references_internal_url_sxt) && - (internal_urls_old == null || internal_urls_old.size() != internalURLs.size())) { - sid.setField(CollectionSchema.references_internal_url_sxt.getSolrFieldName(), internalURLs); - change = true; - } if (this.contains(CollectionSchema.references_external_i) && (external_old == null || external_old.intValue() != rr.getExternalCount())) { sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), rr.getExternalCount()); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 8abe8b42c..f6e122e3e 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -705,7 +705,7 @@ public class Segment { // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.connectedCitation()) { - this.fulltext.getDefaultConfiguration().postprocessing_references(this.fulltext, this.getReferenceReportCache(), null, vector, url, null); + this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), null, vector, url, null); } // STORE TO SOLR String error = null; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 418657f46..5ae5d4645 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -897,7 +897,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString()); hostExtentCache.put(hosthash, count); } - if (postprocessing_references(segment.fulltext(), rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++; + if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++; // all processing steps checked, remove the processing tag sid.removeField(CollectionSchema.process_sxt.getSolrFieldName()); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 5681915c9..07ba7877e 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -52,8 +52,6 @@ public enum CollectionSchema implements SchemaDeclaration { httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"), references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"), - references_internal_id_sxt(SolrType.string, true, true, true, false, true, "ids of unique http references from same host to referenced url"), - references_internal_url_sxt(SolrType.string, true, true, true, false, true, "urls of unique http references from same host to referenced url"), references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"), references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"), clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),