From f7e77a21bf166172fa0e41d89125f46aea2d4831 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 7 Jun 2013 13:20:57 +0200 Subject: [PATCH] Added a citation reference computation for intra-domain link structures. While the values for the reference evaluation are computed, a backlink structure can also be discovered and written to the index. The host browser has been extended to show such backlinks for each presented link; it can therefore now show where a document is linked from. The new citation reference is computed as the likelihood of a random click path, with recursive usage of previously computed likelihoods. This process is repeated until the likelihood converges to a specific number. This number is then normalized to a ranking value CRn, 0<=CRn<=1. The value CRn can therefore be used to rank popularity within intra-domain link structures. --- defaults/solr.collection.schema | 25 +- htroot/HostBrowser.java | 48 ++- .../yacy/cora/federate/solr/ProcessType.java | 2 +- .../federate/solr/SchemaConfiguration.java | 78 +++-- .../solr/connector/AbstractSolrConnector.java | 2 +- .../kelondro/data/meta/URIMetadataNode.java | 6 +- .../kelondro/workflow/WorkflowProcessor.java | 5 +- source/net/yacy/search/index/Segment.java | 107 ++++++- .../schema/CollectionConfiguration.java | 295 +++++++++++++++--- .../yacy/search/schema/CollectionSchema.java | 11 +- 10 files changed, 491 insertions(+), 88 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 7b7a5f030..23c6d5c66 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -69,9 +69,15 @@ httpstatus_i ## number of unique http references, should be equal to references_internal_i + references_external_i references_i -## number of unique http references from same host as referenced url +## number of unique http references from same host to referenced url references_internal_i +## ids of unique http references from same host to referenced url +#references_internal_id_sxt + +## urls of unique http references from same host to referenced url +#references_internal_url_sxt + ## number of unique http references from external hosts references_external_i @@ -93,8 +99,8 @@ load_date_dt ## date until resource shall be considered as fresh fresh_date_dt -## ids of referrer to this document -referrer_id_txt +## id of the referrer to this document, discovered during crawling +referrer_id_s ## the name of the publisher of the document publisher_t @@ -396,6 +402,19 @@ host_extent_i #opengraph_url_s #opengraph_image_s + +## citation ranking + +## the number of documents within a single host +#cr_host_count_i + +## the chance to click on this page when randomly clicking on links within one host +#cr_host_chance_d + +## normalization of chance: 0 for lower half of cr_host_count_i urls, 1 for 1/2 of the remaining and so on.
the maximum number is 10 +#cr_host_norm_i + + ## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias #ext_cms_txt diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 64d508970..aef6e5523 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; +import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -273,8 +274,12 @@ public class HostBrowser { CollectionSchema.clickdepth_i.getSolrFieldName(), CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(), + CollectionSchema.references_internal_id_sxt.getSolrFieldName(), + CollectionSchema.references_internal_url_sxt.getSolrFieldName(), CollectionSchema.references_external_i.getSolrFieldName(), - CollectionSchema.references_exthosts_i.getSolrFieldName() + CollectionSchema.references_exthosts_i.getSolrFieldName(), + CollectionSchema.cr_host_chance_d.getSolrFieldName(), + CollectionSchema.cr_host_norm_i.getSolrFieldName() ); SolrDocument doc; Set storedDocs = new HashSet(); @@ -290,7 +295,7 @@ public class HostBrowser { String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); FailType error = errortype == null ? null : FailType.valueOf(errortype); String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - infoCache.put(ids, new InfoCacheEntry(doc)); + infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), doc)); if (u.startsWith(path)) { if (delete) { deleteIDs.add(ids); @@ -425,9 +430,7 @@ public class HostBrowser { if (type == StoreType.INDEX) { String ids = ASCII.String(uri.hash()); InfoCacheEntry ice = infoCache.get(ids); - prop.put("files_list_" + c + "_type_stored_comment", - (ice.references >= 0 ? "refs: " + ice.references_internal + " int, " + ice.references_external + " ext, " + ice.references_exthosts + " hosts" : "") + - (ice.references >= 0 && ice.clickdepth >= 0 ? ", " : "") + (ice.clickdepth >= 0 ? "clickdepth: " + ice.clickdepth : "")); + prop.put("files_list_" + c + "_type_stored_comment", ice.toString()); // ice.toString() contains html, therefore do not use putHTML here } prop.put("files_list_" + c + "_type_stored_load", loadRight ? 
1 : 0); if (error) { @@ -505,19 +508,52 @@ public class HostBrowser { } public static final class InfoCacheEntry { + public Integer cr_n; + public Double cr_c; public int clickdepth, references, references_internal, references_external, references_exthosts; - public InfoCacheEntry(final SolrDocument doc) { + public List references_internal_urls; + private final Fulltext fulltext; + public InfoCacheEntry(final Fulltext fulltext, final SolrDocument doc) { + this.fulltext = fulltext; + this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName()); + this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName()); Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName()); Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); + Collection rc_internal_id = doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName()); + Collection rc_internal_url = doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName()); Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue(); this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue(); this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue(); + // calculate the url reference list + this.references_internal_urls = new ArrayList(); + if (rc_internal_url != null) { + for (Object o: rc_internal_url) references_internal_urls.add((String) o); + } else if (rc_internal_id != null) { + for (Object o: rc_internal_id) { + DigestURI u = fulltext.getURL(ASCII.getBytes((String) o)); + if (u != null) references_internal_urls.add(u.toNormalform(true)); + } + } this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue(); this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue(); } + public String toString() { + StringBuilder sb = new StringBuilder(); + for (String s: references_internal_urls) sb.append("info"); + if (sb.length() == 0 && !fulltext.getDefaultConfiguration().contains(CollectionSchema.references_internal_id_sxt)) sb.append("info"); + return + (this.clickdepth >= 0 ? + "clickdepth: " + this.clickdepth : + "") + + (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") + + (this.cr_n != null ? ", crn=" + this.cr_n : "") + + (this.references >= 0 ? + ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + (sb.length() > 0 ? 
" " + sb.toString() + "" : "") : + ""); + } } } diff --git a/source/net/yacy/cora/federate/solr/ProcessType.java b/source/net/yacy/cora/federate/solr/ProcessType.java index 29365708a..cef9d1338 100644 --- a/source/net/yacy/cora/federate/solr/ProcessType.java +++ b/source/net/yacy/cora/federate/solr/ProcessType.java @@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr; */ public enum ProcessType { - CLICKDEPTH; + CLICKDEPTH, CITATION; } diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 14387752e..399562c5d 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -23,6 +23,8 @@ package net.yacy.cora.federate.solr; import java.io.File; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -32,16 +34,14 @@ import org.apache.log4j.Logger; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; -import net.yacy.cora.order.Base64Order; +import net.yacy.cora.document.ASCII; import net.yacy.cora.storage.Configuration; import net.yacy.cora.storage.HandleSet; -import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.index.RowHandleSet; -import net.yacy.kelondro.rwi.ReferenceContainer; -import net.yacy.kelondro.util.ByteBuffer; +import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment; +import net.yacy.search.index.Segment.ReferenceReport; +import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.schema.CollectionSchema; public class SchemaConfiguration extends Configuration implements Serializable { @@ -94,56 +94,72 @@ public class SchemaConfiguration extends Configuration implements Serializable { return false; } - public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { - if (!(this.contains(CollectionSchema.references_i) || this.contains(CollectionSchema.references_internal_i) || + public boolean postprocessing_references(Fulltext fulltext, ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { + if (!(this.contains(CollectionSchema.references_i) || + this.contains(CollectionSchema.references_internal_i) || + this.contains(CollectionSchema.references_internal_id_sxt) || this.contains(CollectionSchema.references_internal_url_sxt) || this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false; Integer all_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer internal_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); + Collection internal_ids_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName()); + Collection internal_urls_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName()); Integer external_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer exthosts_old = doc == null ? 
null : (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); Integer hostextc_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName()); - ReferenceContainer references; try { - int all = 0, internal = 0, external = 0; - references = segment.urlCitation().get(url.hash(), null); - if (references == null) return false; // no references at all - //int references = segment.urlCitation().count(url.hash()); - byte[] uh0 = url.hash(); - Iterator ri = references.entries(); - HandleSet exthosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0); - while (ri.hasNext()) { - CitationReference ref = ri.next(); - byte[] hh = ref.hosthash(); - exthosts.put(hh); - all++; - if (ByteBuffer.equals(hh, 0, uh0, 6, 6)) internal++; else external++; + ReferenceReport rr = rrCache.getReferenceReport(url.hash(), false); + List internalIDs = new ArrayList(); + HandleSet iids = rr.getInternallIDs(); + for (byte[] b: iids) internalIDs.add(ASCII.String(b)); + List internalURLs = new ArrayList(); + if (this.contains(CollectionSchema.references_internal_url_sxt)) { + // get all urls from the index and store them here + for (String id: internalIDs) { + DigestURI u = fulltext.getURL(ASCII.getBytes(id)); + if (u != null) internalURLs.add(u.toNormalform(true)); + } } boolean change = false; - if (all_old == null || all_old.intValue() != all) { + int all = rr.getExternalCount() + rr.getInternalCount(); + if (this.contains(CollectionSchema.references_i) && + (all_old == null || all_old.intValue() != all)) { sid.setField(CollectionSchema.references_i.getSolrFieldName(), all); change = true; } - if (internal_old == null || internal_old.intValue() != internal) { - sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), internal); + if (this.contains(CollectionSchema.references_internal_i) && + (internal_old == null || internal_old.intValue() != rr.getInternalCount())) { + sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), rr.getInternalCount()); change = true; } - if (external_old == null || external_old.intValue() != external) { - sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), external); + if (this.contains(CollectionSchema.references_internal_id_sxt) && + (internal_ids_old == null || internal_ids_old.size() != internalIDs.size())) { + sid.setField(CollectionSchema.references_internal_id_sxt.getSolrFieldName(), internalIDs); change = true; } - if (exthosts_old == null || exthosts_old.intValue() != exthosts.size()) { - sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), exthosts.size()); + if (this.contains(CollectionSchema.references_internal_url_sxt) && + (internal_urls_old == null || internal_urls_old.size() != internalURLs.size())) { + sid.setField(CollectionSchema.references_internal_url_sxt.getSolrFieldName(), internalURLs); + change = true; + } + if (this.contains(CollectionSchema.references_external_i) && + (external_old == null || external_old.intValue() != rr.getExternalCount())) { + sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), rr.getExternalCount()); + change = true; + } + if (this.contains(CollectionSchema.references_exthosts_i) && + (exthosts_old == null || exthosts_old.intValue() != rr.getExternalHostIDs().size())) { + sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), rr.getExternalHostIDs().size()); change = true; } Long hostExtent = hostExtentCount == null ? 
Integer.MAX_VALUE : hostExtentCount.get(url.hosthash()); - if (hostextc_old == null || hostextc_old.intValue() != hostExtent) { + if (this.contains(CollectionSchema.host_extent_i) && + (hostextc_old == null || hostextc_old.intValue() != hostExtent)) { sid.setField(CollectionSchema.host_extent_i.getSolrFieldName(), hostExtent.intValue()); change = true; } return change; } catch (IOException e) { - } catch (SpaceExceededException e) { } return false; } diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index 80593ea77..237d015f3 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -321,7 +321,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { ReversibleScoreMap result = new ClusteredScoreMap(UTF8.insensitiveUTF8Comparator); List values = facet.getValues(); if (values == null) continue; - for (Count ff: values) result.set(ff.getName(), (int) ff.getCount()); + for (Count ff: values) if (ff.getCount() > 0) result.set(ff.getName(), (int) ff.getCount()); facets.put(field, result); } return facets; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 1a7fbecfc..13d8c77d1 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -186,9 +186,9 @@ public class URIMetadataNode { } public byte[] referrerHash() { - ArrayList referrer = getStringList(CollectionSchema.referrer_id_txt); - if (referrer == null || referrer.size() == 0) return null; - return ASCII.getBytes(referrer.get(0)); + String referrer = getString(CollectionSchema.referrer_id_s); + if (referrer == null || referrer.length() == 0) return null; + return ASCII.getBytes(referrer); } public int size() { diff --git a/source/net/yacy/kelondro/workflow/WorkflowProcessor.java b/source/net/yacy/kelondro/workflow/WorkflowProcessor.java index ecb4f6d5f..31c61b4a6 100644 --- a/source/net/yacy/kelondro/workflow/WorkflowProcessor.java +++ b/source/net/yacy/kelondro/workflow/WorkflowProcessor.java @@ -234,7 +234,10 @@ public class WorkflowProcessor { // wait for shutdown try { this.executor.shutdown(); - this.executor.awaitTermination(60, TimeUnit.SECONDS); + for (int i = 0; i < 60; i++) { + this.executor.awaitTermination(1, TimeUnit.SECONDS); + if (this.input.size() <= 0) break; + } } catch (final InterruptedException e) {} } Log.logInfo("serverProcessor", "queue " + this.processName + ": shutdown."); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 645b70786..447f96532 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -35,6 +35,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Properties; import java.util.Set; +import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import org.apache.solr.common.SolrDocument; @@ -45,6 +46,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; +import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; @@ -82,6 +84,7 
@@ import net.yacy.search.query.SearchEvent; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.WebgraphConfiguration; +import net.yacy.search.schema.WebgraphSchema; public class Segment { @@ -278,6 +281,108 @@ public class Segment { return 999; } + public ReferenceReportCache getReferenceReportCache() { + return new ReferenceReportCache(); + } + + public class ReferenceReportCache { + Map cache; + public ReferenceReportCache() { + this.cache = new TreeMap(Base64Order.enhancedCoder); + } + public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException { + ReferenceReport rr = cache.get(id); + if (rr != null) return rr; + try { + rr = new ReferenceReport(id, acceptSelfReference); + cache.put(id, rr); + return rr; + } catch (SpaceExceededException e) { + Log.logException(e); + throw new IOException(e.getMessage()); + } + } + } + + /** + * A ReferenceReport object is a container for all references to a specific url. + * The class stores the number of domain-internal and domain-external backlinks, + * the host hashes of all externally linking documents, + * all IDs from external hosts and all IDs from the same domain. + */ + public final class ReferenceReport { + private int internal, external; + private HandleSet externalHosts, externalIDs, internalIDs; + public ReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException, SpaceExceededException { + this.internal = 0; + this.external = 0; + this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0); + this.internalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0); + this.externalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0); + if (writeToWebgraph()) { + // read the references from the webgraph + SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector(); + webgraph.commit(true); + BlockingQueue docs = webgraph.concurrentDocumentsByQuery(WebgraphSchema.target_id_s.getSolrFieldName() + ":\"" + ASCII.String(id) + "\"", 0, 10000000, 600000, 100, WebgraphSchema.source_id_s.getSolrFieldName()); + SolrDocument doc; + try { + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + String refid = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); + if (refid == null) continue; + byte[] refidh = ASCII.getBytes(refid); + byte[] hh = new byte[6]; // host hash + System.arraycopy(refidh, 6, hh, 0, 6); + if (ByteBuffer.equals(hh, 0, id, 6, 6)) { + if (acceptSelfReference || !ByteBuffer.equals(refidh, id)) { + internalIDs.put(refidh); + internal++; + } + } else { + externalHosts.put(hh); + externalIDs.put(refidh); + external++; + } + } + } catch (InterruptedException e) { + Log.logException(e); + } + } else { + // read the references from the citation index + ReferenceContainer references; + references = urlCitation().get(id, null); + if (references == null) return; // no references at all + Iterator ri = references.entries(); + while (ri.hasNext()) { + CitationReference ref = ri.next(); + byte[] hh = ref.hosthash(); // host hash + if (ByteBuffer.equals(hh, 0, id, 6, 6)) { + internalIDs.put(ref.urlhash()); + internal++; + } else { + externalHosts.put(hh); + externalIDs.put(ref.urlhash()); + external++; + } + } + } + } + public int getInternalCount() { + return this.internal; + } + public int getExternalCount() { + return this.external; + } + public HandleSet getExternalHostIDs() { + return
this.externalHosts; + } + public HandleSet getExternalIDs() { + return this.externalIDs; + } + public HandleSet getInternallIDs() { + return this.internalIDs; + } + } + public long RWICount() { if (this.termIndex == null) return 0; return this.termIndex.sizesMax(); @@ -598,7 +703,7 @@ public class Segment { // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.connectedCitation()) { - this.fulltext.getDefaultConfiguration().postprocessing_references(this, null, vector, url, null); + this.fulltext.getDefaultConfiguration().postprocessing_references(this.fulltext, this.getReferenceReportCache(), null, vector, url, null); } // STORE TO SOLR String error = null; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index ca113a892..418657f46 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -39,6 +39,7 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import net.yacy.cora.document.ASCII; @@ -52,10 +53,15 @@ import net.yacy.cora.federate.solr.ProcessType; import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.sorting.ClusteredScoreMap; +import net.yacy.cora.sorting.ReversibleScoreMap; +import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.CommonPattern; +import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -64,10 +70,13 @@ import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.index.RowHandleMap; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.util.Bitfield; import net.yacy.search.index.Segment; +import net.yacy.search.index.Segment.ReferenceReport; +import net.yacy.search.index.Segment.ReferenceReportCache; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; @@ -87,7 +96,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param configurationFile * @throws IOException */ - public CollectionConfiguration(final File configurationFile, boolean lazy) throws IOException { + public CollectionConfiguration(final File configurationFile, final boolean lazy) throws IOException { super(configurationFile); super.lazy = lazy; this.rankings = new ArrayList(4); @@ -115,11 +124,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } } - public Ranking getRanking(int idx) { + public Ranking getRanking(final int idx) { return this.rankings.get(idx); } - public Ranking getRanking(String name) { + public Ranking getRanking(final String name) { if (name == null) return null; for (int i = 0; i < this.rankings.size(); i++) { Ranking r = this.rankings.get(i); @@ -163,7 +172,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri 
* @param doc the solr document * @return a solr input document */ - public SolrInputDocument toSolrInputDocument(SolrDocument doc) { + public SolrInputDocument toSolrInputDocument(final SolrDocument doc) { SolrInputDocument sid = new SolrInputDocument(); for (String name: doc.getFieldNames()) { if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema @@ -173,7 +182,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri return sid; } - public SolrDocument toSolrDocument(SolrInputDocument doc) { + public SolrDocument toSolrDocument(final SolrInputDocument doc) { SolrDocument sd = new SolrDocument(); for (SolrInputField field: doc) { if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema @@ -280,7 +289,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate()); if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate()); if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash()); - if ((allAttr || contains(CollectionSchema.referrer_id_txt)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_txt, new String[]{ASCII.String(md.referrerHash())}); + if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash())); if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5()); if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher()); if ((allAttr || contains(CollectionSchema.language_s)) && md.language() != null) add(doc, CollectionSchema.language_s, UTF8.String(md.language())); @@ -328,9 +337,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public SolrVector yacy2solr( final String id, final String[] collections, final ResponseHeader responseHeader, - final Document document, Condenser condenser, DigestURI referrerURL, String language, - IndexCell citations, - WebgraphConfiguration webgraph) { + final Document document, final Condenser condenser, final DigestURI referrerURL, final String language, + final IndexCell citations, + final WebgraphConfiguration webgraph) { // we use the SolrCell design as index schema SolrVector doc = new SolrVector(); final DigestURI digestURI = document.dc_source(); @@ -356,6 +365,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index } + if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) { + processTypes.add(ProcessType.CITATION); // postprocessing needed + } + if (allAttr || contains(CollectionSchema.ip_s)) { final InetAddress address = digestURI.getInetAddress(); if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress()); @@ -778,7 +791,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate); if (allAttr || contains(CollectionSchema.fresh_date_dt)) 
add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash()); - if ((allAttr || contains(CollectionSchema.referrer_id_txt)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_txt, new String[]{ASCII.String(referrerURL.hash())}); + if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash())); //if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]); if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher()); if ((allAttr || contains(CollectionSchema.language_s)) && language != null) add(doc, CollectionSchema.language_s, language); @@ -812,60 +825,264 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param urlCitation * @return */ - public void postprocessing(Segment segment) { + public void postprocessing(final Segment segment) { if (!this.contains(CollectionSchema.process_sxt)) return; if (!segment.connectedCitation()) return; SolrConnector connector = segment.fulltext().getDefaultConnector(); - // that means we must search for those entries. connector.commit(true); // make sure that we have latest information that can be found - //BlockingQueue docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10); - BlockingQueue docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50); + ReferenceReportCache rrCache = segment.getReferenceReportCache(); + Map ranking = new TreeMap(Base64Order.enhancedCoder); + try { + // collect hosts from index which shall take part in citation computation + ReversibleScoreMap hostscore = connector.getFacets(CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(), 10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName()); + if (hostscore == null) hostscore = new ClusteredScoreMap(); + // for each host, do a citation rank computation + for (String host: hostscore.keyList(true)) { + if (hostscore.get(host) <= 0) continue; + // select all documents for each host + CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6); + int convergence_attempts = 0; + while (convergence_attempts++ < 30) { + if (crh.convergenceStep()) break; + } + Log.logInfo("CollectionConfiguration.CRHost", "convergence for host " + host + " after " + convergence_attempts + " steps"); + // we have now the cr for all documents of a specific host; we store them for later use + Map crn = crh.normalize(); + crh.log(crn); + ranking.putAll(crn); // accumulate this here for usage in document update later + } + } catch (IOException e2) { + } + // process all documents + BlockingQueue docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50); SolrDocument doc; - int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0; - - Map hostExtentCache = new HashMap(); + int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0; + Map hostExtentCache = new HashMap(); // a mapping from the host id to the number of documents which contain this host-id 
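+ // for every document that carries a process_sxt tag: apply the clickdepth and citation values computed above, refresh the reference counts, then remove the tag and write the document back to the index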
try { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { // for each to-be-processed entry work on the process tag Collection proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName()); - for (Object tag: proctags) { + + try { + DigestURI url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()))); + byte[] id = url.hash(); + SolrInputDocument sid = this.toSolrInputDocument(doc); - try { - DigestURI url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()))); - SolrInputDocument sid = this.toSolrInputDocument(doc); + for (Object tag: proctags) { // switch over tag types ProcessType tagtype = ProcessType.valueOf((String) tag); if (tagtype == ProcessType.CLICKDEPTH) { if (postprocessing_clickdepth(segment, doc, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++; } - - // refresh the link count; it's 'cheap' to do this here - String hosthash = url.hosthash(); - if (!hostExtentCache.containsKey(hosthash)) { - StringBuilder q = new StringBuilder(); - q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200"); - long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString()); - hostExtentCache.put(hosthash, count); + + if (tagtype == ProcessType.CITATION) { + CRV crv = ranking.get(id); + if (crv != null) { + sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count); + sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr); + sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn); + proccount_citationchange++; + } } - if (postprocessing_references(segment, doc, sid, url, hostExtentCache)) proccount_referencechange++; - - // all processing steps checked, remove the processing tag - sid.removeField(CollectionSchema.process_sxt.getSolrFieldName()); - // send back to index - connector.add(sid); - proccount++; - } catch (Throwable e1) { } + // refresh the link count; it's 'cheap' to do this here + String hosthash = url.hosthash(); + if (!hostExtentCache.containsKey(hosthash)) { + StringBuilder q = new StringBuilder(); + q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200"); + long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString()); + hostExtentCache.put(hosthash, count); + } + if (postprocessing_references(segment.fulltext(), rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++; + + // all processing steps checked, remove the processing tag + sid.removeField(CollectionSchema.process_sxt.getSolrFieldName()); + + // send back to index + //connector.deleteById(ASCII.String(id)); + connector.add(sid); + proccount++; + } catch (Throwable e1) { } } - Log.logInfo("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + proccount_clickdepthchange + " clickdepth values changed, " + proccount_referencechange + " reference-count values changed."); + Log.logInfo("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + + proccount_clickdepthchange + " clickdepth changes, " + + 
proccount_referencechange + " reference-count changes," + + proccount_citationchange + " citation ranking changes."); } catch (InterruptedException e) { } } + + private static final class CRV { + public double cr; + public int crn, count; + public CRV(final int count, final double cr, final int crn) {this.count = count; this.cr = cr; this.crn = crn;} + public String toString() { + return "count=" + count + ", cr=" + cr + ", crn=" + crn; + } + } + + /** + * The CRHost class is a container for all ranking values of a specific host. + * Objects of that class are needed as an environment for repeated convergenceStep() computations, + * which are iterative citation rank computations that are repeated until the ranking values + * converge to stable values. + * The class also contains normalization methods to compute simple integer ranking values out of the + * double relevance values. + */ + private static final class CRHost { + private final Segment segment; + private final Map crt; + private final int cr_host_count; + private final RowHandleMap internal_links_counter; + private double damping; + private int converge_eq_factor; + private ReferenceReportCache rrCache; + public CRHost(final Segment segment, final ReferenceReportCache rrCache, final String host, final double damping, final int converge_digits) { + this.segment = segment; + this.damping = damping; + this.rrCache = rrCache; + this.converge_eq_factor = (int) Math.pow(10.0d, converge_digits); + SolrConnector connector = segment.fulltext().getDefaultConnector(); + this.crt = new TreeMap(Base64Order.enhancedCoder); + try { + // select all documents for each host + BlockingQueue ids = connector.concurrentIDsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"", 0, 1000000, 600000); + String id; + while ((id = ids.take()) != AbstractSolrConnector.POISON_ID) { + crt.put(ASCII.getBytes(id), new double[]{0.0d,0.0d}); //{old value, new value} + } + } catch (InterruptedException e2) { + } + this.cr_host_count = crt.size(); + double initval = 1.0d / cr_host_count; + for (Map.Entry entry: this.crt.entrySet()) entry.getValue()[0] = initval; + this.internal_links_counter = new RowHandleMap(12, Base64Order.enhancedCoder, 8, 100, "internal_links_counter"); + } + /** + * produce a map from IDs to CRV records, normalization entries containing the values that are stored to solr. 
+ @return + */ + public Map normalize() { + TreeMap> reorder = new TreeMap>(); + for (Map.Entry entry: crt.entrySet()) { + Double d = entry.getValue()[0]; + List ds = reorder.get(d); + if (ds == null) {ds = new ArrayList(); reorder.put(d, ds);} + ds.add(entry.getKey()); + } + int nextcount = (this.cr_host_count + 1) / 2; + int nextcrn = 0; + Map r = new TreeMap(Base64Order.enhancedCoder); + while (reorder.size() > 0) { + int count = nextcount; + while (reorder.size() > 0 && count > 0) { + Map.Entry> next = reorder.pollFirstEntry(); + List ids = next.getValue(); + count -= ids.size(); + double cr = next.getKey(); + for (byte[] id: ids) r.put(id, new CRV(this.cr_host_count, cr, nextcrn)); + } + nextcrn++; + nextcount = Math.max(1, (nextcount + count + 1) / 2); + } + // finally, increase the crn number in such a way that the maximum is always 10 + int inc = 11 - nextcrn; // nextcrn is +1 + for (Map.Entry entry: r.entrySet()) entry.getValue().crn += inc; + return r; + } + /** + * log out a complete CRHost set of urls and ranking values + * @param rm + */ + public void log(final Map rm) { + // print out all urls with their cr-values + SolrConnector connector = segment.fulltext().getDefaultConnector(); + for (Map.Entry entry: rm.entrySet()) { + try { + String url = (String) connector.getDocumentById(ASCII.String(entry.getKey()), CollectionSchema.sku.getSolrFieldName()).getFieldValue(CollectionSchema.sku.getSolrFieldName()); + Log.logInfo("CollectionConfiguration.CRHost", "CR for " + url); + Log.logInfo("CollectionConfiguration.CRHost", ">> " + entry.getValue().toString()); + } catch (IOException e) { + Log.logException(e); + } + } + } + /** + * Calculate the number of internal links from a specific document, denoted by the document ID. + * This is a very important attribute for the ranking computation because it is the divisor for the previously computed ranking value. + * The internalLinks value will be requested several times for the same id during the convergenceStep()-steps; therefore it should use a cache. + * This cache is part of the CRHost data structure. + * @param id + * @return the number of links from the document denoted by the ID to documents within the same domain + */ + public int getInternalLinks(final byte[] id) { + int il = (int) this.internal_links_counter.get(id); + if (il >= 0) return il; + try { + SolrDocument doc = this.segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(id), CollectionSchema.inboundlinkscount_i.getSolrFieldName()); + if (doc == null) { + this.internal_links_counter.put(id, 0); + return 0; + } + Object x = doc.getFieldValue(CollectionSchema.inboundlinkscount_i.getSolrFieldName()); + il = (x == null) ? 0 : (x instanceof Integer) ? ((Integer) x).intValue() : (x instanceof Long) ? ((Long) x).intValue() : 0; + this.internal_links_counter.put(id, il); + return il; + } catch (IOException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + try {this.internal_links_counter.put(id, 0);} catch (SpaceExceededException e) {} + return 0; + } + /** + * Use the crt cache to compute the next generation of crt values.
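+ * One step recomputes, for every document d of the host, cr_new(d) = (1 - damping) / cr_host_count + damping * sum(cr_old(b) / getInternalLinks(b)) over all domain-internal backlinks b of d (a PageRank-style random-surfer update restricted to a single host); convergence is reached when all values equal their previous value up to converge_digits digits.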
+ * @return + */ + public boolean convergenceStep() { + boolean convergence = true; + double df = (1.0d - damping) / this.cr_host_count; + try { + for (Map.Entry entry: crt.entrySet()) { + byte[] id = entry.getKey(); + ReferenceReport rr = this.rrCache.getReferenceReport(id, false); + // sum up the cr of the internal links + HandleSet iids = rr.getInternallIDs(); + double ncr = 0.0d; + for (byte[] iid: iids) { + int ilc = getInternalLinks(iid); + if (ilc > 0) { // if (ilc == 0) then the reference report is wrong! + ncr += this.crt.get(iid)[0] / ilc; + } + } + ncr = df + damping * ncr; + if (convergence && !eqd(ncr, entry.getValue()[0])) convergence = false; + entry.getValue()[1] = ncr; + } + // after the loop, replace the old value with the new value in crt + for (Map.Entry entry: crt.entrySet()) { + entry.getValue()[0] = entry.getValue()[1]; + } + } catch (IOException e) { + } + return convergence; + } + /** + * helper method to check if two doubles are equal using a specific number of digits + * @param a + * @param b + * @return + */ + private boolean eqd(final double a, final double b) { + return ((int) (a * this.converge_eq_factor)) == ((int) (b * this.converge_eq_factor)); + } + } /** * this method compresses a list of protocol names to an indexed list. @@ -876,7 +1093,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param protocol * @return a list of indexed protocol entries */ - private static List protocolList2indexedList(List protocol) { + private static List protocolList2indexedList(final List protocol) { List a = new ArrayList(); String p; for (int i = 0; i < protocol.size(); i++) { diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 129f862cf..5681915c9 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -51,7 +51,9 @@ public enum CollectionSchema implements SchemaDeclaration { httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. 
\"200\" for ok), -1 if not loaded"), references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"), - references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host as referenced url"), + references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"), + references_internal_id_sxt(SolrType.string, true, true, true, false, true, "ids of unique http references from same host to referenced url"), + references_internal_url_sxt(SolrType.string, true, true, true, false, true, "urls of unique http references from same host to referenced url"), references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"), references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"), clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"), @@ -60,7 +62,7 @@ public enum CollectionSchema implements SchemaDeclaration { // optional but recommended, part of index distribution load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"), fresh_date_dt(SolrType.date, true, true, false, false, false, "date until resource shall be considered as fresh"), - referrer_id_txt(SolrType.string, true, true, true, false, false, "ids of referrer to this document"),// byte[] referrerHash(); + referrer_id_s(SolrType.string, true, true, false, false, false, "id of the referrer to this document, discovered during crawling"),// byte[] referrerHash(); publisher_t(SolrType.text_general, true, true, false, false, true, "the name of the publisher of the document"),// String dc_publisher(); language_s(SolrType.string, true, true, false, false, false, "the language used in the document"),// byte[] language(); audiolinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to audio resources"),// int laudio(); @@ -184,6 +186,11 @@ public enum CollectionSchema implements SchemaDeclaration { opengraph_url_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"), opengraph_image_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"), + // link structure for ranking + cr_host_count_i(SolrType.num_integer, true, true, false, false, false, "the number of documents within a single host"), + cr_host_chance_d(SolrType.num_double, true, true, false, false, false, "the chance to click on this page when randomly clicking on links within on one host"), + cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10"), + // special values; can only be used if '_val' type is defined in schema file; this is not standard bold_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in bold_txt"), italic_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in italic_txt"),