diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 7b7a5f030..23c6d5c66 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -69,9 +69,15 @@ httpstatus_i ## number of unique http references, should be equal to references_internal_i + references_external_i references_i -## number of unique http references from same host as referenced url +## number of unique http references from same host to referenced url references_internal_i +## ids of unique http references from same host to referenced url +#references_internal_id_sxt + +## urls of unique http references from same host to referenced url +#references_internal_url_sxt + ## number of unique http references from external hosts references_external_i @@ -93,8 +99,8 @@ load_date_dt ## date until resource shall be considered as fresh fresh_date_dt -## ids of referrer to this document -referrer_id_txt +## id of the referrer to this document, discovered during crawling +referrer_id_s ## the name of the publisher of the document publisher_t @@ -396,6 +402,19 @@ host_extent_i #opengraph_url_s #opengraph_image_s + +## citation ranking + +## the number of documents within a single host +#cr_host_count_i + +## the chance to click on this page when randomly clicking on links within on one host +#cr_host_chance_d + +## normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10 +#cr_host_norm_i + + ## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias #ext_cms_txt diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 64d508970..aef6e5523 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; +import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -273,8 +274,12 @@ public class HostBrowser { CollectionSchema.clickdepth_i.getSolrFieldName(), CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(), + CollectionSchema.references_internal_id_sxt.getSolrFieldName(), + CollectionSchema.references_internal_url_sxt.getSolrFieldName(), CollectionSchema.references_external_i.getSolrFieldName(), - CollectionSchema.references_exthosts_i.getSolrFieldName() + CollectionSchema.references_exthosts_i.getSolrFieldName(), + CollectionSchema.cr_host_chance_d.getSolrFieldName(), + CollectionSchema.cr_host_norm_i.getSolrFieldName() ); SolrDocument doc; Set storedDocs = new HashSet(); @@ -290,7 +295,7 @@ public class HostBrowser { String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); FailType error = errortype == null ? null : FailType.valueOf(errortype); String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - infoCache.put(ids, new InfoCacheEntry(doc)); + infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), doc)); if (u.startsWith(path)) { if (delete) { deleteIDs.add(ids); @@ -425,9 +430,7 @@ public class HostBrowser { if (type == StoreType.INDEX) { String ids = ASCII.String(uri.hash()); InfoCacheEntry ice = infoCache.get(ids); - prop.put("files_list_" + c + "_type_stored_comment", - (ice.references >= 0 ? 
"refs: " + ice.references_internal + " int, " + ice.references_external + " ext, " + ice.references_exthosts + " hosts" : "") + - (ice.references >= 0 && ice.clickdepth >= 0 ? ", " : "") + (ice.clickdepth >= 0 ? "clickdepth: " + ice.clickdepth : "")); + prop.put("files_list_" + c + "_type_stored_comment", ice.toString()); // ice.toString() contains html, therefore do not use putHTML here } prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0); if (error) { @@ -505,19 +508,52 @@ public class HostBrowser { } public static final class InfoCacheEntry { + public Integer cr_n; + public Double cr_c; public int clickdepth, references, references_internal, references_external, references_exthosts; - public InfoCacheEntry(final SolrDocument doc) { + public List references_internal_urls; + private final Fulltext fulltext; + public InfoCacheEntry(final Fulltext fulltext, final SolrDocument doc) { + this.fulltext = fulltext; + this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName()); + this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName()); Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName()); Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); + Collection rc_internal_id = doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName()); + Collection rc_internal_url = doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName()); Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue(); this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue(); this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue(); + // calculate the url reference list + this.references_internal_urls = new ArrayList(); + if (rc_internal_url != null) { + for (Object o: rc_internal_url) references_internal_urls.add((String) o); + } else if (rc_internal_id != null) { + for (Object o: rc_internal_id) { + DigestURI u = fulltext.getURL(ASCII.getBytes((String) o)); + if (u != null) references_internal_urls.add(u.toNormalform(true)); + } + } this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue(); this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue(); } + public String toString() { + StringBuilder sb = new StringBuilder(); + for (String s: references_internal_urls) sb.append("info"); + if (sb.length() == 0 && !fulltext.getDefaultConfiguration().contains(CollectionSchema.references_internal_id_sxt)) sb.append("info"); + return + (this.clickdepth >= 0 ? + "clickdepth: " + this.clickdepth : + "") + + (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") + + (this.cr_n != null ? ", crn=" + this.cr_n : "") + + (this.references >= 0 ? + ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + (sb.length() > 0 ? 
" " + sb.toString() + "" : "") : + ""); + } } } diff --git a/source/net/yacy/cora/federate/solr/ProcessType.java b/source/net/yacy/cora/federate/solr/ProcessType.java index 29365708a..cef9d1338 100644 --- a/source/net/yacy/cora/federate/solr/ProcessType.java +++ b/source/net/yacy/cora/federate/solr/ProcessType.java @@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr; */ public enum ProcessType { - CLICKDEPTH; + CLICKDEPTH, CITATION; } diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 14387752e..399562c5d 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -23,6 +23,8 @@ package net.yacy.cora.federate.solr; import java.io.File; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -32,16 +34,14 @@ import org.apache.log4j.Logger; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; -import net.yacy.cora.order.Base64Order; +import net.yacy.cora.document.ASCII; import net.yacy.cora.storage.Configuration; import net.yacy.cora.storage.HandleSet; -import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.index.RowHandleSet; -import net.yacy.kelondro.rwi.ReferenceContainer; -import net.yacy.kelondro.util.ByteBuffer; +import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment; +import net.yacy.search.index.Segment.ReferenceReport; +import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.schema.CollectionSchema; public class SchemaConfiguration extends Configuration implements Serializable { @@ -94,56 +94,72 @@ public class SchemaConfiguration extends Configuration implements Serializable { return false; } - public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { - if (!(this.contains(CollectionSchema.references_i) || this.contains(CollectionSchema.references_internal_i) || + public boolean postprocessing_references(Fulltext fulltext, ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { + if (!(this.contains(CollectionSchema.references_i) || + this.contains(CollectionSchema.references_internal_i) || + this.contains(CollectionSchema.references_internal_id_sxt) || this.contains(CollectionSchema.references_internal_url_sxt) || this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false; Integer all_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer internal_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); + Collection internal_ids_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName()); + Collection internal_urls_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName()); Integer external_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer exthosts_old = doc == null ? 
null : (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); Integer hostextc_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName()); - ReferenceContainer references; try { - int all = 0, internal = 0, external = 0; - references = segment.urlCitation().get(url.hash(), null); - if (references == null) return false; // no references at all - //int references = segment.urlCitation().count(url.hash()); - byte[] uh0 = url.hash(); - Iterator ri = references.entries(); - HandleSet exthosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0); - while (ri.hasNext()) { - CitationReference ref = ri.next(); - byte[] hh = ref.hosthash(); - exthosts.put(hh); - all++; - if (ByteBuffer.equals(hh, 0, uh0, 6, 6)) internal++; else external++; + ReferenceReport rr = rrCache.getReferenceReport(url.hash(), false); + List internalIDs = new ArrayList(); + HandleSet iids = rr.getInternallIDs(); + for (byte[] b: iids) internalIDs.add(ASCII.String(b)); + List internalURLs = new ArrayList(); + if (this.contains(CollectionSchema.references_internal_url_sxt)) { + // get all urls from the index and store them here + for (String id: internalIDs) { + DigestURI u = fulltext.getURL(ASCII.getBytes(id)); + if (u != null) internalURLs.add(u.toNormalform(true)); + } } boolean change = false; - if (all_old == null || all_old.intValue() != all) { + int all = rr.getExternalCount() + rr.getInternalCount(); + if (this.contains(CollectionSchema.references_i) && + (all_old == null || all_old.intValue() != all)) { sid.setField(CollectionSchema.references_i.getSolrFieldName(), all); change = true; } - if (internal_old == null || internal_old.intValue() != internal) { - sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), internal); + if (this.contains(CollectionSchema.references_internal_i) && + (internal_old == null || internal_old.intValue() != rr.getInternalCount())) { + sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), rr.getInternalCount()); change = true; } - if (external_old == null || external_old.intValue() != external) { - sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), external); + if (this.contains(CollectionSchema.references_internal_id_sxt) && + (internal_ids_old == null || internal_ids_old.size() != internalIDs.size())) { + sid.setField(CollectionSchema.references_internal_id_sxt.getSolrFieldName(), internalIDs); change = true; } - if (exthosts_old == null || exthosts_old.intValue() != exthosts.size()) { - sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), exthosts.size()); + if (this.contains(CollectionSchema.references_internal_url_sxt) && + (internal_urls_old == null || internal_urls_old.size() != internalURLs.size())) { + sid.setField(CollectionSchema.references_internal_url_sxt.getSolrFieldName(), internalURLs); + change = true; + } + if (this.contains(CollectionSchema.references_external_i) && + (external_old == null || external_old.intValue() != rr.getExternalCount())) { + sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), rr.getExternalCount()); + change = true; + } + if (this.contains(CollectionSchema.references_exthosts_i) && + (exthosts_old == null || exthosts_old.intValue() != rr.getExternalHostIDs().size())) { + sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), rr.getExternalHostIDs().size()); change = true; } Long hostExtent = hostExtentCount == null ? 
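Every field written by postprocessing_references() above follows the same guard: the field must be enabled in the local schema and the freshly computed value must differ from what the index already holds, otherwise nothing is set and the change flag stays false. A compact sketch of that idiom (the helper name setIfChanged is hypothetical; the patch spells the guard out inline per field):

    private boolean setIfChanged(final SolrInputDocument sid, final CollectionSchema field,
                                 final Object oldValue, final Object newValue) {
        if (!this.contains(field)) return false;                         // field switched off in solr.collection.schema
        if (oldValue != null && oldValue.equals(newValue)) return false; // index already holds this value
        sid.setField(field.getSolrFieldName(), newValue);
        return true;                                                     // caller ORs this into its 'change' flag
    }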
Integer.MAX_VALUE : hostExtentCount.get(url.hosthash()); - if (hostextc_old == null || hostextc_old.intValue() != hostExtent) { + if (this.contains(CollectionSchema.host_extent_i) && + (hostextc_old == null || hostextc_old.intValue() != hostExtent)) { sid.setField(CollectionSchema.host_extent_i.getSolrFieldName(), hostExtent.intValue()); change = true; } return change; } catch (IOException e) { - } catch (SpaceExceededException e) { } return false; } diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index 80593ea77..237d015f3 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -321,7 +321,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { ReversibleScoreMap result = new ClusteredScoreMap(UTF8.insensitiveUTF8Comparator); List values = facet.getValues(); if (values == null) continue; - for (Count ff: values) result.set(ff.getName(), (int) ff.getCount()); + for (Count ff: values) if (ff.getCount() > 0) result.set(ff.getName(), (int) ff.getCount()); facets.put(field, result); } return facets; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 1a7fbecfc..13d8c77d1 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -186,9 +186,9 @@ public class URIMetadataNode { } public byte[] referrerHash() { - ArrayList referrer = getStringList(CollectionSchema.referrer_id_txt); - if (referrer == null || referrer.size() == 0) return null; - return ASCII.getBytes(referrer.get(0)); + String referrer = getString(CollectionSchema.referrer_id_s); + if (referrer == null || referrer.length() == 0) return null; + return ASCII.getBytes(referrer); } public int size() { diff --git a/source/net/yacy/kelondro/workflow/WorkflowProcessor.java b/source/net/yacy/kelondro/workflow/WorkflowProcessor.java index ecb4f6d5f..31c61b4a6 100644 --- a/source/net/yacy/kelondro/workflow/WorkflowProcessor.java +++ b/source/net/yacy/kelondro/workflow/WorkflowProcessor.java @@ -234,7 +234,10 @@ public class WorkflowProcessor { // wait for shutdown try { this.executor.shutdown(); - this.executor.awaitTermination(60, TimeUnit.SECONDS); + for (int i = 0; i < 60; i++) { + this.executor.awaitTermination(1, TimeUnit.SECONDS); + if (this.input.size() <= 0) break; + } } catch (final InterruptedException e) {} } Log.logInfo("serverProcessor", "queue " + this.processName + ": shutdown."); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 645b70786..447f96532 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -35,6 +35,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Properties; import java.util.Set; +import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import org.apache.solr.common.SolrDocument; @@ -45,6 +46,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; +import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; @@ -82,6 +84,7 
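The WorkflowProcessor change above replaces the single 60-second awaitTermination() with sixty one-second waits so that a drained input queue releases the shutdown early. The same pattern in isolation (executor and input are stand-ins; the sketch additionally honours awaitTermination's return value, which the patch ignores):

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.TimeUnit;

    static void shutdownAndDrain(final ExecutorService executor, final BlockingQueue<?> input) {
        executor.shutdown();
        try {
            for (int i = 0; i < 60; i++) {
                if (executor.awaitTermination(1, TimeUnit.SECONDS)) break; // pool terminated
                if (input.isEmpty()) break;                                // nothing left to work on
            }
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve the interrupt; illustrative choice only
        }
    }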
@@ import net.yacy.search.query.SearchEvent; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.WebgraphConfiguration; +import net.yacy.search.schema.WebgraphSchema; public class Segment { @@ -278,6 +281,108 @@ public class Segment { return 999; } + public ReferenceReportCache getReferenceReportCache() { + return new ReferenceReportCache(); + } + + public class ReferenceReportCache { + Map cache; + public ReferenceReportCache() { + this.cache = new TreeMap(Base64Order.enhancedCoder); + } + public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException { + ReferenceReport rr = cache.get(id); + if (rr != null) return rr; + try { + rr = new ReferenceReport(id, acceptSelfReference); + cache.put(id, rr); + return rr; + } catch (SpaceExceededException e) { + Log.logException(e); + throw new IOException(e.getMessage()); + } + } + } + + /** + * A ReferenceReport object is a container for all referenced to a specific url. + * The class stores the number of links from domain-internal and domain-external backlinks, + * and the host hashes of all externally linking documents, + * all IDs from external hosts and all IDs from the same domain. + */ + public final class ReferenceReport { + private int internal, external; + private HandleSet externalHosts, externalIDs, internalIDs; + public ReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException, SpaceExceededException { + this.internal = 0; + this.external = 0; + this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0); + this.internalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0); + this.externalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0); + if (writeToWebgraph()) { + // reqd the references from the webgraph + SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector(); + webgraph.commit(true); + BlockingQueue docs = webgraph.concurrentDocumentsByQuery(WebgraphSchema.target_id_s.getSolrFieldName() + ":\"" + ASCII.String(id) + "\"", 0, 10000000, 600000, 100, WebgraphSchema.source_id_s.getSolrFieldName()); + SolrDocument doc; + try { + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + String refid = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); + if (refid == null) continue; + byte[] refidh = ASCII.getBytes(refid); + byte[] hh = new byte[6]; // host hash + System.arraycopy(refidh, 6, hh, 0, 6); + if (ByteBuffer.equals(hh, 0, id, 6, 6)) { + if (acceptSelfReference || !ByteBuffer.equals(refidh, id)) { + internalIDs.put(refidh); + internal++; + } + } else { + externalHosts.put(hh); + externalIDs.put(refidh); + external++; + } + } + } catch (InterruptedException e) { + Log.logException(e); + } + } else { + // read the references from the citation index + ReferenceContainer references; + references = urlCitation().get(id, null); + if (references == null) return; // no references at all + Iterator ri = references.entries(); + while (ri.hasNext()) { + CitationReference ref = ri.next(); + byte[] hh = ref.hosthash(); // host hash + if (ByteBuffer.equals(hh, 0, id, 6, 6)) { + internalIDs.put(ref.urlhash()); + internal++; + } else { + externalHosts.put(hh); + externalIDs.put(ref.urlhash()); + external++; + } + } + } + } + public int getInternalCount() { + return this.internal; + } + public int getExternalCount() { + return this.external; + } + public HandleSet getExternalHostIDs() { + return 
this.externalHosts; + } + public HandleSet getExternalIDs() { + return this.externalIDs; + } + public HandleSet getInternallIDs() { + return this.internalIDs; + } + } + public long RWICount() { if (this.termIndex == null) return 0; return this.termIndex.sizesMax(); @@ -598,7 +703,7 @@ public class Segment { // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.connectedCitation()) { - this.fulltext.getDefaultConfiguration().postprocessing_references(this, null, vector, url, null); + this.fulltext.getDefaultConfiguration().postprocessing_references(this.fulltext, this.getReferenceReportCache(), null, vector, url, null); } // STORE TO SOLR String error = null; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index ca113a892..418657f46 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -39,6 +39,7 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import net.yacy.cora.document.ASCII; @@ -52,10 +53,15 @@ import net.yacy.cora.federate.solr.ProcessType; import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.sorting.ClusteredScoreMap; +import net.yacy.cora.sorting.ReversibleScoreMap; +import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.CommonPattern; +import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -64,10 +70,13 @@ import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.index.RowHandleMap; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.util.Bitfield; import net.yacy.search.index.Segment; +import net.yacy.search.index.Segment.ReferenceReport; +import net.yacy.search.index.Segment.ReferenceReportCache; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; @@ -87,7 +96,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param configurationFile * @throws IOException */ - public CollectionConfiguration(final File configurationFile, boolean lazy) throws IOException { + public CollectionConfiguration(final File configurationFile, final boolean lazy) throws IOException { super(configurationFile); super.lazy = lazy; this.rankings = new ArrayList(4); @@ -115,11 +124,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } } - public Ranking getRanking(int idx) { + public Ranking getRanking(final int idx) { return this.rankings.get(idx); } - public Ranking getRanking(String name) { + public Ranking getRanking(final String name) { if (name == null) return null; for (int i = 0; i < this.rankings.size(); i++) { Ranking r = this.rankings.get(i); @@ -163,7 +172,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri 
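A ReferenceReport collects, for one document hash, the backlink IDs split into same-host and external sets, reading either the webgraph index or the citation RWI index, and the ReferenceReportCache keeps one report per ID because postprocessing asks for the same report repeatedly (reference counting and citation ranking). A usage sketch, assuming segment and urlhash are available in the calling context:

    final ReferenceReportCache rrCache = segment.getReferenceReportCache(); // one cache per postprocessing run
    try {
        final ReferenceReport rr = rrCache.getReferenceReport(urlhash, false); // false: drop self-references
        final int internal = rr.getInternalCount();           // backlinks from the same host
        final int external = rr.getExternalCount();           // backlinks from other hosts
        final int hosts    = rr.getExternalHostIDs().size();  // distinct linking hosts
        for (final byte[] refid : rr.getInternallIDs()) {      // the IDs stored in references_internal_id_sxt
            // resolve to URLs or feed into the citation rank, as the callers in this patch do
        }
    } catch (final IOException e) {
        // report could not be computed, e.g. the webgraph/citation index was not readable
    }

Keeping the cache at the Segment level means the expensive webgraph query (or citation-index scan) runs once per document, even though both postprocessing_references() and the CRHost convergence ask for the same reports.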
* @param doc the solr document * @return a solr input document */ - public SolrInputDocument toSolrInputDocument(SolrDocument doc) { + public SolrInputDocument toSolrInputDocument(final SolrDocument doc) { SolrInputDocument sid = new SolrInputDocument(); for (String name: doc.getFieldNames()) { if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema @@ -173,7 +182,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri return sid; } - public SolrDocument toSolrDocument(SolrInputDocument doc) { + public SolrDocument toSolrDocument(final SolrInputDocument doc) { SolrDocument sd = new SolrDocument(); for (SolrInputField field: doc) { if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema @@ -280,7 +289,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate()); if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate()); if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash()); - if ((allAttr || contains(CollectionSchema.referrer_id_txt)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_txt, new String[]{ASCII.String(md.referrerHash())}); + if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash())); if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5()); if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher()); if ((allAttr || contains(CollectionSchema.language_s)) && md.language() != null) add(doc, CollectionSchema.language_s, UTF8.String(md.language())); @@ -328,9 +337,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public SolrVector yacy2solr( final String id, final String[] collections, final ResponseHeader responseHeader, - final Document document, Condenser condenser, DigestURI referrerURL, String language, - IndexCell citations, - WebgraphConfiguration webgraph) { + final Document document, final Condenser condenser, final DigestURI referrerURL, final String language, + final IndexCell citations, + final WebgraphConfiguration webgraph) { // we use the SolrCell design as index schema SolrVector doc = new SolrVector(); final DigestURI digestURI = document.dc_source(); @@ -356,6 +365,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index } + if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) { + processTypes.add(ProcessType.CITATION); // postprocessing needed + } + if (allAttr || contains(CollectionSchema.ip_s)) { final InetAddress address = digestURI.getInetAddress(); if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress()); @@ -778,7 +791,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate); if (allAttr || contains(CollectionSchema.fresh_date_dt)) 
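The new block in yacy2solr() tags a document with ProcessType.CITATION whenever the three cr_* fields are enabled, so the later postprocessing pass can select exactly the documents that still need a ranking value. For illustration, the backlog of such documents could be checked with connector calls that already appear in this patch (a sketch, not part of the change):

    final SolrConnector connector = segment.fulltext().getDefaultConnector();
    final long pending = connector.getCountByQuery(
            CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString());
    Log.logInfo("CollectionConfiguration", "documents waiting for citation ranking: " + pending);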
add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash()); - if ((allAttr || contains(CollectionSchema.referrer_id_txt)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_txt, new String[]{ASCII.String(referrerURL.hash())}); + if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash())); //if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]); if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher()); if ((allAttr || contains(CollectionSchema.language_s)) && language != null) add(doc, CollectionSchema.language_s, language); @@ -812,60 +825,264 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param urlCitation * @return */ - public void postprocessing(Segment segment) { + public void postprocessing(final Segment segment) { if (!this.contains(CollectionSchema.process_sxt)) return; if (!segment.connectedCitation()) return; SolrConnector connector = segment.fulltext().getDefaultConnector(); - // that means we must search for those entries. connector.commit(true); // make sure that we have latest information that can be found - //BlockingQueue docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10); - BlockingQueue docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50); + ReferenceReportCache rrCache = segment.getReferenceReportCache(); + Map ranking = new TreeMap(Base64Order.enhancedCoder); + try { + // collect hosts from index which shall take part in citation computation + ReversibleScoreMap hostscore = connector.getFacets(CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(), 10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName()); + if (hostscore == null) hostscore = new ClusteredScoreMap(); + // for each host, do a citation rank computation + for (String host: hostscore.keyList(true)) { + if (hostscore.get(host) <= 0) continue; + // select all documents for each host + CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6); + int convergence_attempts = 0; + while (convergence_attempts++ < 30) { + if (crh.convergenceStep()) break; + } + Log.logInfo("CollectionConfiguration.CRHost", "convergence for host " + host + " after " + convergence_attempts + " steps"); + // we have now the cr for all documents of a specific host; we store them for later use + Map crn = crh.normalize(); + crh.log(crn); + ranking.putAll(crn); // accumulate this here for usage in document update later + } + } catch (IOException e2) { + } + // process all documents + BlockingQueue docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50); SolrDocument doc; - int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0; - - Map hostExtentCache = new HashMap(); + int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0; + Map hostExtentCache = new HashMap(); // a mapping from the host id to the number of documents which contain this host-id 
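The fresh_date_dt context line above encodes a proxy-TTL style heuristic: a page that had not changed for some time before it was loaded is assumed to stay fresh for half that time again. A worked example with invented dates (parse() is a placeholder for any date parser):

    final Date modDate  = parse("2012-12-01"); // last modification reported by the server
    final Date loadDate = parse("2012-12-11"); // loaded ten days later
    final long ttl = Math.max(0, loadDate.getTime() - modDate.getTime()) / 2; // five days in milliseconds
    final Date freshUntil = new Date(loadDate.getTime() + ttl);               // considered fresh until 2012-12-16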
try { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { // for each to-be-processed entry work on the process tag Collection proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName()); - for (Object tag: proctags) { + + try { + DigestURI url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()))); + byte[] id = url.hash(); + SolrInputDocument sid = this.toSolrInputDocument(doc); - try { - DigestURI url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()))); - SolrInputDocument sid = this.toSolrInputDocument(doc); + for (Object tag: proctags) { // switch over tag types ProcessType tagtype = ProcessType.valueOf((String) tag); if (tagtype == ProcessType.CLICKDEPTH) { if (postprocessing_clickdepth(segment, doc, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++; } - - // refresh the link count; it's 'cheap' to do this here - String hosthash = url.hosthash(); - if (!hostExtentCache.containsKey(hosthash)) { - StringBuilder q = new StringBuilder(); - q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200"); - long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString()); - hostExtentCache.put(hosthash, count); + + if (tagtype == ProcessType.CITATION) { + CRV crv = ranking.get(id); + if (crv != null) { + sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count); + sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr); + sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn); + proccount_citationchange++; + } } - if (postprocessing_references(segment, doc, sid, url, hostExtentCache)) proccount_referencechange++; - - // all processing steps checked, remove the processing tag - sid.removeField(CollectionSchema.process_sxt.getSolrFieldName()); - // send back to index - connector.add(sid); - proccount++; - } catch (Throwable e1) { } + // refresh the link count; it's 'cheap' to do this here + String hosthash = url.hosthash(); + if (!hostExtentCache.containsKey(hosthash)) { + StringBuilder q = new StringBuilder(); + q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200"); + long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString()); + hostExtentCache.put(hosthash, count); + } + if (postprocessing_references(segment.fulltext(), rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++; + + // all processing steps checked, remove the processing tag + sid.removeField(CollectionSchema.process_sxt.getSolrFieldName()); + + // send back to index + //connector.deleteById(ASCII.String(id)); + connector.add(sid); + proccount++; + } catch (Throwable e1) { } } - Log.logInfo("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + proccount_clickdepthchange + " clickdepth values changed, " + proccount_referencechange + " reference-count values changed."); + Log.logInfo("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + + proccount_clickdepthchange + " clickdepth changes, " + + 
proccount_referencechange + " reference-count changes," + + proccount_citationchange + " citation ranking changes."); } catch (InterruptedException e) { } } + + private static final class CRV { + public double cr; + public int crn, count; + public CRV(final int count, final double cr, final int crn) {this.count = count; this.cr = cr; this.crn = crn;} + public String toString() { + return "count=" + count + ", cr=" + cr + ", crn=" + crn; + } + } + + /** + * The CRHost class is a container for all ranking values of a specific host. + * Objects of that class are needed as an environment for repeated convergenceStep() computations, + * which are iterative citation rank computations that are repeated until the ranking values + * converge to stable values. + * The class also contains normalization methods to compute simple integer ranking values out of the + * double relevance values. + */ + private static final class CRHost { + private final Segment segment; + private final Map crt; + private final int cr_host_count; + private final RowHandleMap internal_links_counter; + private double damping; + private int converge_eq_factor; + private ReferenceReportCache rrCache; + public CRHost(final Segment segment, final ReferenceReportCache rrCache, final String host, final double damping, final int converge_digits) { + this.segment = segment; + this.damping = damping; + this.rrCache = rrCache; + this.converge_eq_factor = (int) Math.pow(10.0d, converge_digits); + SolrConnector connector = segment.fulltext().getDefaultConnector(); + this.crt = new TreeMap(Base64Order.enhancedCoder); + try { + // select all documents for each host + BlockingQueue ids = connector.concurrentIDsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"", 0, 1000000, 600000); + String id; + while ((id = ids.take()) != AbstractSolrConnector.POISON_ID) { + crt.put(ASCII.getBytes(id), new double[]{0.0d,0.0d}); //{old value, new value} + } + } catch (InterruptedException e2) { + } + this.cr_host_count = crt.size(); + double initval = 1.0d / cr_host_count; + for (Map.Entry entry: this.crt.entrySet()) entry.getValue()[0] = initval; + this.internal_links_counter = new RowHandleMap(12, Base64Order.enhancedCoder, 8, 100, "internal_links_counter"); + } + /** + * produce a map from IDs to CRV records, normalization entries containing the values that are stored to solr. 
+ * @return + */ + public Map normalize() { + TreeMap> reorder = new TreeMap>(); + for (Map.Entry entry: crt.entrySet()) { + Double d = entry.getValue()[0]; + List ds = reorder.get(d); + if (ds == null) {ds = new ArrayList(); reorder.put(d, ds);} + ds.add(entry.getKey()); + } + int nextcount = (this.cr_host_count + 1) / 2; + int nextcrn = 0; + Map r = new TreeMap(Base64Order.enhancedCoder); + while (reorder.size() > 0) { + int count = nextcount; + while (reorder.size() > 0 && count > 0) { + Map.Entry> next = reorder.pollFirstEntry(); + List ids = next.getValue(); + count -= ids.size(); + double cr = next.getKey(); + for (byte[] id: ids) r.put(id, new CRV(this.cr_host_count, cr, nextcrn)); + } + nextcrn++; + nextcount = Math.max(1, (nextcount + count + 1) / 2); + } + // finally, increase the crn number in such a way that the maximum is always 10 + int inc = 11 - nextcrn; // nextcrn is +1 + for (Map.Entry entry: r.entrySet()) entry.getValue().crn += inc; + return r; + } + /** + * log out a complete CRHost set of urls and ranking values + * @param rm + */ + public void log(final Map rm) { + // print out all urls with their cr-values + SolrConnector connector = segment.fulltext().getDefaultConnector(); + for (Map.Entry entry: rm.entrySet()) { + try { + String url = (String) connector.getDocumentById(ASCII.String(entry.getKey()), CollectionSchema.sku.getSolrFieldName()).getFieldValue(CollectionSchema.sku.getSolrFieldName()); + Log.logInfo("CollectionConfiguration.CRHost", "CR for " + url); + Log.logInfo("CollectionConfiguration.CRHost", ">> " + entry.getValue().toString()); + } catch (IOException e) { + Log.logException(e); + } + } + } + /** + * Calculate the number of internal links from a specific document, denoted by the document ID. + * This is a very important attribute for the ranking computation because it is the dividend for the previous ranking attribute. + * The internalLinks value will be requested several times for the same id during the convergenceStep()-steps; therefore it should use a cache. + * This cache is part of the CRHost data structure. + * @param id + * @return the number of links from the document, denoted by the ID to documents within the same domain + */ + public int getInternalLinks(final byte[] id) { + int il = (int) this.internal_links_counter.get(id); + if (il >= 0) return il; + try { + SolrDocument doc = this.segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(id), CollectionSchema.inboundlinkscount_i.getSolrFieldName()); + if (doc == null) { + this.internal_links_counter.put(id, 0); + return 0; + } + Object x = doc.getFieldValue(CollectionSchema.inboundlinkscount_i.getSolrFieldName()); + il = (x == null) ? 0 : (x instanceof Integer) ? ((Integer) x).intValue() : (x instanceof Long) ? ((Long) x).intValue() : 0; + this.internal_links_counter.put(id, il); + return il; + } catch (IOException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + try {this.internal_links_counter.put(id, 0);} catch (SpaceExceededException e) {} + return 0; + } + /** + * Use the crt cache to compute the next generation of crt values. 
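The halving scheme in normalize() is easiest to see with concrete numbers. A simplified sketch for a host with 100 documents and no cr ties, mirroring the nextcount/nextcrn arithmetic above:

    int remaining = 100;                      // cr_host_count of the example host
    int nextcount = (remaining + 1) / 2;      // first bucket: the weaker half
    int buckets = 0;
    while (remaining > 0) {
        final int taken = Math.min(nextcount, remaining);
        System.out.println("crn bucket " + buckets + ": " + taken + " documents"); // 50, 25, 13, 7, 4, 1
        remaining -= taken;
        nextcount = Math.max(1, (nextcount + 1) / 2);
        buckets++;
    }
    System.out.println("shift: " + (11 - buckets)); // 5, so the stored cr_host_norm_i values run from 5 to 10

The shift always pins the strongest documents of a host at cr_host_norm_i = 10; hosts with fewer buckets simply start their lowest bucket above 0.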
+ * @return + */ + public boolean convergenceStep() { + boolean convergence = true; + double df = (1.0d - damping) / this.cr_host_count; + try { + for (Map.Entry entry: crt.entrySet()) { + byte[] id = entry.getKey(); + ReferenceReport rr = this.rrCache.getReferenceReport(id, false); + // sum up the cr of the internal links + HandleSet iids = rr.getInternallIDs(); + double ncr = 0.0d; + for (byte[] iid: iids) { + int ilc = getInternalLinks(iid); + if (ilc > 0) { // if (ilc == 0) then the reference report is wrong! + ncr += this.crt.get(iid)[0] / ilc; + } + } + ncr = df + damping * ncr; + if (convergence && !eqd(ncr, entry.getValue()[0])) convergence = false; + entry.getValue()[1] = ncr; + } + // after the loop, replace the old value with the new value in crt + for (Map.Entry entry: crt.entrySet()) { + entry.getValue()[0] = entry.getValue()[1]; + } + } catch (IOException e) { + } + return convergence; + } + /** + * helper method to check if two doubles are equal using a specific number of digits + * @param a + * @param b + * @return + */ + private boolean eqd(final double a, final double b) { + return ((int) (a * this.converge_eq_factor)) == ((int) (b * this.converge_eq_factor)); + } + } /** * this method compresses a list of protocol names to an indexed list. @@ -876,7 +1093,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param protocol * @return a list of indexed protocol entries */ - private static List protocolList2indexedList(List protocol) { + private static List protocolList2indexedList(final List protocol) { List a = new ArrayList(); String p; for (int i = 0; i < protocol.size(); i++) { diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 129f862cf..5681915c9 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -51,7 +51,9 @@ public enum CollectionSchema implements SchemaDeclaration { httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. 
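convergenceStep() iterates the classic damped citation-rank recurrence cr_i = (1-d)/N + d * sum(cr_j / L_j) over all same-host documents j linking to i, where L_j is the host-internal out-degree of j (getInternalLinks) and d = 0.85; the crt map keeps an {old, new} pair per document so each generation is computed from the previous one. A self-contained toy version on three invented pages of one host, detached from Solr and the citation index:

    public class ToyCitationRank {
        public static void main(final String[] args) {
            final double d = 0.85d;                    // damping, same value as in postprocessing()
            final int n = 3;                           // three pages on one host
            final int[][] inlinks = {{2}, {}, {0, 1}}; // inlinks[i]: pages that link to page i (invented)
            final int[] outcount  = {1, 1, 1};         // host-internal out-degree of each page
            double[] cr = {1.0 / n, 1.0 / n, 1.0 / n}; // uniform start, like the crt initialisation
            for (int step = 0; step < 30; step++) {    // the patch also caps the iteration at 30 steps
                final double[] next = new double[n];
                for (int i = 0; i < n; i++) {
                    double sum = 0.0d;
                    for (final int j : inlinks[i]) sum += cr[j] / outcount[j];
                    next[i] = (1.0d - d) / n + d * sum;
                }
                cr = next;                             // the patch copies 'new' over 'old' in the crt map instead
            }
            for (int i = 0; i < n; i++) System.out.println("page " + i + ": cr = " + cr[i]);
        }
    }

The real implementation additionally stops early once two consecutive generations agree to six decimal digits (eqd() with converge_eq_factor = 10^6).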
\"200\" for ok), -1 if not loaded"), references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"), - references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host as referenced url"), + references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"), + references_internal_id_sxt(SolrType.string, true, true, true, false, true, "ids of unique http references from same host to referenced url"), + references_internal_url_sxt(SolrType.string, true, true, true, false, true, "urls of unique http references from same host to referenced url"), references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"), references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"), clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"), @@ -60,7 +62,7 @@ public enum CollectionSchema implements SchemaDeclaration { // optional but recommended, part of index distribution load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"), fresh_date_dt(SolrType.date, true, true, false, false, false, "date until resource shall be considered as fresh"), - referrer_id_txt(SolrType.string, true, true, true, false, false, "ids of referrer to this document"),// byte[] referrerHash(); + referrer_id_s(SolrType.string, true, true, false, false, false, "id of the referrer to this document, discovered during crawling"),// byte[] referrerHash(); publisher_t(SolrType.text_general, true, true, false, false, true, "the name of the publisher of the document"),// String dc_publisher(); language_s(SolrType.string, true, true, false, false, false, "the language used in the document"),// byte[] language(); audiolinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to audio resources"),// int laudio(); @@ -184,6 +186,11 @@ public enum CollectionSchema implements SchemaDeclaration { opengraph_url_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"), opengraph_image_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"), + // link structure for ranking + cr_host_count_i(SolrType.num_integer, true, true, false, false, false, "the number of documents within a single host"), + cr_host_chance_d(SolrType.num_double, true, true, false, false, false, "the chance to click on this page when randomly clicking on links within on one host"), + cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10"), + // special values; can only be used if '_val' type is defined in schema file; this is not standard bold_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in bold_txt"), italic_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in italic_txt"),