diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 9862cf8c4..decd7eaac 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -437,13 +437,13 @@ host_extent_i ## citation ranking ## the number of documents within a single host -cr_host_count_i +#cr_host_count_i ## the chance to click on this page when randomly clicking on links within on one host -cr_host_chance_d +#cr_host_chance_d ## normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10 -cr_host_norm_i +#cr_host_norm_i ## custom rating; to be set with external rating information rating_i diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema index 0ba0cce70..f0a9dc987 100644 --- a/defaults/solr.webgraph.schema +++ b/defaults/solr.webgraph.schema @@ -75,7 +75,7 @@ source_id_s #source_clickdepth_i ## copy of the citation rank norm value from the source link -source_cr_host_norm_i +#source_cr_host_norm_i ## host of the url (source) @@ -176,7 +176,7 @@ target_path_folders_sxt #target_clickdepth_i ## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host -target_cr_host_norm_i +#target_cr_host_norm_i ## host of the url (target) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index d9d052f38..94c6e9ec9 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -898,17 +898,33 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (!this.contains(CollectionSchema.process_sxt)) return 0; if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0; SolrConnector collectionConnector = segment.fulltext().getDefaultConnector(); - SolrConnector webgraphConnector = segment.fulltext().useWebgraph() ? segment.fulltext().getWebgraphConnector() : null; collectionConnector.commit(false); // make sure that we have latest information that can be found - if (webgraphConnector != null) webgraphConnector.commit(false); - Map ranking = new TreeMap(Base64Order.enhancedCoder); - ReversibleScoreMap hostscore = null; + if (segment.fulltext().useWebgraph()) segment.fulltext().getWebgraphConnector().commit(false); + CollectionConfiguration collection = segment.fulltext().getDefaultConfiguration(); + WebgraphConfiguration webgraph = segment.fulltext().getWebgraphConfiguration(); + + + // collect hosts from index which shall take part in citation computation + String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + + CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(); + ReversibleScoreMap hostscore; try { - // collect hosts from index which shall take part in citation computation - String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + - CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(); hostscore = collectionConnector.getFacets(query, 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName()); + } catch (final IOException e2) { + ConcurrentLog.logException(e2); + hostscore = new ClusteredScoreMap(); + } + + // create the ranking map + Map ranking = null; + if ((segment.fulltext().useWebgraph() && + ((webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) || + (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i))) || + (collection.contains(CollectionSchema.cr_host_count_i) && + collection.contains(CollectionSchema.cr_host_chance_d) && + collection.contains(CollectionSchema.cr_host_norm_i)))) try { ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts"); + ranking = new TreeMap(Base64Order.enhancedCoder); int countcheck = 0; for (String host: hostscore.keyList(true)) { // Patch the citation index for links with canonical tags. @@ -966,40 +982,42 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } if (hostscore.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + hostscore.size() + ", counted=" + countcheck); } catch (final IOException e2) { + ConcurrentLog.logException(e2); hostscore = new ClusteredScoreMap(); } // process all documents at the webgraph for the outgoing links of this document SolrDocument doc; - if (webgraphConnector != null) { + if (segment.fulltext().useWebgraph()) { try { for (String host: hostscore.keyList(true)) { if (hostscore.get(host) <= 0) continue; // select all webgraph edges and modify their cr value - String query = "{!raw f=" + WebgraphSchema.source_host_s.getSolrFieldName() + "}" + host; - long count = webgraphConnector.getCountByQuery(query); + query = "{!raw f=" + WebgraphSchema.source_host_s.getSolrFieldName() + "}" + host; + long count = segment.fulltext().getWebgraphConnector().getCountByQuery(query); ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph"); - BlockingQueue docs = webgraphConnector.concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100); + BlockingQueue docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100); int countcheck = 0; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - boolean changed = false; - SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null); - byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName())); - CRV crv = ranking.get(id); - if (crv != null) { - sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn); - changed = true; + SolrInputDocument sid = webgraph.toSolrInputDocument(doc, null); + if (webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) { + byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName())); + CRV crv = ranking.get(id); + if (crv != null) { + sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn); + } } - id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName())); - crv = ranking.get(id); - if (crv != null) { - sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn); - changed = true; + if (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)) { + byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName())); + CRV crv = ranking.get(id); + if (crv != null) { + sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn); + } } - if (changed) try { + try { sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName()); sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName()); - webgraphConnector.add(sid); + segment.fulltext().getWebgraphConnector().add(sid); } catch (SolrException e) { ConcurrentLog.logException(e); } catch (IOException e) { @@ -1017,7 +1035,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } // process all documents in collection - String query = (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + + query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]"; int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0; Map hostExtentCache = new HashMap(); // a mapping from the host id to the number of documents which contain this host-id @@ -1044,7 +1062,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i, 100)) proccount_clickdepthchange++; } - if (tagtype == ProcessType.CITATION) { + if (tagtype == ProcessType.CITATION && + collection.contains(CollectionSchema.cr_host_count_i) && + collection.contains(CollectionSchema.cr_host_chance_d) && + collection.contains(CollectionSchema.cr_host_norm_i)) { CRV crv = ranking.get(id); if (crv != null) { sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count); diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index dcf47277b..9ac463a6c 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -214,7 +214,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.source_path_folders_count_i, paths.length); add(edge, WebgraphSchema.source_path_folders_sxt, paths); } - if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { + if (this.contains(WebgraphSchema.source_clickdepth_i) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source); if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH); } @@ -336,7 +336,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial // switch over tag types ProcessType tagtype = ProcessType.valueOf((String) tag); if (tagtype == ProcessType.CLICKDEPTH) { - if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { + if (this.contains(WebgraphSchema.source_clickdepth_i) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { protocol = (String) doc.getFieldValue(WebgraphSchema.source_protocol_s.getSolrFieldName()); urlstub = (String) doc.getFieldValue(WebgraphSchema.source_urlstub_s.getSolrFieldName()); id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); @@ -347,7 +347,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } //ConcurrentLog.info("WebgraphConfiguration", "postprocessing webgraph source id " + id + ", url=" + protocol + "://" + urlstub + ", result: " + (changed ? "changed" : "not changed")); } - if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { + if (this.contains(WebgraphSchema.target_clickdepth_i) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { protocol = (String) doc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()); urlstub = (String) doc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName()); id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());