diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema
index ceae4cd87..0ba0cce70 100644
--- a/defaults/solr.webgraph.schema
+++ b/defaults/solr.webgraph.schema
@@ -74,6 +74,10 @@ source_id_s
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
 #source_clickdepth_i

+## copy of the citation rank norm value from the source link
+source_cr_host_norm_i
+
+
 ## host of the url (source)
 #source_host_s

@@ -171,6 +175,10 @@ target_path_folders_sxt
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
 #target_clickdepth_i

+## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
+target_cr_host_norm_i
+
+
 ## host of the url (target)
 #target_host_s

diff --git a/source/net/yacy/cora/federate/solr/Ranking.java b/source/net/yacy/cora/federate/solr/Ranking.java
index dc20138e3..b61280ab0 100644
--- a/source/net/yacy/cora/federate/solr/Ranking.java
+++ b/source/net/yacy/cora/federate/solr/Ranking.java
@@ -24,6 +24,8 @@ import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.Set;

+import org.openjena.atlas.logging.Log;
+
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.search.schema.CollectionSchema;

@@ -75,16 +77,22 @@ public class Ranking {
      * @param boostDef the definition string
      */
     public void updateBoosts(String boostDef) {
-        // call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
+        // call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description_txt^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
         if (boostDef == null || boostDef.length() == 0) return;
         String[] bf = CommonPattern.COMMA.split(boostDef);
         this.fieldBoosts.clear();
         for (String boost: bf) {
             int p = boost.indexOf('^');
             if (p < 0) continue;
-            CollectionSchema field = CollectionSchema.valueOf(boost.substring(0, p));
-            Float factor = Float.parseFloat(boost.substring(p + 1));
-            this.fieldBoosts.put(field, factor);
+            String boostkey = boost.substring(0, p);
+            try {
+                CollectionSchema field = CollectionSchema.valueOf(boostkey);
+                Float factor = Float.parseFloat(boost.substring(p + 1));
+                this.fieldBoosts.put(field, factor);
+            } catch (IllegalArgumentException e) {
+                // boostkey is unknown; ignore it but print a warning
+                Log.warn("Ranking", "unknown boost key '" + boostkey + "'");
+            }
         }
     }

diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
index d976ae515..d1ad4301d 100644
--- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
+++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
@@ -33,6 +33,7 @@ import java.util.Set;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;

 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
@@ -78,6 +79,34 @@ public class SchemaConfiguration extends Configuration implements Serializable {
         }
     }

+    /**
+     * Convert a SolrDocument to a SolrInputDocument.
+     * This is useful if a document from the search index shall be modified and indexed again.
+     * This shall be used as a replacement for ClientUtils.toSolrInputDocument because we remove some fields
+     * which are created automatically during the indexing process.
+     * @param doc the solr document
+     * @return a solr input document
+     */
+    public SolrInputDocument toSolrInputDocument(final SolrDocument doc, Set<String> omitFields) {
+        SolrInputDocument sid = new SolrInputDocument();
+        for (String name: doc.getFieldNames()) {
+            if (this.contains(name) && (omitFields == null || !omitFields.contains(name))) { // check each field if enabled in local Solr schema
+                sid.addField(name, doc.getFieldValue(name), 1.0f);
+            }
+        }
+        return sid;
+    }
+
+    public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omitFields) {
+        SolrDocument sd = new SolrDocument();
+        for (SolrInputField field: doc) {
+            if (this.contains(field.getName()) && (omitFields == null || !omitFields.contains(field.getName()))) { // check each field if enabled in local Solr schema
+                sd.setField(field.getName(), field.getValue());
+            }
+        }
+        return sd;
+    }
+
     public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
         boolean changed = false;
         // FIND OUT IF THIS IS A DOUBLE DOCUMENT
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 54908e1df..79f87177a 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -48,8 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.NoticedURL;
-import net.yacy.crawler.data.ResultURLs;
-import net.yacy.crawler.data.ResultURLs.EventOrigin;
 import net.yacy.crawler.retrieval.FTPLoader;
 import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java
index 17b9d8c16..e6b859115 100644
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@@ -589,7 +589,7 @@ public class Response {

         // -if-modified-since in request
         // if the page is fresh at the very moment we can index it
-        final Date ifModifiedSince = this.requestHeader.ifModifiedSince();
+        final Date ifModifiedSince = this.ifModifiedSince();
         if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
             // parse date
             Date d = this.responseHeader.lastModified();
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 4a978f067..73e45874a 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -84,8 +84,8 @@ import net.yacy.search.query.QueryParams;
 import net.yacy.search.schema.WebgraphConfiguration.Subgraph;

 import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrInputField;

 public class CollectionConfiguration extends SchemaConfiguration implements Serializable {

@@ -169,32 +169,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
     }

-    /**
-     * Convert a SolrDocument to a SolrInputDocument.
-     * This is useful if a document from the search index shall be modified and indexed again.
-     * This shall be used as replacement of ClientUtils.toSolrInputDocument because we remove some fields
-     * which are created automatically during the indexing process.
-     * @param doc the solr document
-     * @return a solr input document
-     */
     public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
-        SolrInputDocument sid = new SolrInputDocument();
-        for (String name: doc.getFieldNames()) {
-            if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema
-                sid.addField(name, doc.getFieldValue(name), 1.0f);
-            }
-        }
-        return sid;
+        return toSolrInputDocument(doc, omitFields);
     }

     public SolrDocument toSolrDocument(final SolrInputDocument doc) {
-        SolrDocument sd = new SolrDocument();
-        for (SolrInputField field: doc) {
-            if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema
-                sd.setField(field.getName(), field.getValue());
-            }
-        }
-        return sd;
+        return toSolrDocument(doc, omitFields);
     }

     /**
@@ -691,7 +671,23 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri

         // canonical tag
         if (allAttr || contains(CollectionSchema.canonical_s)) {
-            final DigestURL canonical = html.getCanonical();
+            DigestURL canonical = html.getCanonical();
+            // if there is no canonical in the html then look into the http header:
+            if (canonical == null) {
+                String link = responseHeader.get("Link", null);
+                int p;
+                if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
+                    link = link.substring(0, p).trim();
+                    p = link.indexOf('<');
+                    int q = link.lastIndexOf('>');
+                    if (p >= 0 && q > 0) {
+                        link = link.substring(p + 1, q);
+                        try {
+                            canonical = new DigestURL(link);
+                        } catch (MalformedURLException e) {}
+                    }
+                }
+            }
             if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
                 containsCanonical = true;
                 inboundLinks.remove(canonical);
@@ -888,16 +884,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri

     public int postprocessing(final Segment segment, String harvestkey) {
         if (!this.contains(CollectionSchema.process_sxt)) return 0;
         if (!segment.connectedCitation()) return 0;
-        SolrConnector connector = segment.fulltext().getDefaultConnector();
-        connector.commit(true); // make sure that we have latest information that can be found
+        SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
+        SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
+        collectionConnector.commit(true); // make sure that we have latest information that can be found
         ReferenceReportCache rrCache = segment.getReferenceReportCache();
         Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
+        ReversibleScoreMap<String> hostscore = null;
         try {
             // collect hosts from index which shall take part in citation computation
-            ReversibleScoreMap<String> hostscore = connector.getFacets(
+            hostscore = collectionConnector.getFacets(
                     (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
-                    10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
+                    10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
             if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
             // for each host, do a citation rank computation
             for (String host: hostscore.keyList(true)) {
@@ -915,14 +913,49 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 ranking.putAll(crn); // accumulate this here for usage in document update later
             }
         } catch (final IOException e2) {
+            hostscore = new ClusteredScoreMap<String>();
         }

-        // process all documents
-        BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
+        // process all documents at the webgraph for the outgoing links of this document
+        SolrDocument doc;
+        if (webgraphConnector != null) {
+            for (String host: hostscore.keyList(true)) {
+                if (hostscore.get(host) <= 0) continue;
+                // select all webgraph edges and modify their cr value
+                BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
+                        WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
+                        0, 10000000, 60000, 50);
+                try {
+                    while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                        boolean changed = false;
+                        SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
+                        byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
+                        CRV crv = ranking.get(id);
+                        if (crv != null) {
+                            sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
+                            changed = true;
+                        }
+                        id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
+                        crv = ranking.get(id);
+                        if (crv != null) {
+                            sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
+                            changed = true;
+                        }
+                        if (changed) try {
+                            webgraphConnector.add(sid);
+                        } catch (SolrException e) {
+                        } catch (IOException e) {
+                        }
+                    }
+                } catch (final InterruptedException e) {}
+            }
+        }
+
+        // process all documents in collection
+        BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
                 (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
                 0, 10000, 60000, 50);
-        SolrDocument doc;
         int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
         Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
         Set<String> uniqueURLs = new HashSet<String>();

@@ -976,7 +1009,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri

                     // send back to index
                     //connector.deleteById(ASCII.String(id));
-                    connector.add(sid);
+                    collectionConnector.add(sid);
+                    proccount++;
                 } catch (final Throwable e1) {
                 }

diff --git a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java
index 2cd80994c..f5f0f3700 100644
--- a/source/net/yacy/search/schema/WebgraphSchema.java
+++ b/source/net/yacy/search/schema/WebgraphSchema.java
@@ -52,6 +52,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
     source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
     source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
     source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
+    source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),

     source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
     source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
@@ -86,6 +87,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
     target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
     target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
     target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
+    target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),

     target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
     target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),