Merge branch 'master' into crawlexpert-post

Jens Bertram 11 years ago
commit 85316b3ac6

@@ -74,6 +74,10 @@ source_id_s
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
## copy of the citation rank norm value from the source link
source_cr_host_norm_i
## host of the url (source)
#source_host_s
@@ -171,6 +175,10 @@ target_path_folders_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
#target_clickdepth_i
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
target_cr_host_norm_i
## host of the url (target)
#target_host_s

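For orientation: in these Solr schema default files, lines starting with ## carry a field's description, a field name prefixed with a single # is listed but disabled, and a bare field name is active. Listing source_cr_host_norm_i and target_cr_host_norm_i without the leading # is therefore what switches the two new citation-rank fields on (assuming the file follows the same convention as YaCy's other defaults/*.schema files).
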
@@ -24,6 +24,8 @@ import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import org.openjena.atlas.logging.Log;
import net.yacy.cora.util.CommonPattern;
import net.yacy.search.schema.CollectionSchema;
@@ -75,16 +77,22 @@ public class Ranking {
* @param boostDef the definition string
*/
public void updateBoosts(String boostDef) {
// call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
// call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description_txt^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
if (boostDef == null || boostDef.length() == 0) return;
String[] bf = CommonPattern.COMMA.split(boostDef);
this.fieldBoosts.clear();
for (String boost: bf) {
int p = boost.indexOf('^');
if (p < 0) continue;
CollectionSchema field = CollectionSchema.valueOf(boost.substring(0, p));
Float factor = Float.parseFloat(boost.substring(p + 1));
this.fieldBoosts.put(field, factor);
String boostkey = boost.substring(0, p);
try {
CollectionSchema field = CollectionSchema.valueOf(boostkey);
Float factor = Float.parseFloat(boost.substring(p + 1));
this.fieldBoosts.put(field, factor);
} catch (IllegalArgumentException e) {
// boostkey is unknown; ignore it but print a warning
Log.warn("Ranking", "unknown boost key '" + boostkey + "'");
}
}
}

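For illustration, a minimal standalone sketch of the tolerant parsing that the updateBoosts change introduces, using a plain Map<String, Float> instead of the CollectionSchema enum (class and field names here are made up for the example):

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class BoostParser {
        // Parse "field^factor,field^factor,..."; entries without '^' or with an
        // unparseable factor are skipped instead of aborting the whole definition,
        // mirroring how the patched updateBoosts skips unknown schema keys.
        public static Map<String, Float> parse(String boostDef) {
            Map<String, Float> boosts = new LinkedHashMap<>();
            if (boostDef == null || boostDef.isEmpty()) return boosts;
            for (String boost : boostDef.split(",")) {
                int p = boost.indexOf('^');
                if (p < 0) continue;
                try {
                    boosts.put(boost.substring(0, p), Float.parseFloat(boost.substring(p + 1)));
                } catch (NumberFormatException e) {
                    // malformed factor; ignore this entry but keep the rest
                }
            }
            return boosts;
        }

        public static void main(String[] args) {
            System.out.println(parse("sku^20.0,title^15.0,broken^x"));
            // prints {sku=20.0, title=15.0}
        }
    }
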
@@ -33,6 +33,7 @@ import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
@@ -78,6 +79,34 @@ public class SchemaConfiguration extends Configuration implements Serializable {
}
}
/**
* Convert a SolrDocument to a SolrInputDocument.
* This is useful if a document from the search index is to be modified and indexed again.
* This should be used as a replacement for ClientUtils.toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process.
* @param doc the solr document
* @param omitFields fields which shall not be copied into the input document
* @return a solr input document
*/
public SolrInputDocument toSolrInputDocument(final SolrDocument doc, Set<String> omitFields) {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && (omitFields == null || !omitFields.contains(name))) { // check each field if enabled in local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
}
}
return sid;
}
public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omitFields) {
SolrDocument sd = new SolrDocument();
for (SolrInputField field: doc) {
if (this.contains(field.getName()) && (omitFields == null || !omitFields.contains(field.getName()))) { // check each field if enabled in local Solr schema
sd.setField(field.getName(), field.getValue());
}
}
return sd;
}
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT

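The two new helpers exist so that a stored document can be pulled out of the index, changed and written back without copying fields that Solr fills in itself. A hedged, SolrJ-only sketch of that round trip (it mirrors the helper above but omits the local-schema check; the field names and the commented-out client call are assumptions, not part of this commit):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.solr.common.SolrDocument;
    import org.apache.solr.common.SolrInputDocument;

    public class ReindexSketch {
        // Copy every field except those in omitFields (typically values the indexing
        // process regenerates, such as the Solr-managed _version_ field).
        static SolrInputDocument toInput(SolrDocument doc, Set<String> omitFields) {
            SolrInputDocument sid = new SolrInputDocument();
            for (String name : doc.getFieldNames()) {
                if (omitFields == null || !omitFields.contains(name)) {
                    sid.setField(name, doc.getFieldValue(name));
                }
            }
            return sid;
        }

        public static void main(String[] args) {
            SolrDocument found = new SolrDocument();        // stands in for a query result
            found.setField("id", "abc");
            found.setField("_version_", 1234L);              // Solr-managed, must not be copied back
            SolrInputDocument sid = toInput(found, new HashSet<>(Arrays.asList("_version_")));
            sid.setField("title", "updated title");          // hypothetical modification
            // solrClient.add(sid);                          // then re-index (client call omitted)
            System.out.println(sid.getFieldNames());         // [id, title]
        }
    }
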
@@ -48,8 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;

@@ -589,7 +589,7 @@ public class Response {
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
final Date ifModifiedSince = this.requestHeader.ifModifiedSince();
final Date ifModifiedSince = this.ifModifiedSince();
if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
// parse date
Date d = this.responseHeader.lastModified();

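The freshness test here follows the usual HTTP rule: a page counts as unchanged if its Last-Modified date is not newer than the If-Modified-Since date of the request. A small sketch of that comparison with plain java.util.Date values (outside of YaCy's Response class):

    import java.util.Date;

    public class FreshnessSketch {
        // true if the served page is not newer than the copy the requester already has
        static boolean unchangedSince(Date lastModified, Date ifModifiedSince) {
            if (lastModified == null || ifModifiedSince == null) return false; // cannot decide
            return !lastModified.after(ifModifiedSince);
        }

        public static void main(String[] args) {
            Date cachedCopy = new Date(1700000000000L);
            Date servedPage = new Date(1600000000000L);
            System.out.println(unchangedSince(servedPage, cachedCopy)); // true
        }
    }
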
@@ -84,8 +84,8 @@ import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@@ -169,32 +169,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
}
/**
* Convert a SolrDocument to a SolrInputDocument.
* This is useful if a document from the search index shall be modified and indexed again.
* This shall be used as replacement of ClientUtils.toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process.
* @param doc the solr document
* @return a solr input document
*/
public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
}
}
return sid;
return toSolrInputDocument(doc, omitFields);
}
public SolrDocument toSolrDocument(final SolrInputDocument doc) {
SolrDocument sd = new SolrDocument();
for (SolrInputField field: doc) {
if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema
sd.setField(field.getName(), field.getValue());
}
}
return sd;
return toSolrDocument(doc, omitFields);
}
/**
@@ -691,7 +671,23 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_s)) {
final DigestURL canonical = html.getCanonical();
DigestURL canonical = html.getCanonical();
// if there is no canonical in the html then look into the http header:
if (canonical == null) {
String link = responseHeader.get("Link", null);
int p;
if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
link = link.substring(0, p).trim();
p = link.indexOf('<');
int q = link.lastIndexOf('>');
if (p >= 0 && q > 0) {
link = link.substring(p + 1, q);
try {
canonical = new DigestURL(link);
} catch (MalformedURLException e) {}
}
}
}
if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
containsCanonical = true;
inboundLinks.remove(canonical);
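
The Link header inspected above has the form: Link: <https://example.org/page>; rel="canonical". A hedged standalone sketch of the same substring-based extraction (like the patch, it handles a single link-value rather than a full RFC 5988 header):

    public class CanonicalFromLinkHeader {
        // Returns the URL enclosed in <...> in front of rel="canonical",
        // or null if the header value does not advertise a canonical link.
        static String canonical(String linkHeader) {
            if (linkHeader == null) return null;
            int p = linkHeader.indexOf("rel=\"canonical\"");
            if (p <= 0) return null;
            String link = linkHeader.substring(0, p).trim();
            int lt = link.indexOf('<');
            int gt = link.lastIndexOf('>');
            return (lt >= 0 && gt > lt) ? link.substring(lt + 1, gt) : null;
        }

        public static void main(String[] args) {
            System.out.println(canonical("<https://example.org/page>; rel=\"canonical\""));
            // prints https://example.org/page
        }
    }
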
@@ -888,16 +884,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
public int postprocessing(final Segment segment, String harvestkey) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation()) return 0;
SolrConnector connector = segment.fulltext().getDefaultConnector();
connector.commit(true); // make sure that we have latest information that can be found
SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
collectionConnector.commit(true); // make sure that we have latest information that can be found
ReferenceReportCache rrCache = segment.getReferenceReportCache();
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
ReversibleScoreMap<String> hostscore = null;
try {
// collect hosts from index which shall take part in citation computation
ReversibleScoreMap<String> hostscore = connector.getFacets(
hostscore = collectionConnector.getFacets(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
// for each host, do a citation rank computation
for (String host: hostscore.keyList(true)) {
@@ -915,14 +913,49 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
ranking.putAll(crn); // accumulate this here for usage in document update later
}
} catch (final IOException e2) {
hostscore = new ClusteredScoreMap<String>();
}
// process all documents
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
// process all documents at the webgraph for the outgoing links of this document
SolrDocument doc;
if (webgraphConnector != null) {
for (String host: hostscore.keyList(true)) {
if (hostscore.get(host) <= 0) continue;
// select all webgraph edges and modify their cr value
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
0, 10000000, 60000, 50);
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
boolean changed = false;
SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
}
id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
}
if (changed) try {
webgraphConnector.add(sid);
} catch (SolrException e) {
} catch (IOException e) {
}
}
} catch (final InterruptedException e) {}
}
}
// process all documents in collection
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
0, 10000, 60000, 50);
SolrDocument doc;
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
Set<String> uniqueURLs = new HashSet<String>();
@@ -976,7 +1009,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
// send back to index
//connector.deleteById(ASCII.String(id));
connector.add(sid);
collectionConnector.add(sid);
proccount++;
} catch (final Throwable e1) {
}

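In essence, the new webgraph pass copies a per-document citation rank onto every edge that starts or ends at a ranked document, which is what fills the new source_cr_host_norm_i and target_cr_host_norm_i fields. A minimal sketch of that propagation with plain maps (the Edge class and its fields are stand-ins for the webgraph documents, not YaCy types):

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class EdgeRankPropagation {
        static class Edge {
            final String sourceId, targetId;
            Integer sourceCrHostNorm, targetCrHostNorm; // mirrors source_cr_host_norm_i / target_cr_host_norm_i
            Edge(String sourceId, String targetId) { this.sourceId = sourceId; this.targetId = targetId; }
        }

        // For every edge, adopt the rank of its source document and, if known, of its target document.
        static void propagate(List<Edge> edges, Map<String, Integer> ranking) {
            for (Edge e : edges) {
                Integer crn = ranking.get(e.sourceId);
                if (crn != null) e.sourceCrHostNorm = crn;
                crn = ranking.get(e.targetId);
                if (crn != null) e.targetCrHostNorm = crn;
            }
        }

        public static void main(String[] args) {
            Map<String, Integer> ranking = new HashMap<>();
            ranking.put("docA", 7);                          // rank computed in the collection pass
            List<Edge> edges = Arrays.asList(new Edge("docA", "docB"), new Edge("docB", "docA"));
            propagate(edges, ranking);
            System.out.println(edges.get(0).sourceCrHostNorm + " / " + edges.get(1).targetCrHostNorm); // 7 / 7
        }
    }
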
@@ -52,6 +52,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
@@ -86,6 +87,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),
