Patch the citation index for links with canonical tags.

This shall fulfill the following requirement: If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C. To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links.
11 years ago · 101a6e6e14
parent ba3c173077
commit 101a6e6e14
4 changed files with 78 additions and 22 deletions
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@ -133,7 +133,7 @@ public final class CrawlStacker {
        // this is the method that is called by the busy thread from outside
        if (entry == null) return null;

-        // record the link graph for this request
+        // record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument
        byte[] anchorhash = entry.url().hash();
        IndexCell<CitationReference> urlCitationIndex = this.indexSegment.urlCitation();
        if (urlCitationIndex != null && entry.referrerhash() != null) try {
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@ -32,6 +32,7 @@ import java.net.MalformedURLException;
 import java.util.Collection;
 import java.util.Date;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
@ -43,6 +44,7 @@ import org.apache.solr.common.SolrInputDocument;

 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
@ -619,7 +621,7 @@ public class Segment {
        char docType = Response.docType(document.dc_format());
        
        // CREATE SOLR DOCUMENT
-        final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration(), sourceName);
+        final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName);
        
        // ENRICH DOCUMENT WITH RANKING INFORMATION
        if (this.connectedCitation()) {
@ -628,21 +630,45 @@ public class Segment {
        // STORE TO SOLR
        String error = null;
        this.putDocumentInQueue(vector);
-        if (this.fulltext.writeToWebgraph()) {
-            tryloop: for (int i = 0; i < 20; i++) {
-                try {
-                    error = null;
-                    this.fulltext.putEdges(vector.getWebgraphDocuments());
-                    break tryloop;
-                } catch (final IOException e ) {
-                    error = "failed to send " + urlNormalform + " to solr: " + e.getMessage();
-                    ConcurrentLog.warn("SOLR", error);
-                    if (i == 10) this.fulltext.commit(true);
-                    try {Thread.sleep(1000);} catch (final InterruptedException e1) {}
-                    continue tryloop;
+        List<SolrInputDocument> webgraph = vector.getWebgraphDocuments();
+        if (webgraph != null && webgraph.size() > 0) {
+            
+            // write the edges to the webgraph solr index
+            if (this.fulltext.writeToWebgraph()) {
+                tryloop: for (int i = 0; i < 20; i++) {
+                    try {
+                        error = null;
+                        this.fulltext.putEdges(webgraph);
+                        break tryloop;
+                    } catch (final IOException e ) {
+                        error = "failed to send " + urlNormalform + " to solr: " + e.getMessage();
+                        ConcurrentLog.warn("SOLR", error);
+                        if (i == 10) this.fulltext.commit(true);
+                        try {Thread.sleep(1000);} catch (final InterruptedException e1) {}
+                        continue tryloop;
+                    }
+                }
+            }
+        
+            // write the edges to the citation reference index
+            if (this.connectedCitation()) try {
+                // normal links
+                for (SolrInputDocument edge: webgraph) {
+                    String referrerhash = (String) edge.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
+                    String anchorhash = (String) edge.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
+                    if (referrerhash != null && anchorhash != null) {
+                        urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
+                    }
                }
+                // media links as well!
+                for (AnchorURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime()));
+                for (AnchorURL audio: document.getAudiolinks().keySet()) urlCitationIndex.add(audio.hash(), new CitationReference(url.hash(), loadDate.getTime()));
+                for (AnchorURL video: document.getVideolinks().keySet()) urlCitationIndex.add(video.hash(), new CitationReference(url.hash(), loadDate.getTime()));
+            } catch (Throwable e) {
+                ConcurrentLog.logException(e);
            }
        }
+        
        if (error != null) {
            ConcurrentLog.severe("SOLR", error + ", PLEASE REPORT TO bugs.yacy.net");
            //Switchboard.getSwitchboard().pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, error);
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -76,6 +76,7 @@ import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.index.RowHandleMap;
 import net.yacy.kelondro.rwi.IndexCell;
+import net.yacy.kelondro.rwi.ReferenceContainer;
 import net.yacy.kelondro.util.Bitfield;
 import net.yacy.search.index.Segment;
 import net.yacy.search.index.Segment.ReferenceReport;
@ -339,7 +340,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
    public SolrVector yacy2solr(
            final Map<String, Pattern> collections, final ResponseHeader responseHeader,
            final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
-            final IndexCell<CitationReference> citations,
            final WebgraphConfiguration webgraph, final String sourceName) {
        // we use the SolrCell design as index schema
        SolrVector doc = new SolrVector();
@ -353,7 +353,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        String us = digestURL.toNormalform(true);

        int clickdepth = 999;
-        if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
+        if ((allAttr || contains(CollectionSchema.clickdepth_i))) {
            if (digestURL.probablyRootURL()) {
                clickdepth = 0;
            } else {
@ -818,7 +818,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        // create a subgraph
        if (!containsCanonical) {
            // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
-            webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations, sourceName);
+            webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), sourceName);
        }
            
        // list all links
@ -897,8 +897,40 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                    CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
                    10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
            if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
-            // for each host, do a citation rank computation
+
            for (String host: hostscore.keyList(true)) {
+                // Patch the citation index for links with canonical tags.
+                // This shall fulfill the following requirement:
+                // If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
+                // To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
+                BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(
+                        CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]",
+                        0, 10000000, 60000L, 50,
+                        CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
+                SolrDocument doc_B;
+                try {
+                    while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                        // find all documents which link to the canonical doc
+                        DigestURL doc_C_url = new DigestURL((String) doc_B.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()));
+                        byte[] doc_B_id = ASCII.getBytes(((String) doc_B.getFieldValue(CollectionSchema.id.getSolrFieldName())));
+                        // we remove all references to B, because these become references to C
+                        ReferenceContainer<CitationReference> doc_A_ids = segment.urlCitation().remove(doc_B_id);
+                        if (doc_A_ids == null) {
+                            //System.out.println("*** document with canonical but no referrer: " + doc_B.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+                            continue; // the document has a canonical tag but no referrer?
+                        }
+                        Iterator<CitationReference> doc_A_ids_iterator = doc_A_ids.entries();
+                        // for each of the referrer A of B, set A as a referrer of C
+                        while (doc_A_ids_iterator.hasNext()) {
+                            CitationReference doc_A_citation = doc_A_ids_iterator.next();
+                            segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
+                        }
+                    }
+                } catch (InterruptedException e) {
+                } catch (SpaceExceededException e) {
+                }
+                
+                // do the citation rank computation
                if (hostscore.get(host) <= 0) continue;
                // select all documents for each host
                CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6);
--- a/source/net/yacy/search/schema/WebgraphConfiguration.java
+++ b/source/net/yacy/search/schema/WebgraphConfiguration.java
@ -55,8 +55,6 @@ import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.parser.html.ImageEntry;
-import net.yacy.kelondro.data.citation.CitationReference;
-import net.yacy.kelondro.rwi.IndexCell;
 import net.yacy.search.index.Segment;

 public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@ -117,7 +115,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
            final Subgraph subgraph,
            final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
            final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
-            final IndexCell<CitationReference> citations, final String sourceName) {
+            final String sourceName) {
        boolean allAttr = this.isEmpty();
        int target_order = 0;
        boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
@ -268,7 +266,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
            }

            if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
-                if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && citations != null) {
+                if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) {
                    if (target_url.probablyRootURL()) {
                        boolean lc = this.lazy; this.lazy = false;
                        add(edge, WebgraphSchema.target_clickdepth_i, 0);