diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 79f87177a..d38af935d 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -133,7 +133,7 @@ public final class CrawlStacker { // this is the method that is called by the busy thread from outside if (entry == null) return null; - // record the link graph for this request + // record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument byte[] anchorhash = entry.url().hash(); IndexCell urlCitationIndex = this.indexSegment.urlCitation(); if (urlCitationIndex != null && entry.referrerhash() != null) try { diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 6ebb21dc1..361dd4b61 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -32,6 +32,7 @@ import java.net.MalformedURLException; import java.util.Collection; import java.util.Date; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -43,6 +44,7 @@ import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; @@ -619,7 +621,7 @@ public class Segment { char docType = Response.docType(document.dc_format()); // CREATE SOLR DOCUMENT - final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration(), sourceName); + final CollectionConfiguration.SolrVector vector = 
this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName); // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.connectedCitation()) { @@ -628,21 +630,45 @@ public class Segment { // STORE TO SOLR String error = null; this.putDocumentInQueue(vector); - if (this.fulltext.writeToWebgraph()) { - tryloop: for (int i = 0; i < 20; i++) { - try { - error = null; - this.fulltext.putEdges(vector.getWebgraphDocuments()); - break tryloop; - } catch (final IOException e ) { - error = "failed to send " + urlNormalform + " to solr: " + e.getMessage(); - ConcurrentLog.warn("SOLR", error); - if (i == 10) this.fulltext.commit(true); - try {Thread.sleep(1000);} catch (final InterruptedException e1) {} - continue tryloop; + List webgraph = vector.getWebgraphDocuments(); + if (webgraph != null && webgraph.size() > 0) { + + // write the edges to the webgraph solr index + if (this.fulltext.writeToWebgraph()) { + tryloop: for (int i = 0; i < 20; i++) { + try { + error = null; + this.fulltext.putEdges(webgraph); + break tryloop; + } catch (final IOException e ) { + error = "failed to send " + urlNormalform + " to solr: " + e.getMessage(); + ConcurrentLog.warn("SOLR", error); + if (i == 10) this.fulltext.commit(true); + try {Thread.sleep(1000);} catch (final InterruptedException e1) {} + continue tryloop; + } + } + } + + // write the edges to the citation reference index + if (this.connectedCitation()) try { + // normal links + for (SolrInputDocument edge: webgraph) { + String referrerhash = (String) edge.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); + String anchorhash = (String) edge.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()); + if (referrerhash != null && anchorhash != null) { + urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime())); + } } + // media links as well! 
+ for (AnchorURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime())); + for (AnchorURL audio: document.getAudiolinks().keySet()) urlCitationIndex.add(audio.hash(), new CitationReference(url.hash(), loadDate.getTime())); + for (AnchorURL video: document.getVideolinks().keySet()) urlCitationIndex.add(video.hash(), new CitationReference(url.hash(), loadDate.getTime())); + } catch (Throwable e) { + ConcurrentLog.logException(e); } } + if (error != null) { ConcurrentLog.severe("SOLR", error + ", PLEASE REPORT TO bugs.yacy.net"); //Switchboard.getSwitchboard().pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, error); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 73e45874a..45a902c73 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -76,6 +76,7 @@ import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.RowHandleMap; import net.yacy.kelondro.rwi.IndexCell; +import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.Bitfield; import net.yacy.search.index.Segment; import net.yacy.search.index.Segment.ReferenceReport; @@ -339,7 +340,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public SolrVector yacy2solr( final Map collections, final ResponseHeader responseHeader, final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, - final IndexCell citations, final WebgraphConfiguration webgraph, final String sourceName) { // we use the SolrCell design as index schema SolrVector doc = new SolrVector(); @@ -353,7 +353,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri String us = 
digestURL.toNormalform(true); int clickdepth = 999; - if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) { + if ((allAttr || contains(CollectionSchema.clickdepth_i))) { if (digestURL.probablyRootURL()) { clickdepth = 0; } else { @@ -818,7 +818,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // create a subgraph if (!containsCanonical) { // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document - webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations, sourceName); + webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), sourceName); } // list all links @@ -897,8 +897,40 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(), 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName()); if (hostscore == null) hostscore = new ClusteredScoreMap(); - // for each host, do a citation rank computation + for (String host: hostscore.keyList(true)) { + // Patch the citation index for links with canonical tags. + // This shall fulfill the following requirement: + // If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C. 
+ // To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links + BlockingQueue documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery( + CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]", + 0, 10000000, 60000L, 50, + CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName()); + SolrDocument doc_B; + try { + while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + // find all documents which link to the canonical doc + DigestURL doc_C_url = new DigestURL((String) doc_B.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName())); + byte[] doc_B_id = ASCII.getBytes(((String) doc_B.getFieldValue(CollectionSchema.id.getSolrFieldName()))); + // we remove all references to B, because these become references to C + ReferenceContainer doc_A_ids = segment.urlCitation().remove(doc_B_id); + if (doc_A_ids == null) { + //System.out.println("*** document with canonical but no referrer: " + doc_B.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + continue; // the document has a canonical tag but no referrer? 
+ } + Iterator doc_A_ids_iterator = doc_A_ids.entries(); + // for each referrer A of B, set A as a referrer of C + while (doc_A_ids_iterator.hasNext()) { + CitationReference doc_A_citation = doc_A_ids_iterator.next(); + segment.urlCitation().add(doc_C_url.hash(), doc_A_citation); + } + } + } catch (InterruptedException e) { + } catch (SpaceExceededException e) { + } + + // do the citation rank computation if (hostscore.get(host) <= 0) continue; // select all documents for each host CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6); diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 09667711f..d0b6d3a18 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -55,8 +55,6 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.citation.CitationReference; -import net.yacy.kelondro.rwi.IndexCell; import net.yacy.search.index.Segment; public class WebgraphConfiguration extends SchemaConfiguration implements Serializable { @@ -117,7 +115,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial final Subgraph subgraph, final DigestURL source, final ResponseHeader responseHeader, Map collections, int clickdepth_source, final List images, final boolean inbound, final Collection links, - final IndexCell citations, final String sourceName) { + final String sourceName) { boolean allAttr = this.isEmpty(); int target_order = 0; boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0; @@ -268,7 +266,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && 
this.contains(WebgraphSchema.target_id_s)) { - if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && citations != null) { + if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) { if (target_url.probablyRootURL()) { boolean lc = this.lazy; this.lazy = false; add(edge, WebgraphSchema.target_clickdepth_i, 0);