Patch the citation index for links with canonical tags.

This shall fulfill the following requirement:
If a document A links to B and B contains a 'canonical C', then the
citation rank computation shall consider that A links to C and B does
not link to C.
To do so, we must first collect all canonical links, find all references
to them, get the anchor list of the documents, and patch the citation
references of these links.
pull/1/head
Michael Peter Christen 11 years ago
parent ba3c173077
commit 101a6e6e14

@ -133,7 +133,7 @@ public final class CrawlStacker {
// this is the method that is called by the busy thread from outside
if (entry == null) return null;
// record the link graph for this request
// record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument
byte[] anchorhash = entry.url().hash();
IndexCell<CitationReference> urlCitationIndex = this.indexSegment.urlCitation();
if (urlCitationIndex != null && entry.referrerhash() != null) try {

@ -32,6 +32,7 @@ import java.net.MalformedURLException;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
@ -43,6 +44,7 @@ import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
@ -619,7 +621,7 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration(), sourceName);
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {
@ -628,21 +630,45 @@ public class Segment {
// STORE TO SOLR
String error = null;
this.putDocumentInQueue(vector);
if (this.fulltext.writeToWebgraph()) {
tryloop: for (int i = 0; i < 20; i++) {
try {
error = null;
this.fulltext.putEdges(vector.getWebgraphDocuments());
break tryloop;
} catch (final IOException e ) {
error = "failed to send " + urlNormalform + " to solr: " + e.getMessage();
ConcurrentLog.warn("SOLR", error);
if (i == 10) this.fulltext.commit(true);
try {Thread.sleep(1000);} catch (final InterruptedException e1) {}
continue tryloop;
List<SolrInputDocument> webgraph = vector.getWebgraphDocuments();
if (webgraph != null && webgraph.size() > 0) {
// write the edges to the webgraph solr index
if (this.fulltext.writeToWebgraph()) {
tryloop: for (int i = 0; i < 20; i++) {
try {
error = null;
this.fulltext.putEdges(webgraph);
break tryloop;
} catch (final IOException e ) {
error = "failed to send " + urlNormalform + " to solr: " + e.getMessage();
ConcurrentLog.warn("SOLR", error);
if (i == 10) this.fulltext.commit(true);
try {Thread.sleep(1000);} catch (final InterruptedException e1) {}
continue tryloop;
}
}
}
// write the edges to the citation reference index
if (this.connectedCitation()) try {
// normal links
for (SolrInputDocument edge: webgraph) {
String referrerhash = (String) edge.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
String anchorhash = (String) edge.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
if (referrerhash != null && anchorhash != null) {
urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
}
}
// media links as well!
for (AnchorURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (AnchorURL audio: document.getAudiolinks().keySet()) urlCitationIndex.add(audio.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (AnchorURL video: document.getVideolinks().keySet()) urlCitationIndex.add(video.hash(), new CitationReference(url.hash(), loadDate.getTime()));
} catch (Throwable e) {
ConcurrentLog.logException(e);
}
}
if (error != null) {
ConcurrentLog.severe("SOLR", error + ", PLEASE REPORT TO bugs.yacy.net");
//Switchboard.getSwitchboard().pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, error);

@ -76,6 +76,7 @@ import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.RowHandleMap;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
@ -339,7 +340,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
public SolrVector yacy2solr(
final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
final IndexCell<CitationReference> citations,
final WebgraphConfiguration webgraph, final String sourceName) {
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
@ -353,7 +353,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String us = digestURL.toNormalform(true);
int clickdepth = 999;
if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
if ((allAttr || contains(CollectionSchema.clickdepth_i))) {
if (digestURL.probablyRootURL()) {
clickdepth = 0;
} else {
@ -818,7 +818,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// create a subgraph
if (!containsCanonical) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations, sourceName);
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), sourceName);
}
// list all links
@ -897,8 +897,40 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
// for each host, do a citation rank computation
for (String host: hostscore.keyList(true)) {
// Patch the citation index for links with canonical tags.
// This shall fulfill the following requirement:
// If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
// To do so, we must first collect all canonical links, find all references to them, get the anchor list of the documents, and patch the citation references of these links.
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(
CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]",
0, 10000000, 60000L, 50,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
SolrDocument doc_B;
try {
while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// find all documents which link to the canonical doc
DigestURL doc_C_url = new DigestURL((String) doc_B.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()));
byte[] doc_B_id = ASCII.getBytes(((String) doc_B.getFieldValue(CollectionSchema.id.getSolrFieldName())));
// we remove all references to B, because these become references to C
ReferenceContainer<CitationReference> doc_A_ids = segment.urlCitation().remove(doc_B_id);
if (doc_A_ids == null) {
//System.out.println("*** document with canonical but no referrer: " + doc_B.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
continue; // the document has a canonical tag but no referrer?
}
Iterator<CitationReference> doc_A_ids_iterator = doc_A_ids.entries();
// for each referrer A of B, set A as a referrer of C
while (doc_A_ids_iterator.hasNext()) {
CitationReference doc_A_citation = doc_A_ids_iterator.next();
segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
}
}
} catch (InterruptedException e) {
} catch (SpaceExceededException e) {
}
// do the citation rank computation
if (hostscore.get(host) <= 0) continue;
// select all documents for each host
CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6);

@ -55,8 +55,6 @@ import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.search.index.Segment;
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@ -117,7 +115,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
final IndexCell<CitationReference> citations, final String sourceName) {
final String sourceName) {
boolean allAttr = this.isEmpty();
int target_order = 0;
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
@ -268,7 +266,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && citations != null) {
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) {
if (target_url.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(edge, WebgraphSchema.target_clickdepth_i, 0);

Loading…
Cancel
Save