From c2f62e783f3b60b7391b9847b97ad076115b7c26 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 17 Apr 2014 12:54:18 +0200 Subject: [PATCH] - better subgraph handling, less overhead for crawls without the webgraph - usage of crawler crawldepth cache for the linkgraph target depth computation --- .../schema/CollectionConfiguration.java | 52 +++++++++++++++++-- .../search/schema/WebgraphConfiguration.java | 44 +++++++--------- 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index fce4aefd5..1fc0b9d31 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -45,9 +45,11 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; + import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailType; @@ -85,7 +87,7 @@ import net.yacy.search.index.Segment; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.query.QueryParams; -import net.yacy.search.schema.WebgraphConfiguration.Subgraph; + import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; @@ -332,6 +334,35 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } + public static class Subgraph { + 
public final ArrayList[] urlProtocols, urlStubs, urlAnchorTexts; + @SuppressWarnings("unchecked") + public Subgraph(int inboundSize, int outboundSize) { + this.urlProtocols = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.urlStubs = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.urlAnchorTexts = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + } + } + + public static boolean enrichSubgraph(final Subgraph subgraph, final DigestURL source_url, AnchorURL target_url) { + final String text = target_url.getTextProperty(); // the text between the tag + String source_host = source_url.getHost(); + String target_host = target_url.getHost(); + boolean inbound = + (source_host == null && target_host == null) || + (source_host != null && target_host != null && + (target_host.equals(source_host) || + target_host.equals("www." + source_host) || + source_host.equals("www." + target_host))); // well, not everybody defines 'outbound' that way but however, thats used here. + final String target_url_string = target_url.toNormalform(false); + int pr_target = target_url_string.indexOf("://",0); + int ioidx = inbound ? 
0 : 1; + subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target)); + subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3)); + subgraph.urlAnchorTexts[ioidx].add(text); + return inbound; + } + /** * a SolrVector is a SolrInputDocument with the ability * to store also the webgraph that is associated with @@ -845,11 +876,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // create a subgraph if (!containsCanonical && webgraph != null) { // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document - webgraph.addEdges(subgraph, digestURL, responseHeader, collections, crawldepth, images, document.getAnchors(), sourceName); + List edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, images, document.getAnchors(), sourceName); + // this also enriched the subgraph + doc.webgraphDocuments.addAll(edges); + } else { + if (allAttr || + contains(CollectionSchema.inboundlinks_protocol_sxt) || + contains(CollectionSchema.inboundlinks_urlstub_sxt) || + contains(CollectionSchema.inboundlinks_anchortext_txt) || + contains(CollectionSchema.outboundlinks_protocol_sxt) || + contains(CollectionSchema.outboundlinks_urlstub_sxt) || + contains(CollectionSchema.outboundlinks_anchortext_txt)) { + for (final AnchorURL target_url: document.getAnchors()) { + enrichSubgraph(subgraph, digestURL, target_url); + } + } } - // list all links - doc.webgraphDocuments.addAll(subgraph.edges); + // attach the subgraph content if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]); if (allAttr || contains(CollectionSchema.inboundlinks_anchortext_txt)) add(doc, 
CollectionSchema.inboundlinks_anchortext_txt, subgraph.urlAnchorTexts[0]); diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index f04aa0230..a32d7b27d 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -51,9 +51,11 @@ import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.crawler.HostBalancer; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; +import net.yacy.search.schema.CollectionConfiguration.Subgraph; public class WebgraphConfiguration extends SchemaConfiguration implements Serializable { @@ -98,19 +100,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } } - public static class Subgraph { - public final ArrayList[] urlProtocols, urlStubs, urlAnchorTexts; - public final ArrayList edges; - @SuppressWarnings("unchecked") - public Subgraph(int inboundSize, int outboundSize) { - this.urlProtocols = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; - this.urlStubs = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; - this.urlAnchorTexts = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; - this.edges = new ArrayList(inboundSize + outboundSize); - } - } - - public void addEdges( + public List getEdges( final Subgraph subgraph, final DigestURL source, final ResponseHeader responseHeader, Map collections, int crawldepth_source, final List images, final Collection links, @@ -118,14 +108,16 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial boolean allAttr = this.isEmpty(); boolean generalNofollow = responseHeader == null ? 
false : responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0; int target_order = 0; + List edges = new ArrayList(); for (final AnchorURL target_url: links) { SolrInputDocument edge = getEdge( subgraph, source, responseHeader, collections, crawldepth_source, images, sourceName, allAttr, generalNofollow, target_order, target_url); target_order++; // add the edge to the subgraph - subgraph.edges.add(edge); + edges.add(edge); } + return edges; } public SolrInputDocument getEdge( @@ -140,13 +132,6 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial String rel = target_url.getRelProperty(); // the rel-attribute String source_host = source_url.getHost(); String target_host = target_url.getHost(); - boolean inbound = - (source_host == null && target_host == null) || - (source_host != null && target_host != null && - (target_host.equals(source_host) || - target_host.equals("www." + source_host) || - source_host.equals("www." + target_host))); // well, not everybody defines 'outbound' that way but however, thats used here. - int ioidx = inbound ? 
0 : 1; if (generalNofollow) { // patch the rel attribute since the header makes nofollow valid for all links if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; @@ -223,10 +208,11 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial // parse text to find images and clear text ContentScraper textContent = null; - try {textContent = htmlParser.parseToScraper(source_url, null, text, 10);} catch (IOException e) {} + try {textContent = htmlParser.parseToScraper(source_url, /* responseHeader is nullable (getEdges checks it for null); without a header fall back to the previous behaviour of passing no charset */ responseHeader == null ? null : responseHeader.getCharacterEncoding(), text, 10);} catch (IOException e) {} String extractedText = textContent.getText(); // add the source attributes about the target + boolean inbound = CollectionConfiguration.enrichSubgraph(subgraph, source_url, target_url); if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : ""); if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? 
rel : ""); @@ -248,9 +234,6 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.target_id_s, target_id); final String target_url_string = target_url.toNormalform(false); int pr_target = target_url_string.indexOf("://",0); - subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target)); - subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3)); - subgraph.urlAnchorTexts[ioidx].add(text); if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target)); if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3)); Map target_searchpart = target_url.getSearchpartMap(); @@ -289,7 +272,16 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } if ((allAttr || contains(WebgraphSchema.target_crawldepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { - add(edge, WebgraphSchema.target_crawldepth_i, 999); + if (target_host == null ? source_host == null : target_host.equals(source_host)) { /* null-safe host comparison: getHost() may return null; two host-less URLs count as the same host, as in the inbound rule */ + // get the crawl depth from the crawler directly + Long targetdepth = HostBalancer.depthCache.get(target_url.hash()); + // if the depth is not known yet then this link configuration implies that it is on the next crawl level + add(edge, WebgraphSchema.target_crawldepth_i, targetdepth == null ? crawldepth_source + 1 : targetdepth.intValue()); + } else { + // if the target host is not the same as the source host, the interpretation of the crawl depth as the click depth fails + // in this case we mark the target depth with a special value for that case, 1111 + add(edge, WebgraphSchema.target_crawldepth_i, 1111); + } } if (allAttr || contains(WebgraphSchema.process_sxt)) {