- better subgraph handling, less overhead for crawls without the

webgraph
- usage of crawler crawldepth cache for the linkgraph target depth
computation
pull/1/head
Michael Peter Christen 11 years ago
parent 06afb568e2
commit c2f62e783f

@ -45,9 +45,11 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailType;
@ -85,7 +87,7 @@ import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
@ -332,6 +334,35 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
public static class Subgraph {
public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
@SuppressWarnings("unchecked")
public Subgraph(int inboundSize, int outboundSize) {
this.urlProtocols = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.urlStubs = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.urlAnchorTexts = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
}
}
public static boolean enrichSubgraph(final Subgraph subgraph, final DigestURL source_url, AnchorURL target_url) {
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
String source_host = source_url.getHost();
String target_host = target_url.getHost();
boolean inbound =
(source_host == null && target_host == null) ||
(source_host != null && target_host != null &&
(target_host.equals(source_host) ||
target_host.equals("www." + source_host) ||
source_host.equals("www." + target_host))); // well, not everybody defines 'outbound' that way but however, thats used here.
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
int ioidx = inbound ? 0 : 1;
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
subgraph.urlAnchorTexts[ioidx].add(text);
return inbound;
}
/**
* a SolrVector is a SolrInputDocument with the ability
* to store also the webgraph that is associated with
@ -845,11 +876,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// create a subgraph
if (!containsCanonical && webgraph != null) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, crawldepth, images, document.getAnchors(), sourceName);
List<SolrInputDocument> edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, images, document.getAnchors(), sourceName);
// this also enriched the subgraph
doc.webgraphDocuments.addAll(edges);
} else {
if (allAttr ||
contains(CollectionSchema.inboundlinks_protocol_sxt) ||
contains(CollectionSchema.inboundlinks_urlstub_sxt) ||
contains(CollectionSchema.inboundlinks_anchortext_txt) ||
contains(CollectionSchema.outboundlinks_protocol_sxt) ||
contains(CollectionSchema.outboundlinks_urlstub_sxt) ||
contains(CollectionSchema.outboundlinks_anchortext_txt)) {
for (final AnchorURL target_url: document.getAnchors()) {
enrichSubgraph(subgraph, digestURL, target_url);
}
}
}
// list all links
doc.webgraphDocuments.addAll(subgraph.edges);
// attach the subgraph content
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]);
if (allAttr || contains(CollectionSchema.inboundlinks_anchortext_txt)) add(doc, CollectionSchema.inboundlinks_anchortext_txt, subgraph.urlAnchorTexts[0]);

@ -51,9 +51,11 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HostBalancer;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.search.schema.CollectionConfiguration.Subgraph;
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@ -98,19 +100,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
}
public static class Subgraph {
public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
public final ArrayList<SolrInputDocument> edges;
@SuppressWarnings("unchecked")
public Subgraph(int inboundSize, int outboundSize) {
this.urlProtocols = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.urlStubs = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.urlAnchorTexts = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.edges = new ArrayList<SolrInputDocument>(inboundSize + outboundSize);
}
}
public void addEdges(
public List<SolrInputDocument> getEdges(
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int crawldepth_source,
final List<ImageEntry> images, final Collection<AnchorURL> links,
@ -118,14 +108,16 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
boolean allAttr = this.isEmpty();
boolean generalNofollow = responseHeader == null ? false : responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
int target_order = 0;
List<SolrInputDocument> edges = new ArrayList<SolrInputDocument>();
for (final AnchorURL target_url: links) {
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, crawldepth_source, images,
sourceName, allAttr, generalNofollow, target_order, target_url);
target_order++;
// add the edge to the subgraph
subgraph.edges.add(edge);
edges.add(edge);
}
return edges;
}
public SolrInputDocument getEdge(
@ -140,13 +132,6 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
String rel = target_url.getRelProperty(); // the rel-attribute
String source_host = source_url.getHost();
String target_host = target_url.getHost();
boolean inbound =
(source_host == null && target_host == null) ||
(source_host != null && target_host != null &&
(target_host.equals(source_host) ||
target_host.equals("www." + source_host) ||
source_host.equals("www." + target_host))); // well, not everybody defines 'outbound' that way but however, thats used here.
int ioidx = inbound ? 0 : 1;
if (generalNofollow) {
// patch the rel attribute since the header makes nofollow valid for all links
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
@ -223,10 +208,11 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
// parse text to find images and clear text
ContentScraper textContent = null;
try {textContent = htmlParser.parseToScraper(source_url, null, text, 10);} catch (IOException e) {}
try {textContent = htmlParser.parseToScraper(source_url, responseHeader.getCharacterEncoding(), text, 10);} catch (IOException e) {}
String extractedText = textContent.getText();
// add the source attributes about the target
boolean inbound = CollectionConfiguration.enrichSubgraph(subgraph, source_url, target_url);
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
@ -248,9 +234,6 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
subgraph.urlAnchorTexts[ioidx].add(text);
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
Map<String, String> target_searchpart = target_url.getSearchpartMap();
@ -289,7 +272,16 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
if ((allAttr || contains(WebgraphSchema.target_crawldepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
add(edge, WebgraphSchema.target_crawldepth_i, 999);
if (target_host.equals(source_host)) {
// get the crawl depth from the crawler directly
Long targetdepth = HostBalancer.depthCache.get(target_url.hash());
// if the depth is not known yet then this link configuration implies that it is on the next crawl level
add(edge, WebgraphSchema.target_crawldepth_i, targetdepth == null ? crawldepth_source + 1 : targetdepth.intValue());
} else {
// if the target host is not the same as the source host, the interpretation of the crawl depth as the click depth fails
// in this case we mark the target depth with a special value for that case, 1111
add(edge, WebgraphSchema.target_crawldepth_i, 1111);
}
}
if (allAttr || contains(WebgraphSchema.process_sxt)) {

Loading…
Cancel
Save