Adjust MultiProtocolURL.protocol detection to handle mailto links with "://" in parameters,

and feed hyperlinks (instead of all anchors) to webgraph processing.
pull/37/head
reger 9 years ago
parent 67f64af4b4
commit 45b9bd8403

@ -377,7 +377,7 @@ public class Crawler_p {
try {
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent);
// get links and generate filter
for (DigestURL u: scraper.getAnchors()) {
for (DigestURL u: scraper.getHyperlinks().keySet()) {
newRootURLs.add(u);
}
} catch (final IOException e) {

@ -202,7 +202,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
url = "file://" + url;
}
int p = url.indexOf("://");
int p = url.lastIndexOf("://",5); // lastIndexOf with fromIndex 5 only finds "://" at the beginning of the url (at index 5 or earlier, i.e. up to "https://")
if (p < 0) {
if (url.length() > 7 && url.substring(0,7).equalsIgnoreCase("mailto:")) {
p = 6;

@ -430,6 +430,11 @@ dc_rights
return sentences;
}
/**
* All anchor links of the document
* (this includes mailto links)
* @return all links embedded as anchors (clickable entities)
*/
public Collection<AnchorURL> getAnchors() {
// returns all links embedded as anchors (clickable entities)
// this is a url(String)/text(String) map
@ -445,6 +450,11 @@ dc_rights
// the next three methods provide a calculated view on the getAnchors/getImages:
/**
* List of links to resources (pages, images, files, media ...)
* (Hyperlinks do not include mailto: links)
* @return a subset of the getAnchors() set: only links to other hyperrefs
*/
public Map<AnchorURL, String> getHyperlinks() {
// this is a subset of the getAnchors() set: only links to other hyperrefs
if (!this.resorted) resortLinks();

@ -965,7 +965,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Boolean canonical_equal_sku = canonical == null ? null : canonical.toNormalform(true).equals(url);
if (webgraph != null && (!containsCanonical || (canonical_equal_sku != null && (canonical_equal_sku.booleanValue())))) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
List<SolrInputDocument> edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getAnchors(), sourceName);
List<SolrInputDocument> edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getHyperlinks().keySet(), sourceName);
// this also enriched the subgraph
doc.webgraphDocuments.addAll(edges);
} else {
@ -976,7 +976,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
contains(CollectionSchema.outboundlinks_protocol_sxt) ||
contains(CollectionSchema.outboundlinks_urlstub_sxt) ||
contains(CollectionSchema.outboundlinks_anchortext_txt)) {
for (final AnchorURL target_url: document.getAnchors()) {
for (final AnchorURL target_url: document.getHyperlinks().keySet()) {
enrichSubgraph(subgraph, digestURL, target_url);
}
}

Loading…
Cancel
Save