From 45b9bd84038995f87ca93e71766e77d2a968a856 Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 21 Dec 2015 04:42:26 +0100 Subject: [PATCH] adjust MultiProtocolURL.protocol detection to handle mailto with "://" in parameters, and feeding hyperlinks to webgraph processing. --- htroot/Crawler_p.java | 2 +- source/net/yacy/cora/document/id/MultiProtocolURL.java | 2 +- source/net/yacy/document/Document.java | 10 ++++++++++ .../yacy/search/schema/CollectionConfiguration.java | 4 ++-- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 7b5d705ba..4f440a0ac 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -377,7 +377,7 @@ public class Crawler_p { try { scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent); // get links and generate filter - for (DigestURL u: scraper.getAnchors()) { + for (DigestURL u: scraper.getHyperlinks().keySet()) { newRootURLs.add(u); } } catch (final IOException e) { diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 681c96538..24907b8d3 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -202,7 +202,7 @@ public class MultiProtocolURL implements Serializable, Comparable 7 && url.substring(0,7).equalsIgnoreCase("mailto:")) { p = 6; diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index d58a2bca5..265cbf45a 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -430,6 +430,11 @@ dc_rights return sentences; } + /** + * All anchor links of the document + * (this includes mailto links) + * @return all links embedded as anchors (clickable entities) + */ public Collection getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map 
@@ -445,6 +450,11 @@ dc_rights // the next three methods provide a calculated view on the getAnchors/getImages: + /** + * List of links to resources (pages, images, files, media ...) + * (Hyperlinks do not include mailto: links) + * @return a subset of the getAnchor-set: only links to other hyperrefs + */ public Map getHyperlinks() { // this is a subset of the getAnchor-set: only links to other hyperrefs if (!this.resorted) resortLinks(); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 047aee7a0..7094a1a51 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -965,7 +965,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Boolean canonical_equal_sku = canonical == null ? null : canonical.toNormalform(true).equals(url); if (webgraph != null && (!containsCanonical || (canonical_equal_sku != null && (canonical_equal_sku.booleanValue())))) { // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document - List edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getAnchors(), sourceName); + List edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getHyperlinks().keySet(), sourceName); // this also enriched the subgraph doc.webgraphDocuments.addAll(edges); } else { @@ -976,7 +976,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri contains(CollectionSchema.outboundlinks_protocol_sxt) || contains(CollectionSchema.outboundlinks_urlstub_sxt) || contains(CollectionSchema.outboundlinks_anchortext_txt)) { - for (final AnchorURL target_url: document.getAnchors()) { + for (final AnchorURL target_url: document.getHyperlinks().keySet()) { enrichSubgraph(subgraph, 
digestURL, target_url); } }