Adjust MultiProtocolURL protocol detection to handle mailto links with "://" in their parameters,

and feed only hyperlinks (instead of all anchors) to webgraph processing.
9 years ago · 45b9bd8403
parent 67f64af4b4
commit 45b9bd8403
4 changed files with 14 additions and 4 deletions
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@ -377,7 +377,7 @@ public class Crawler_p {
                        try {
                            scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent);
                            // get links and generate filter
-                            for (DigestURL u: scraper.getAnchors()) {
+                            for (DigestURL u: scraper.getHyperlinks().keySet()) {
                                newRootURLs.add(u);
                            }
                        } catch (final IOException e) {
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -202,7 +202,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
            url = "file://" + url;
        }

-        int p = url.indexOf("://");
+        int p = url.lastIndexOf("://",5); // lastindexof to look only at the begin of url, up to "https://",
        if (p < 0) {
            if (url.length() > 7 && url.substring(0,7).equalsIgnoreCase("mailto:")) {
                p = 6;
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -430,6 +430,11 @@ dc_rights
        return sentences;
    }

+    /**
+     * All anchor links of the document
+     * (this includes mailto links)
+     * @return all links embedded as anchors (clickable entities)
+     */
    public Collection<AnchorURL> getAnchors() {
        // returns all links embedded as anchors (clickeable entities)
        // this is a url(String)/text(String) map
@ -445,6 +450,11 @@ dc_rights

    // the next three methods provide a calculated view on the getAnchors/getImages:

+    /**
+     * List of links to resources (pages, images, files, media ...)
+     * (Hyperlinks do not include mailto: links)
+     * @return a subset of the getAnchors() set: only links to other hyperlinked resources
+     */
    public Map<AnchorURL, String> getHyperlinks() {
        // this is a subset of the getAnchor-set: only links to other hyperrefs
        if (!this.resorted) resortLinks();
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -965,7 +965,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        Boolean canonical_equal_sku = canonical == null ? null : canonical.toNormalform(true).equals(url);  
        if (webgraph != null && (!containsCanonical || (canonical_equal_sku != null && (canonical_equal_sku.booleanValue())))) {
            // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
-            List<SolrInputDocument> edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getAnchors(), sourceName);
+            List<SolrInputDocument> edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getHyperlinks().keySet(), sourceName);
            // this also enriched the subgraph
            doc.webgraphDocuments.addAll(edges);
        } else {
@ -976,7 +976,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                contains(CollectionSchema.outboundlinks_protocol_sxt) ||
                contains(CollectionSchema.outboundlinks_urlstub_sxt) ||
                contains(CollectionSchema.outboundlinks_anchortext_txt)) {
-                for (final AnchorURL target_url: document.getAnchors()) {
+                for (final AnchorURL target_url: document.getHyperlinks().keySet()) {
                    enrichSubgraph(subgraph, digestURL, target_url);
                }
            }