From 654801523e73ce04db347cea35b092278841a711 Mon Sep 17 00:00:00 2001 From: luccioman Date: Tue, 9 May 2017 18:32:47 +0200 Subject: [PATCH] Fixed StringIndexOutOfBoundsException case. Revealed by commit c77e43a: since that commit, the exception was thrown when indexing pages containing mailto: scheme URL links with the Solr Webgraph core enabled. Fixed the error case and restored filtering on mailto links in Document.resortLinks() as these URLs still should not appear in Document.hyperlinks. --- source/net/yacy/document/Document.java | 4 ++++ .../net/yacy/search/schema/WebgraphConfiguration.java | 10 ++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 19337c593..b2f6f003b 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -560,6 +560,10 @@ dc_rights if (url == null) continue; u = url.toNormalform(true); final String name = url.getNameProperty(); + // check mailto scheme first (not supposed to get into in/outboundlinks or hyperlinks -> crawler can't process) + if (url.getProtocol().equals("mailto")) { + continue; + } final boolean noindex = url.getRelProperty().toLowerCase().indexOf("noindex",0) >= 0; final boolean nofollow = url.getRelProperty().toLowerCase().indexOf("nofollow",0) >= 0; diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 6dcd31c46..c589c0861 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -157,9 +157,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial // add the source attributes add(edge, WebgraphSchema.source_id_s, source_id); - int pr_source = source_url_string.indexOf("://",0); - if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, 
source_url_string.substring(0, pr_source)); - if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3)); + if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url.getProtocol()); + if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url.urlstub(true, true)); Map source_searchpart = source_url.getSearchpartMap(); if (source_searchpart == null) { if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0); @@ -217,9 +216,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial // add the target attributes add(edge, WebgraphSchema.target_id_s, target_id); final String target_url_string = target_url.toNormalform(false); - int pr_target = target_url_string.indexOf("://",0); - if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target)); - if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3)); + if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url.getProtocol()); + if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url.urlstub(true, true)); Map target_searchpart = target_url.getSearchpartMap(); if (target_searchpart == null) { if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0);