From ca5437dd508401e403209633a30bf176da602db9 Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 28 May 2014 03:01:34 +0200 Subject: [PATCH] fix crawl of file:// URLs (see also http://mantis.tokeek.de/view.php?id=149). Local files can now be crawled (intranet mode). URL parsing is fixed according to RFC 1738 (for Unix and Windows): on Windows use URLs like file:///c:/tmp or file://localhost/c:/tmp; on Linux use URLs like file:///tmp or file://localhost/tmp. The host is ignored and the path must be absolute. --- htroot/Crawler_p.java | 10 ++++-- .../cora/document/id/MultiProtocolURL.java | 33 +++++++++---------- source/net/yacy/crawler/HostQueue.java | 5 ++- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index bf435f4f0..888ef9272 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -247,9 +247,13 @@ public class Crawler_p { if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr; // delete old robots entries - for (DigestURL ru: rootURLs) { + for (DigestURL ru : rootURLs) { sb.robots.delete(ru); - try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {} + try { + if (ru.getHost() != null) { // might be null for file:// + Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash()); + } + } catch (IOException e) {} } try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all. 
@@ -645,7 +649,7 @@ public class Crawler_p { prop.put("crawlProfilesShow_linkstructure", 0); if (count > 0) { // collect the host names for 'wide' crawls which can be visualized - boolean showLinkstructure = hosts.length() > 0; + boolean showLinkstructure = hosts.length() > 0 && !hosts.contains("file:"); if (showLinkstructure) { StringBuilder q = new StringBuilder(); hosts = hosts.substring(1); diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 8c5a01b11..840e7c2ee 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -211,26 +211,25 @@ public class MultiProtocolURL implements Serializable, Comparable 0 && h.charAt(0) == '/') { - final char c = h.charAt(2); - if (c == ':' || c == '|') - this.path = h.substring(1); - else - this.path = h; + this.path = h.substring(2); // "/path" or "/c:/path" + } else { // "//host/path" or "//host/c:/path" + int q = url.indexOf('/', p + 3); + if (q < 0) { + this.path = "/"; } else { - final char c = h.charAt(1); - if (c == ':' || c == '|') - this.path = h; - else - this.path = "/" + h; + this.path = url.substring(q); } } this.userInfo = null; diff --git a/source/net/yacy/crawler/HostQueue.java b/source/net/yacy/crawler/HostQueue.java index 9d58bce4a..2b1e050c0 100644 --- a/source/net/yacy/crawler/HostQueue.java +++ b/source/net/yacy/crawler/HostQueue.java @@ -78,7 +78,7 @@ public class HostQueue implements Balancer { final boolean exceed134217727) { this.onDemand = onDemand; this.exceed134217727 = exceed134217727; - this.hostName = hostName; + this.hostName = (hostName == null) ? "localhost" : hostName; // might be null (file://) but hostqueue needs a name (for queue file) this.port = port; this.hostPath = new File(hostsPath, this.hostName + "." 
+ this.port); init(); @@ -101,6 +101,9 @@ public class HostQueue implements Balancer { private final void init() { try { + if (this.hostName == null) + this.hostHash=""; + else this.hostHash = DigestURL.hosthash(this.hostName, this.port); } catch (MalformedURLException e) { this.hostHash = "";