Local files can be crawled (intranet mode). file: URL parsing was fixed according to RFC 1738 (for Unix and Windows):
on Windows, URLs look like file:///c:/tmp or file://localhost/c:/tmp;
on Linux, URLs look like file:///tmp or file://localhost/tmp.
The host is ignored and the path must be absolute.
pull/1/head
reger 11 years ago
parent 9b4282344b
commit ca5437dd50

@ -247,9 +247,13 @@ public class Crawler_p {
if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr; if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
// delete old robots entries // delete old robots entries
for (DigestURL ru: rootURLs) { for (DigestURL ru : rootURLs) {
sb.robots.delete(ru); sb.robots.delete(ru);
try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {} try {
if (ru.getHost() != null) { // might be null for file://
Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());
}
} catch (IOException e) {}
} }
try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all. try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
@ -645,7 +649,7 @@ public class Crawler_p {
prop.put("crawlProfilesShow_linkstructure", 0); prop.put("crawlProfilesShow_linkstructure", 0);
if (count > 0) { if (count > 0) {
// collect the host names for 'wide' crawls which can be visualized // collect the host names for 'wide' crawls which can be visualized
boolean showLinkstructure = hosts.length() > 0; boolean showLinkstructure = hosts.length() > 0 && !hosts.contains("file:");
if (showLinkstructure) { if (showLinkstructure) {
StringBuilder q = new StringBuilder(); StringBuilder q = new StringBuilder();
hosts = hosts.substring(1); hosts = hosts.substring(1);

@ -211,26 +211,25 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
this.searchpart = null; this.searchpart = null;
this.anchor = null; this.anchor = null;
} else if (this.protocol.equals("file")) { } else if (this.protocol.equals("file")) {
// parse file url // parse file url (RFC 1738 file://host.domain/path file://localhost/path file:///path)
// example unix file://localhost/etc/fstab
// file:///etc/fstab
// example windows file://localhost/c|/WINDOWS/clock.avi
// file:///c|/WINDOWS/clock.avi
// file://localhost/c:/WINDOWS/clock.avi
// network file://hostname/path/to/the%20file.txt
// local file:///c:/path/to/the%20file.txt
final String h = url.substring(p + 1); final String h = url.substring(p + 1);
if (h.startsWith("//")) { this.host = null; // host is ignored on file: protocol
if (h.startsWith("///")) { //absolute local file path
// no host given // no host given
this.host = null; this.path = h.substring(2); // "/path" or "/c:/path"
this.path = h.substring(2); } else { // "//host/path" or "//host/c:/path"
} else { int q = url.indexOf('/', p + 3);
this.host = null; if (q < 0) {
if (h.length() > 0 && h.charAt(0) == '/') { this.path = "/";
final char c = h.charAt(2);
if (c == ':' || c == '|')
this.path = h.substring(1);
else
this.path = h;
} else { } else {
final char c = h.charAt(1); this.path = url.substring(q);
if (c == ':' || c == '|')
this.path = h;
else
this.path = "/" + h;
} }
} }
this.userInfo = null; this.userInfo = null;

@ -78,7 +78,7 @@ public class HostQueue implements Balancer {
final boolean exceed134217727) { final boolean exceed134217727) {
this.onDemand = onDemand; this.onDemand = onDemand;
this.exceed134217727 = exceed134217727; this.exceed134217727 = exceed134217727;
this.hostName = hostName; this.hostName = (hostName == null) ? "localhost" : hostName; // might be null (file://) but hostqueue needs a name (for queue file)
this.port = port; this.port = port;
this.hostPath = new File(hostsPath, this.hostName + "." + this.port); this.hostPath = new File(hostsPath, this.hostName + "." + this.port);
init(); init();
@ -101,6 +101,9 @@ public class HostQueue implements Balancer {
private final void init() { private final void init() {
try { try {
if (this.hostName == null)
this.hostHash="";
else
this.hostHash = DigestURL.hosthash(this.hostName, this.port); this.hostHash = DigestURL.hosthash(this.hostName, this.port);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
this.hostHash = ""; this.hostHash = "";

Loading…
Cancel
Save