Local files can now be crawled (intranet mode); file URL parsing fixed according to RFC 1738 (for Unix and Windows).
On Windows, use e.g. file:///c:/tmp or file://localhost/c:/tmp
On Linux, use e.g. file:///tmp or file://localhost/tmp
The host part is ignored and the path must be absolute.
pull/1/head
reger 11 years ago
parent 9b4282344b
commit ca5437dd50

@ -247,9 +247,13 @@ public class Crawler_p {
if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
// delete old robots entries
for (DigestURL ru: rootURLs) {
for (DigestURL ru : rootURLs) {
sb.robots.delete(ru);
try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
try {
if (ru.getHost() != null) { // might be null for file://
Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());
}
} catch (IOException e) {}
}
try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
@ -645,7 +649,7 @@ public class Crawler_p {
prop.put("crawlProfilesShow_linkstructure", 0);
if (count > 0) {
// collect the host names for 'wide' crawls which can be visualized
boolean showLinkstructure = hosts.length() > 0;
boolean showLinkstructure = hosts.length() > 0 && !hosts.contains("file:");
if (showLinkstructure) {
StringBuilder q = new StringBuilder();
hosts = hosts.substring(1);

@ -211,26 +211,25 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
this.searchpart = null;
this.anchor = null;
} else if (this.protocol.equals("file")) {
// parse file url
// parse file url (RFC 1738 file://host.domain/path file://localhost/path file:///path)
// example unix file://localhost/etc/fstab
// file:///etc/fstab
// example windows file://localhost/c|/WINDOWS/clock.avi
// file:///c|/WINDOWS/clock.avi
// file://localhost/c:/WINDOWS/clock.avi
// network file://hostname/path/to/the%20file.txt
// local file:///c:/path/to/the%20file.txt
final String h = url.substring(p + 1);
if (h.startsWith("//")) {
this.host = null; // host is ignored on file: protocol
if (h.startsWith("///")) { //absolute local file path
// no host given
this.host = null;
this.path = h.substring(2);
} else {
this.host = null;
if (h.length() > 0 && h.charAt(0) == '/') {
final char c = h.charAt(2);
if (c == ':' || c == '|')
this.path = h.substring(1);
else
this.path = h;
this.path = h.substring(2); // "/path" or "/c:/path"
} else { // "//host/path" or "//host/c:/path"
int q = url.indexOf('/', p + 3);
if (q < 0) {
this.path = "/";
} else {
final char c = h.charAt(1);
if (c == ':' || c == '|')
this.path = h;
else
this.path = "/" + h;
this.path = url.substring(q);
}
}
this.userInfo = null;

@ -78,7 +78,7 @@ public class HostQueue implements Balancer {
final boolean exceed134217727) {
this.onDemand = onDemand;
this.exceed134217727 = exceed134217727;
this.hostName = hostName;
this.hostName = (hostName == null) ? "localhost" : hostName; // might be null (file://) but hostqueue needs a name (for queue file)
this.port = port;
this.hostPath = new File(hostsPath, this.hostName + "." + this.port);
init();
@ -101,6 +101,9 @@ public class HostQueue implements Balancer {
private final void init() {
try {
if (this.hostName == null)
this.hostHash="";
else
this.hostHash = DigestURL.hosthash(this.hostName, this.port);
} catch (MalformedURLException e) {
this.hostHash = "";

Loading…
Cancel
Save