fixed default must-match filter for full domain crawls - the old filter was too restrictive and did not allow intranet crawls
pull/1/head
Michael Christen 13 years ago
parent 3e61287326
commit 22f05c83ff

@@ -482,16 +482,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (crawlingStartURL.isFile()) {
             return "file://" + crawlingStartURL.getPath() + ".*";
         } else if (crawlingStartURL.isSMB()) {
-            return "smb://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
+            return "smb://" + crawlingStartURL.getHost() + ".*";
         } else if (crawlingStartURL.isFTP()) {
-            return "ftp://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
+            return "ftp://" + crawlingStartURL.getHost() + ".*";
         } else {
             final String host = crawlingStartURL.getHost();
             if (host.startsWith("www.")) {
-                return "https?://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
+                return "https?://" + crawlingStartURL.getHost() + ".*";
             } else {
                 // if the www is not given we accept that also
-                return "https?://(?:www.)?" + crawlingStartURL.getHost() + "(?:/|$)+.*";
+                return "https?://(?:www.)?" + crawlingStartURL.getHost() + ".*";
             }
         }
     }
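
A minimal standalone sketch (not from the YaCy sources; the host name "intranet" and the URLs are made-up examples) of why the old default suffix was too strict: with "(?:/|$)+.*" the filter only accepts URLs where the host is followed directly by a slash or the end of the string, so an intranet start point with an explicit port never matches, while the plain ".*" suffix accepts it.

    import java.util.regex.Pattern;

    // Illustration only: compares the old and new default must-match filters
    // for a hypothetical intranet start host that listens on a non-default port.
    public class MustMatchFilterDemo {
        public static void main(String[] args) {
            String url = "http://intranet:8080/wiki/Start";           // assumed intranet URL

            Pattern oldFilter = Pattern.compile("https?://(?:www.)?intranet(?:/|$)+.*");
            Pattern newFilter = Pattern.compile("https?://(?:www.)?intranet.*");

            System.out.println(oldFilter.matcher(url).matches());     // false - the ':8080' breaks the match
            System.out.println(newFilter.matcher(url).matches());     // true  - the full domain crawl proceeds
        }
    }

The looser ".*" suffix also admits URLs such as "http://intranet.example.com/...", which is the trade-off accepted here in favour of making intranet crawls work at all.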

@@ -414,13 +414,13 @@ public final class CrawlStacker {
         // filter with must-match for URLs
         if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
             if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return "url does not match must-match filter";
+            return "url does not match must-match filter " + profile.urlMustMatchPattern().toString();
         }
         // filter with must-not-match for URLs
         if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
             if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
-            return "url matches must-not-match filter";
+            return "url matches must-not-match filter " + profile.urlMustNotMatchPattern().toString();
         }
         // deny cgi
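
A hedged illustration of what the extended rejection reason buys (the helper, the profile pattern and the URL below are assumptions for the sketch, not code from CrawlStacker): because the returned string now carries the offending pattern, the reason shown for a rejected URL identifies the responsible filter even when fine logging is disabled.

    import java.util.regex.Pattern;

    // Hypothetical stand-in for the must-match check; it only mimics the shape
    // of the reason string returned after this commit.
    public class RejectReasonDemo {
        static String checkMustMatch(String urlstring, Pattern mustMatch, int depth) {
            if ((depth > 0) && !mustMatch.matcher(urlstring).matches()) {
                // the reason now includes the pattern itself
                return "url does not match must-match filter " + mustMatch.toString();
            }
            return null; // accepted
        }

        public static void main(String[] args) {
            Pattern filter = Pattern.compile("https?://(?:www.)?intranet.*"); // assumed profile filter
            System.out.println(checkMustMatch("http://other-host/page", filter, 1));
            // -> url does not match must-match filter https?://(?:www.)?intranet.*
        }
    }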
