fix for crawl start filter

pull/1/head
Michael Peter Christen 12 years ago
parent abebb3b124
commit 53789555b9

@@ -172,13 +172,6 @@ public class Crawler_p {
             DigestURI crawlingStartURL = new DigestURI(crawlingStart);
             rootURLs.add(crawlingStartURL);
             crawlName += crawlingStartURL.getHost() + "_";
-            if (fullDomain) {
-                newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
-                if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
-            }
-            if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
-                newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
-            }
             if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
         } catch (MalformedURLException e) {
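
Note on the removed block: it spliced raw URL strings straight into the must-match regex, so metacharacters in a path ('(', '+', '.') were parsed as regex syntax and could yield an invalid or over-broad filter. The CrawlProfile helpers that take over this job (last hunk) quote those literals. A minimal sketch of the difference, assuming only java.util.regex; the URL and class name are made up:

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    public class QuoteDemo {
        public static void main(String[] args) {
            // hypothetical crawl start with a regex metacharacter in the path
            String prefix = "https://example.com/wiki/C_(programming/";
            try {
                // raw concatenation, as in the removed subPath branch
                Pattern.compile(prefix + ".*");
            } catch (PatternSyntaxException e) {
                System.out.println("unquoted filter is invalid: " + e.getDescription());
            }
            // Pattern.quote wraps the prefix in \Q...\E, making every character literal
            Pattern p = Pattern.compile(Pattern.quote(prefix) + ".*");
            System.out.println(p.matcher(prefix + "page.html").matches()); // true
        }
    }
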
@@ -318,16 +311,24 @@ public class Crawler_p {
         if ((fullDomain || subPath) && newcrawlingdepth > 0) {
             String siteFilter = ".*";
             if (fullDomain) {
-                siteFilter = siteFilter(rootURLs);
+                siteFilter = CrawlProfile.siteFilter(rootURLs);
             } else if (subPath) {
-                siteFilter = subpathFilter(rootURLs);
+                siteFilter = CrawlProfile.subpathFilter(rootURLs);
             }
-            newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch) ? siteFilter : "(?=(" + newcrawlingMustMatch + "))(" + siteFilter + ")";
+            if (CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) {
+                newcrawlingMustMatch = siteFilter;
+            } else if (!CrawlProfile.MATCH_ALL_STRING.equals(siteFilter)) {
+                // combine both
+                newcrawlingMustMatch = "(" + newcrawlingMustMatch + ")|(" + siteFilter + ")";
+            }
         }
         // check if the crawl filter works correctly
         try {
-            Pattern.compile(newcrawlingMustMatch);
+            Pattern mmp = Pattern.compile(newcrawlingMustMatch);
+            for (DigestURI u: rootURLs) {
+                assert mmp.matcher(u.toNormalform(true, true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true, true);
+            }
         } catch (final PatternSyntaxException e) {
             prop.put("info", "4"); // crawlfilter does not match url
             prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
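
This hunk is the core of the fix. The removed one-liner combined a user-supplied must-match filter with the derived site filter via a lookahead, (?=(user))(site), which demands that a URL satisfy both parts; a start URL that fails the user filter then kills the crawl before it begins, which is presumably the bug behind the commit title. The new block ORs the two patterns instead, and the added assert checks every root URL against the compiled filter. A sketch of the semantic difference, with hypothetical filter strings:

    import java.util.regex.Pattern;

    public class CombineDemo {
        public static void main(String[] args) {
            String user = ".*\\.html";                        // hypothetical user must-match filter
            String site = "https?://(www.)?example.com.*";    // hypothetical derived site filter
            String and = "(?=(" + user + "))(" + site + ")";  // removed form: URL must satisfy both
            String or = "(" + user + ")|(" + site + ")";      // new form: either part suffices
            String start = "https://www.example.com/";        // a crawl start URL without ".html"
            System.out.println(Pattern.matches(and, start));  // false - user filter rejects it
            System.out.println(Pattern.matches(or, start));   // true  - site filter branch matches
        }
    }
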
@@ -470,9 +471,9 @@ public class Crawler_p {
         final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
         if (newcrawlingdepth > 0) {
             if (fullDomain) {
-                newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
+                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks.keySet());
             } else if (subPath) {
-                newcrawlingMustMatch = subpathFilter(hyperlinks.keySet());
+                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks.keySet());
             }
         }
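
Here the sitelist branch derives the filter from the scraped hyperlinks instead of the user-entered root URLs, using the same CrawlProfile helpers. For subPath mode, mustMatchSubpath (defined in the last hunk) truncates each URL after its final '/' and quotes the prefix. A standalone sketch of that behavior; the helper below mirrors the CrawlProfile code but operates on plain strings:

    import java.util.regex.Pattern;

    public class SubpathDemo {
        // mirrors mustMatchSubpath on a plain string: cut after the last '/',
        // quote the literal prefix, and allow anything beneath it
        static String subpathFilter(String url) {
            String u = url;
            if (!u.endsWith("/")) { int p = u.lastIndexOf('/'); if (p > 0) u = u.substring(0, p + 1); }
            return Pattern.quote(u) + ".*";
        }

        public static void main(String[] args) {
            String filter = subpathFilter("https://example.com/docs/index.html");
            System.out.println(filter); // \Qhttps://example.com/docs/\E.*
            System.out.println(Pattern.matches(filter, "https://example.com/docs/page2.html")); // true
            System.out.println(Pattern.matches(filter, "https://example.com/other/"));          // false
        }
    }
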
@@ -650,30 +651,4 @@ public class Crawler_p {
         sb.setPerformance(wantedPPM);
     }
-    private static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
-        final StringBuilder filter = new StringBuilder();
-        final Set<String> filterSet = new HashSet<String>();
-        for (final MultiProtocolURI uri: uris) {
-            filterSet.add(new StringBuilder().append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*").toString());
-            if (!uri.getHost().startsWith("www.")) {
-                filterSet.add(new StringBuilder().append(uri.getProtocol()).append("://www.").append(uri.getHost()).append(".*").toString());
-            }
-        }
-        for (final String element : filterSet) {
-            filter.append('|').append(element);
-        }
-        return filter.length() > 0 ? filter.substring(1) : "";
-    }
-    private static String subpathFilter(final Set<? extends MultiProtocolURI> uris) {
-        final StringBuilder filter = new StringBuilder();
-        final Set<String> filterSet = new HashSet<String>();
-        for (final MultiProtocolURI uri: uris) {
-            filterSet.add(new StringBuilder().append(uri.toNormalform(true, false)).append(".*").toString());
-        }
-        for (final String element : filterSet) {
-            filter.append('|').append(element);
-        }
-        return filter.length() > 0 ? filter.substring(1) : "";
-    }
 }
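
The two private helpers deleted here live on as the public CrawlProfile.siteFilter and CrawlProfile.subpathFilter in the hunk below, with three behavioral changes: literals are wrapped in Pattern.quote, the explicit www./non-www. pair is collapsed into a single (www.)? group, and the empty-set fallback changes from "" to CrawlProfile.MATCH_ALL_STRING (".*"). The old empty-string fallback was itself broken, since an empty pattern matches only empty input:

    import java.util.regex.Pattern;

    public class FallbackDemo {
        public static void main(String[] args) {
            String url = "https://example.com/";
            // the removed helpers returned "" for an empty URL set; as a
            // must-match filter, an empty pattern matches no URL at all
            System.out.println(Pattern.matches("", url));   // false
            // the replacements fall back to MATCH_ALL_STRING (".*") instead
            System.out.println(Pattern.matches(".*", url)); // true
        }
    }
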

@@ -527,23 +527,31 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         return System.currentTimeMillis() - (60000L * oldTimeMinutes);
     }
-    public static String mustMatchFilterFullDomain(final MultiProtocolURI crawlingStartURL) {
-        if (crawlingStartURL.isFile()) {
-            return "file://" + crawlingStartURL.getPath() + ".*";
-        } else if (crawlingStartURL.isSMB()) {
-            return "smb://" + crawlingStartURL.getHost() + ".*";
-        } else if (crawlingStartURL.isFTP()) {
-            return "ftp://" + crawlingStartURL.getHost() + ".*";
-        } else {
-            final String host = crawlingStartURL.getHost();
-            if (host.startsWith("www.")) {
-                return "https?://" + crawlingStartURL.getHost() + ".*";
-            }
-            // if the www is not given we accept that also
-            return "https?://(?:www.)?" + crawlingStartURL.getHost() + ".*";
-        }
-    }
+    public static String mustMatchFilterFullDomain(final MultiProtocolURI uri) {
+        String host = uri.getHost();
+        if (host.startsWith("www.")) host = host.substring(4);
+        String protocol = uri.getProtocol();
+        if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
+        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
+    }
+    public static String mustMatchSubpath(final MultiProtocolURI uri) {
+        String u = uri.toNormalform(true, true);
+        if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
+        return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
+    }
+    public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
+        final StringBuilder filter = new StringBuilder();
+        for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
+        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+    }
+    public static String subpathFilter(final Set<? extends MultiProtocolURI> uris) {
+        final StringBuilder filter = new StringBuilder();
+        for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchSubpath(uri));
+        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+    }
     public static final Set<String> ignoreNames = new HashSet<String>();
     static {
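
For reference, the rewritten mustMatchFilterFullDomain folds the old protocol special cases into one expression: https?+ covers http and https (?+ is a possessive optional, harmless here because ':' can never be consumed as 's'), (www.)? accepts the host with or without the www. prefix the old version handled separately, and Pattern.quote makes the host literal. A string-level sketch of the construction; host/protocol extraction from MultiProtocolURI is assumed and the class name is made up:

    import java.util.regex.Pattern;

    public class FullDomainDemo {
        // mirrors the new mustMatchFilterFullDomain for http(s) URLs on plain strings
        static String fullDomainFilter(String protocol, String host) {
            if (host.startsWith("www.")) host = host.substring(4);
            if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
            return protocol + "://(www.)?" + Pattern.quote(host) + ".*";
        }

        public static void main(String[] args) {
            String filter = fullDomainFilter("http", "www.example.com");
            System.out.println(filter); // https?+://(www.)?\Qexample.com\E.*
            System.out.println(Pattern.matches(filter, "http://example.com/a"));      // true
            System.out.println(Pattern.matches(filter, "https://www.example.com/b")); // true
            System.out.println(Pattern.matches(filter, "https://other.net/"));        // false
        }
    }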
