added creation of subpath pattern when crawl start is 'from file'

pull/1/head
Michael Peter Christen 13 years ago
parent 0cbda0b2b8
commit e3aa05b9dd

@ -466,7 +466,13 @@ public class Crawler_p {
// get links and generate filter
final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
if (newcrawlingdepth > 0) {
if (fullDomain) {
newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
} else if (subPath) {
newcrawlingMustMatch = subpathFilter(hyperlinks.keySet());
}
}
final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
final CrawlProfile profile = new CrawlProfile(
@ -681,4 +687,16 @@ public class Crawler_p {
}
return filter.length() > 0 ? filter.substring(1) : "";
}
/**
 * Builds a must-match regular expression from a set of URLs.
 *
 * Each URL contributes one alternative of the form
 * {@code <quoted normal form>.*}; the alternatives are joined with
 * {@code '|'}. Duplicate alternatives are collapsed.
 *
 * The normal form is escaped with {@link java.util.regex.Pattern#quote(String)}
 * so that characters which are legal in URLs but special in regular
 * expressions (for example {@code ?}, {@code +}, {@code .}) are matched
 * literally. Without that, a URL carrying a query string would not match
 * the filter generated from itself.
 *
 * @param uris the URLs to derive the filter from; may be empty
 * @return the alternation of all per-URL patterns, or the empty string if
 *         {@code uris} is empty
 */
private static String subpathFilter(final Set<MultiProtocolURI> uris) {
    // collect in a set first to eliminate duplicate alternatives
    final Set<String> filterSet = new HashSet<String>();
    for (final MultiProtocolURI uri : uris) {
        // NOTE(review): the meaning of the two boolean flags of toNormalform is
        // not visible here; they are passed through unchanged.
        final String normalform = uri.toNormalform(true, false);
        filterSet.add(java.util.regex.Pattern.quote(normalform) + ".*");
    }

    final StringBuilder filter = new StringBuilder();
    for (final String element : filterSet) {
        // separator goes between alternatives only, never in front of the first
        if (filter.length() > 0) {
            filter.append('|');
        }
        filter.append(element);
    }
    return filter.toString();
}
}

Loading…
Cancel
Save