From e3aa05b9dde805d5c501caff7434fb559ac050b0 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Wed, 11 Jul 2012 23:18:57 +0200
Subject: [PATCH] added creation of subpath pattern when crawl start is 'from file'

---
 htroot/Crawler_p.java | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index f0f6b9d7b..1b29d3cd3 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -466,7 +466,13 @@ public class Crawler_p {
 
             // get links and generate filter
             final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
-            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
+            if (newcrawlingdepth > 0) {
+                if (fullDomain) {
+                    newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
+                } else if (subPath) {
+                    newcrawlingMustMatch = subpathFilter(hyperlinks.keySet());
+                }
+            }
 
             final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
             final CrawlProfile profile = new CrawlProfile(
@@ -681,4 +687,16 @@ public class Crawler_p {
         }
         return filter.length() > 0 ? filter.substring(1) : "";
     }
+
+    private static String subpathFilter(final Set<MultiProtocolURI> uris) {
+        final StringBuilder filter = new StringBuilder();
+        final Set<String> filterSet = new HashSet<String>();
+        for (final MultiProtocolURI uri: uris) {
+            filterSet.add(new StringBuilder().append(uri.toNormalform(true, false)).append(".*").toString());
+        }
+        for (final String element : filterSet) {
+            filter.append('|').append(element);
+        }
+        return filter.length() > 0 ? filter.substring(1) : "";
+    }
 }
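
For illustration, here is a minimal standalone sketch of the must-match pattern that the new subpathFilter() builds: each start URL contributes one "<url>.*" alternative, and the alternatives are joined with '|'. The sketch uses plain String URLs in place of MultiProtocolURI.toNormalform(true, false); the class name, the example URLs, and the use of LinkedHashSet (only to make the printed output deterministic, whereas the patch uses an unordered HashSet) are illustrative assumptions and not part of the YaCy code.

    import java.util.Arrays;
    import java.util.LinkedHashSet;
    import java.util.Set;

    public class SubpathFilterSketch {

        // Same concatenation logic as subpathFilter() above, but over
        // already-normalized String URLs instead of MultiProtocolURI objects.
        static String subpathFilter(final Set<String> normalizedUris) {
            final StringBuilder filter = new StringBuilder();
            final Set<String> filterSet = new LinkedHashSet<String>();
            for (final String uri : normalizedUris) {
                filterSet.add(uri + ".*");           // one "<url>.*" pattern per start URL
            }
            for (final String element : filterSet) {
                filter.append('|').append(element);  // join the alternatives with '|'
            }
            return filter.length() > 0 ? filter.substring(1) : "";
        }

        public static void main(final String[] args) {
            final Set<String> links = new LinkedHashSet<String>(Arrays.asList(
                    "http://example.org/docs/",
                    "http://example.org/blog/"));
            // prints: http://example.org/docs/.*|http://example.org/blog/.*
            System.out.println(subpathFilter(links));
        }
    }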