From 3c4c69adea242d3a61cf1e6a49d64ddb70973560 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 29 Jun 2015 02:02:01 +0200
Subject: [PATCH] fix for

- bad regex computation for crawl start from file (limitation on domain did
  not work)
- servlet error when starting crawl from a large list of urls
---
 htroot/Crawler_p.java                        | 62 ++++++++++++-------
 .../net/yacy/crawler/data/CrawlProfile.java  | 15 +++--
 2 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 7f2862ba5..d8fbcd50d 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -254,11 +254,12 @@ public class Crawler_p {
         } else {
             crawlName = crawlingFile.getName();
         }
-        if (crawlName.length() > 256) {
+        if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
+        if (crawlName.length() > 64) {
+            crawlName = "crawl_for_" + rootURLs.size() + "_start_points_" + Integer.toHexString(crawlName.hashCode());
             int p = crawlName.lastIndexOf(',');
             if (p >= 8) crawlName = crawlName.substring(0, p);
         }
-        if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
         if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
 
         // delete old robots entries
@@ -466,6 +467,40 @@ public class Crawler_p {
 
         int timezoneOffset = post.getInt("timezoneOffset", 0);
 
+        // in case that we crawl from a file, load that file and re-compute mustmatch pattern
+        List<AnchorURL> hyperlinks_from_file = null;
+        if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) {
+            final String crawlingFileContent = post.get("crawlingFile$file", "");
+            try {
+                // check if the crawl filter works correctly
+                final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
+                final Writer writer = new TransformerWriter(null, null, scraper, null, false);
+                if (crawlingFile != null && crawlingFile.exists()) {
+                    FileUtils.copy(new FileInputStream(crawlingFile), writer);
+                } else {
+                    FileUtils.copy(crawlingFileContent, writer);
+                }
+                writer.close();
+
+                // get links and generate filter
+                hyperlinks_from_file = scraper.getAnchors();
+                if (newcrawlingdepth > 0) {
+                    if (fullDomain) {
+                        newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
+                    } else if (subPath) {
+                        newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
+                    }
+                }
+            } catch (final Exception e) {
+                // mist
+                prop.put("info", "7"); // Error with file
+                prop.putHTML("info_crawlingStart", crawlingFileName);
+                prop.putHTML("info_error", e.getMessage());
+                ConcurrentLog.logException(e);
+            }
+            sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+        }
+
         // prepare a new crawling profile
         final CrawlProfile profile;
         byte[] handle;
@@ -578,32 +613,17 @@ public class Crawler_p {
                     ConcurrentLog.logException(e);
                 }
             } else if ("file".equals(crawlingMode)) {
-                if (post.containsKey("crawlingFile") && crawlingFile != null) {
-                    final String crawlingFileContent = post.get("crawlingFile$file", "");
+                if (post.containsKey("crawlingFile") && crawlingFile != null && hyperlinks_from_file != null) {
                     try {
-                        // check if the crawl filter works correctly
-                        Pattern.compile(newcrawlingMustMatch);
-                        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
-                        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-                        if (crawlingFile != null && crawlingFile.exists()) {
-                            FileUtils.copy(new FileInputStream(crawlingFile), writer);
-                        } else {
-                            FileUtils.copy(crawlingFileContent, writer);
-                        }
-                        writer.close();
-
-                        // get links and generate filter
-                        final List<AnchorURL> hyperlinks = scraper.getAnchors();
                         if (newcrawlingdepth > 0) {
                             if (fullDomain) {
-                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks);
+                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
                             } else if (subPath) {
-                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks);
+                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
                             }
                         }
-                        sb.crawler.putActive(handle, profile);
-                        sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, profile.timezoneOffset());
+                        sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                     } catch (final PatternSyntaxException e) {
                         prop.put("info", "4"); // crawlfilter does not match url
                         prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 0030790ee..ad1f8d9d8 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -691,11 +691,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     }
 
     public static String siteFilter(final Collection<? extends MultiProtocolURL> urls) {
-        LinkedHashSet<String> filters = new LinkedHashSet<String>(); // first collect in a set to eliminate doubles
-        for (final MultiProtocolURL url: urls) filters.add(mustMatchFilterFullDomain(url));
         final StringBuilder filter = new StringBuilder();
-        for (final String urlfilter: filters) filter.append('|').append(urlfilter);
-        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+        filter.append("(smb|ftp|https?)://(www.)?(");
+        for (final MultiProtocolURL url: urls) {
+            String host = url.getHost();
+            if (host == null) continue;
+            if (host.startsWith("www.")) host = host.substring(4);
+            filter.append(Pattern.quote(host.toLowerCase())).append(".*|");
+        }
+        filter.setCharAt(filter.length() - 1, ')');
+        return filter.toString();
     }
 
     public static String mustMatchFilterFullDomain(final MultiProtocolURL url) {
@@ -721,7 +726,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (host.startsWith("www.")) host = host.substring(4);
         String protocol = url.getProtocol();
         if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
-        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(url.getPath()).append(".*").toString();
+        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase())).append(url.getPath()).append(".*").toString();
     }
 
     public boolean isPushCrawlProfile() {
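
For illustration, here is a minimal standalone sketch of the must-match pattern that the
rewritten CrawlProfile.siteFilter builds. It is not part of the patch: it uses plain host
strings instead of YaCy's MultiProtocolURL objects, and the host names are made-up examples.
Like the patched code it lower-cases hosts and strips a leading "www." before quoting them.

    import java.util.Arrays;
    import java.util.List;
    import java.util.regex.Pattern;

    public class SiteFilterSketch {

        // Mirrors the regex construction of the patched CrawlProfile.siteFilter,
        // but takes plain host names instead of MultiProtocolURL objects.
        // Note: like the patched method, this assumes at least one non-null host.
        static String siteFilter(final List<String> hosts) {
            final StringBuilder filter = new StringBuilder();
            filter.append("(smb|ftp|https?)://(www.)?(");
            for (String host : hosts) {
                if (host == null) continue;
                if (host.startsWith("www.")) host = host.substring(4); // normalize away a leading "www."
                filter.append(Pattern.quote(host.toLowerCase())).append(".*|");
            }
            filter.setCharAt(filter.length() - 1, ')'); // turn the trailing '|' into the closing ')'
            return filter.toString();
        }

        public static void main(String[] args) {
            // hypothetical start points, e.g. hosts extracted from an uploaded link file
            final String mustMatch = siteFilter(Arrays.asList("www.example.org", "yacy.net"));
            System.out.println(mustMatch);
            // prints: (smb|ftp|https?)://(www.)?(\Qexample.org\E.*|\Qyacy.net\E.*)

            final Pattern p = Pattern.compile(mustMatch);
            System.out.println(p.matcher("https://www.example.org/page.html").matches()); // true
            System.out.println(p.matcher("http://other.example.com/").matches());         // false
        }
    }

Because the filter is a single alternation over Pattern.quote()-ed host names rather than a
concatenation of one full per-URL filter expression, it stays compact even when the uploaded
file contains many start URLs, and the lower-casing keeps the domain limitation effective for
hosts written in mixed case.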