From 3c4c69adea242d3a61cf1e6a49d64ddb70973560 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 29 Jun 2015 02:02:01 +0200
Subject: [PATCH] fix for

- bad regex computation for crawl start from file (limitation on domain did
  not work)
- servlet error when starting crawl from a large list of urls
---
 htroot/Crawler_p.java                        | 62 ++++++++++++-------
 .../net/yacy/crawler/data/CrawlProfile.java  | 15 +++--
 2 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 7f2862ba5..d8fbcd50d 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -254,11 +254,12 @@ public class Crawler_p {
         } else {
             crawlName = crawlingFile.getName();
         }
-        if (crawlName.length() > 256) {
+        if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
+        if (crawlName.length() > 64) {
+            crawlName = "crawl_for_" + rootURLs.size() + "_start_points_" + Integer.toHexString(crawlName.hashCode());
             int p = crawlName.lastIndexOf(',');
             if (p >= 8) crawlName = crawlName.substring(0, p);
         }
-        if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
         if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
 
         // delete old robots entries
@@ -466,6 +467,40 @@ public class Crawler_p {
 
         int timezoneOffset = post.getInt("timezoneOffset", 0);
 
+        // in case that we crawl from a file, load that file and re-compute mustmatch pattern
+        List<AnchorURL> hyperlinks_from_file = null;
+        if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) {
+            final String crawlingFileContent = post.get("crawlingFile$file", "");
+            try {
+                // check if the crawl filter works correctly
+                final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
+                final Writer writer = new TransformerWriter(null, null, scraper, null, false);
+                if (crawlingFile != null && crawlingFile.exists()) {
+                    FileUtils.copy(new FileInputStream(crawlingFile), writer);
+                } else {
+                    FileUtils.copy(crawlingFileContent, writer);
+                }
+                writer.close();
+
+                // get links and generate filter
+                hyperlinks_from_file = scraper.getAnchors();
+                if (newcrawlingdepth > 0) {
+                    if (fullDomain) {
+                        newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
+                    } else if (subPath) {
+                        newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
+                    }
+                }
+            } catch (final Exception e) {
+                // mist
+                prop.put("info", "7"); // Error with file
+                prop.putHTML("info_crawlingStart", crawlingFileName);
+                prop.putHTML("info_error", e.getMessage());
+                ConcurrentLog.logException(e);
+            }
+            sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+        }
+
         // prepare a new crawling profile
         final CrawlProfile profile;
         byte[] handle;
@@ -578,32 +613,17 @@ public class Crawler_p {
                     ConcurrentLog.logException(e);
                 }
             } else if ("file".equals(crawlingMode)) {
-                if (post.containsKey("crawlingFile") && crawlingFile != null) {
-                    final String crawlingFileContent = post.get("crawlingFile$file", "");
+                if (post.containsKey("crawlingFile") && crawlingFile != null && hyperlinks_from_file != null) {
                     try {
-                        // check if the crawl filter works correctly
-                        Pattern.compile(newcrawlingMustMatch);
-                        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
-                        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-                        if (crawlingFile != null && crawlingFile.exists()) {
-                            FileUtils.copy(new FileInputStream(crawlingFile), writer);
-                        } else {
-                            FileUtils.copy(crawlingFileContent, writer);
-                        }
-                        writer.close();
-
-                        // get links and generate filter
-                        final List<AnchorURL> hyperlinks = scraper.getAnchors();
                         if (newcrawlingdepth > 0) {
                             if (fullDomain) {
-                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks);
+                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
                             } else if (subPath) {
-                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks);
+                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
                             }
                         }
-                        sb.crawler.putActive(handle, profile);
-                        sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, profile.timezoneOffset());
+                        sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                     } catch (final PatternSyntaxException e) {
                         prop.put("info", "4"); // crawlfilter does not match url
                         prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 0030790ee..ad1f8d9d8 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -691,11 +691,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     }
 
     public static String siteFilter(final Collection<? extends MultiProtocolURL> urls) {
-        LinkedHashSet<String> filters = new LinkedHashSet<String>(); // first collect in a set to eliminate doubles
-        for (final MultiProtocolURL url: urls) filters.add(mustMatchFilterFullDomain(url));
         final StringBuilder filter = new StringBuilder();
-        for (final String urlfilter: filters) filter.append('|').append(urlfilter);
-        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+        filter.append("(smb|ftp|https?)://(www.)?(");
+        for (final MultiProtocolURL url: urls) {
+            String host = url.getHost();
+            if (host == null) continue;
+            if (host.startsWith("www.")) host = host.substring(4);
+            filter.append(Pattern.quote(host.toLowerCase())).append(".*|");
+        }
+        filter.setCharAt(filter.length() - 1, ')');
+        return filter.toString();
     }
 
     public static String mustMatchFilterFullDomain(final MultiProtocolURL url) {
@@ -721,7 +726,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (host.startsWith("www.")) host = host.substring(4);
         String protocol = url.getProtocol();
         if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
-        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(url.getPath()).append(".*").toString();
+        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase())).append(url.getPath()).append(".*").toString();
     }
 
     public boolean isPushCrawlProfile() {
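
For illustration, here is a minimal standalone sketch of the must-match pattern that the
rewritten CrawlProfile.siteFilter builds. It is not part of the patch: it uses plain host
strings instead of YaCy's MultiProtocolURL objects, and the host names are made-up examples.
Like the patched code it lower-cases hosts and strips a leading "www." before quoting them.

    import java.util.Arrays;
    import java.util.List;
    import java.util.regex.Pattern;

    public class SiteFilterSketch {

        // Mirrors the regex construction of the patched CrawlProfile.siteFilter,
        // but takes plain host names instead of MultiProtocolURL objects.
        // Note: like the patched method, this assumes at least one non-null host.
        static String siteFilter(final List<String> hosts) {
            final StringBuilder filter = new StringBuilder();
            filter.append("(smb|ftp|https?)://(www.)?(");
            for (String host : hosts) {
                if (host == null) continue;
                if (host.startsWith("www.")) host = host.substring(4); // normalize away a leading "www."
                filter.append(Pattern.quote(host.toLowerCase())).append(".*|");
            }
            filter.setCharAt(filter.length() - 1, ')'); // turn the trailing '|' into the closing ')'
            return filter.toString();
        }

        public static void main(String[] args) {
            // hypothetical start points, e.g. hosts extracted from an uploaded link file
            final String mustMatch = siteFilter(Arrays.asList("www.example.org", "yacy.net"));
            System.out.println(mustMatch);
            // prints: (smb|ftp|https?)://(www.)?(\Qexample.org\E.*|\Qyacy.net\E.*)

            final Pattern p = Pattern.compile(mustMatch);
            System.out.println(p.matcher("https://www.example.org/page.html").matches()); // true
            System.out.println(p.matcher("http://other.example.com/").matches());         // false
        }
    }

Because the filter is a single alternation over Pattern.quote()-ed host names rather than a
concatenation of one full per-URL filter expression, it stays compact even when the uploaded
file contains many start URLs, and the lower-casing keeps the domain limitation effective for
hosts written in mixed case.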