From e81b770f791091fe90040d44c5a5acaa4d6d561e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 30 Jun 2021 10:45:58 +0200 Subject: [PATCH] enabled crawl starts with very large sets of start urls i.e. 10MB large url list with approx 0.5 million start points --- htroot/Crawler_p.java | 421 +++++++++--------- .../net/yacy/http/Jetty9HttpServerImpl.java | 3 + source/net/yacy/search/Switchboard.java | 11 +- 3 files changed, 222 insertions(+), 213 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index aa442ea20..5ee2bbd05 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -25,6 +25,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.Writer; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -96,7 +97,7 @@ public class Crawler_p { // inital values for AJAX Elements (without JavaScript) final serverObjects prop = new serverObjects(); prop.put("rejected", 0); - + // check for JSONP if (post != null && post.containsKey("callback") ) { final String jsonp = post.get("callback") + "(["; @@ -122,18 +123,18 @@ public class Crawler_p { prop.putNum("citationSegmentCount", segment.citationSegmentCount()); prop.putNum("rwipublictextSize", segment.RWICount()); prop.putNum("rwipublictextSegmentCount", segment.RWISegmentCount()); - + prop.put("list", "0"); prop.put("loaderSize", 0); prop.put("loaderMax", 0); prop.put("list-loader", 0); - + int coreCrawlJobSize = sb.crawlQueues.coreCrawlJobSize(); int limitCrawlJobSize = sb.crawlQueues.limitCrawlJobSize(); int remoteTriggeredCrawlJobSize = sb.crawlQueues.remoteTriggeredCrawlJobSize(); int noloadCrawlJobSize = sb.crawlQueues.noloadCrawlJobSize(); int allsize = coreCrawlJobSize + limitCrawlJobSize + remoteTriggeredCrawlJobSize + noloadCrawlJobSize; - + prop.put("localCrawlSize", coreCrawlJobSize); prop.put("localCrawlState", ""); prop.put("limitCrawlSize", limitCrawlJobSize); @@ -148,7 +149,7 @@ public class Crawler_p { prop.put("info", "0"); boolean debug = (post != null && post.containsKey("debug")); - + if (post != null) { String c = post.toString(); if (c.length() < 1000) ConcurrentLog.info("Crawl Start", c); @@ -165,7 +166,7 @@ public class Crawler_p { sb.crawler.removePassive(h); try {sb.crawlQueues.noticeURL.removeByProfileHandle(p.handle(), 10000);} catch (SpaceExceededException e) {} } - + // clear stacks for (StackType stackType: StackType.values()) sb.crawlQueues.noticeURL.clear(stackType); try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */} @@ -206,8 +207,8 @@ public class Crawler_p { prop.put("info-queue", 1); prop.putHTML("info-queue_message", "pause reason: " + queuemessage); } - - if (post != null && post.containsKey("terminate")) try { + + if (post != null && post.containsKey("terminate")) try { final String handle = post.get("handle", ""); // termination of a crawl: shift the crawl from active to passive final CrawlProfile p = sb.crawler.getActive(handle.getBytes()); @@ -225,13 +226,13 @@ public class Crawler_p { if (sb.peers == null) { prop.put("info", "3"); } else { - - if(post.getBoolean("cleanSearchCache")) { - // clean up all search events - SearchEventCache.cleanupEvents(true); - sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings - } - + + if(post.getBoolean("cleanSearchCache")) { + // clean up all search events + SearchEventCache.cleanupEvents(true); + sb.index.clearCaches(); // every 
time the ranking is changed we need to remove old orderings + } + // remove crawlingFileContent before we record the call String crawlingFileName = post.get("crawlingFile"); final File crawlingFile; @@ -244,7 +245,7 @@ public class Crawler_p { if (crawlingFile != null && crawlingFile.exists()) { post.remove("crawlingFile$file"); } - + // prepare some filter that are adjusted in case that this is wanted boolean storeHTCache = "on".equals(post.get("storeHTCache", "off")); String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); @@ -267,6 +268,7 @@ public class Crawler_p { Set rootURLs = new HashSet(); String crawlName = ""; if (crawlingFile == null) for (String crawlingStart: rootURLs0) { + StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large if (crawlingStart == null || crawlingStart.length() == 0) continue; // add the prefix http:// if necessary int pos = crawlingStart.indexOf("://",0); @@ -276,14 +278,14 @@ public class Crawler_p { try { DigestURL crawlingStartURL = new DigestURL(crawlingStart); rootURLs.add(crawlingStartURL); - crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ','; + crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(','); if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false; - } catch (final MalformedURLException e) { ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage()); } + crawlName = crawlNameBuilder.toString(); } else { - crawlName = crawlingFile.getName(); + crawlName = crawlingFile.getName(); } if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1); if (crawlName.length() > 64) { @@ -296,7 +298,7 @@ public class Crawler_p { if (fullDomain) { for (DigestURL u: rootURLs) if (u.isFile()) {fullDomain = false; subPath = true; break;} } - + // delete old robots entries for (DigestURL ru : rootURLs) { sb.robots.delete(ru); @@ -307,7 +309,7 @@ public class Crawler_p { } catch (IOException e) {} } try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all. - + // set the crawl filter String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING); final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING); @@ -327,7 +329,7 @@ public class Crawler_p { env.setConfig("crawlOrder", crawlOrder); if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work - + int newcrawlingdepth = post.getInt("crawlingDepth", 8); env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; @@ -355,10 +357,10 @@ public class Crawler_p { boolean followFrames = "on".equals(post.get("followFrames", "false")); env.setConfig("followFrames", followFrames); - + boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "false")); env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex); - + boolean obeyHtmlRobotsNofollow = "on".equals(post.get("obeyHtmlRobotsNofollow", "false")); env.setConfig("obeyHtmlRobotsNofollow", obeyHtmlRobotsNofollow); @@ -369,7 +371,7 @@ public class Crawler_p { env.setConfig("indexMedia", indexMedia); env.setConfig("storeHTCache", storeHTCache); - + String defaultAgentName = sb.isIntranetMode() ? 
ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName; String agentName = post.get("agentName", defaultAgentName); ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName); @@ -379,19 +381,19 @@ public class Crawler_p { if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH; String crawlingMode = post.get("crawlingMode","url"); - + if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) { newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; directDocByURL = false; } - + if ("sitemap".equals(crawlingMode)) { newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; newcrawlingdepth = 0; directDocByURL = false; } - + if ("sitelist".equals(crawlingMode)) { newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; Set newRootURLs = new HashSet(); @@ -415,19 +417,21 @@ public class Crawler_p { // delete all error urls for that domain // and all urls for that host from the crawl queue + List deleteIDs = new ArrayList<>(); Set hosthashes = new HashSet(); boolean anysmbftporpdf = false; for (DigestURL u : rootURLs) { - sb.index.fulltext().remove(u.hash()); + deleteIDs.add(new String(u.hash())); hosthashes.add(u.hosthash()); if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true; } + sb.index.fulltext().remove(deleteIDs); sb.crawlQueues.removeHosts(hosthashes); sb.index.fulltext().commit(true); - + boolean crawlingQ = anysmbftporpdf || "on".equals(post.get("crawlingQ", "off")) || "sitemap".equals(crawlingMode); env.setConfig("crawlingQ", crawlingQ); - + // compute mustmatch filter according to rootURLs if ((fullDomain || subPath) && newcrawlingdepth > 0) { String siteFilter = ".*"; @@ -454,19 +458,21 @@ public class Crawler_p { newcrawlingMustMatch = "(" + newcrawlingMustMatch + ")|(" + siteFilter + ")"; } } - + // check if the crawl filter works correctly try { Pattern mmp = Pattern.compile(newcrawlingMustMatch); + int maxcheck = 100; for (DigestURL u: rootURLs) { assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true); + if (maxcheck-- <= 0) break; } } catch (final PatternSyntaxException e) { prop.put("info", "4"); // crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_error", e.getMessage()); } - + boolean hasCrawlstartDataOK = !crawlName.isEmpty(); if (hasCrawlstartDataOK) { // check crawlurl was given in sitecrawl @@ -474,25 +480,25 @@ public class Crawler_p { prop.put("info", "5"); //Crawling failed prop.putHTML("info_crawlingURL", "(no url given)"); prop.putHTML("info_reasonString", "you must submit at least one crawl url"); - hasCrawlstartDataOK = false; + hasCrawlstartDataOK = false; } } - + String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1"); int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString); boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage"); boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld"); String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", ""); - + String ignoreclassname_s = post.get("ignoreclassname"); Set ignoreclassname = new HashSet<>(); if (ignoreclassname_s != null) { - String[] ignoreclassname_a = ignoreclassname_s.trim().split(","); - for (int i = 0; i < ignoreclassname_a.length; i++) { - ignoreclassname.add(ignoreclassname_a[i].trim()); - } 
+ String[] ignoreclassname_a = ignoreclassname_s.trim().split(","); + for (int i = 0; i < ignoreclassname_a.length; i++) { + ignoreclassname.add(ignoreclassname_a[i].trim()); + } } - + // get vocabulary scraper info JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context for (String key: post.keySet()) { @@ -518,9 +524,9 @@ public class Crawler_p { } } } - + int timezoneOffset = post.getInt("timezoneOffset", 0); - + // in case that we crawl from a file, load that file and re-compute mustmatch pattern List hyperlinks_from_file = null; if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) { @@ -528,13 +534,13 @@ public class Crawler_p { try { if (newcrawlingdepth > 0) { if (fullDomain) { - /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now. - * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */ + /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now. + * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */ hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent); newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file); } else if (subPath) { - /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now. - * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */ + /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now. + * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */ hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent); newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file); } @@ -548,53 +554,50 @@ public class Crawler_p { } sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); } - + /* If a solr query filter is defined, verify now its syntax and that the embedded Solr schema is available */ - final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim(); - final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim(); - if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { - - final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance(); - final SolrCore embeddedCore = embeddedSolr != null ? 
embeddedSolr.getDefaultCore() : null; - final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null; - prop.put("noEmbeddedSolr", !embeddedSolrConnected); - if (embeddedSolrConnected) { - if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) { - try { - SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore); - } catch(final SyntaxError | SolrException e) { - hasCrawlstartDataOK = false; - prop.put("info", "10"); - prop.put("info_solrQuery", solrQueryMustMatch); - } catch(final RuntimeException e) { - hasCrawlstartDataOK = false; - prop.put("info", "11"); - prop.put("info_solrQuery", solrQueryMustMatch); - } - } - - if(!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { - try { - SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore); - } catch(final SyntaxError | SolrException e) { - hasCrawlstartDataOK = false; - prop.put("info", "10"); - prop.put("info_solrQuery", solrQueryMustNotMatch); - } catch(final RuntimeException e) { - hasCrawlstartDataOK = false; - prop.put("info", "11"); - prop.put("info_solrQuery", solrQueryMustNotMatch); - } - } - } else { - hasCrawlstartDataOK = false; - prop.put("info", "9"); - } - - - - } - + final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim(); + final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim(); + if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { + + final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance(); + final SolrCore embeddedCore = embeddedSolr != null ? 
embeddedSolr.getDefaultCore() : null; + final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null; + prop.put("noEmbeddedSolr", !embeddedSolrConnected); + if (embeddedSolrConnected) { + if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) { + try { + SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore); + } catch(final SyntaxError | SolrException e) { + hasCrawlstartDataOK = false; + prop.put("info", "10"); + prop.put("info_solrQuery", solrQueryMustMatch); + } catch(final RuntimeException e) { + hasCrawlstartDataOK = false; + prop.put("info", "11"); + prop.put("info_solrQuery", solrQueryMustMatch); + } + } + + if(!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { + try { + SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore); + } catch(final SyntaxError | SolrException e) { + hasCrawlstartDataOK = false; + prop.put("info", "10"); + prop.put("info_solrQuery", solrQueryMustNotMatch); + } catch(final RuntimeException e) { + hasCrawlstartDataOK = false; + prop.put("info", "11"); + prop.put("info_solrQuery", solrQueryMustNotMatch); + } + } + } else { + hasCrawlstartDataOK = false; + prop.put("info", "9"); + } + } + // prepare a new crawling profile final CrawlProfile profile; byte[] handle; @@ -632,20 +635,19 @@ public class Crawler_p { new VocabularyScraper(vocabulary_scraper), timezoneOffset); - profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, - post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); - profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post - .get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); - profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, - post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); - profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post - .get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); - profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch); - profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch); - profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, - post.getBoolean("crawlerAlwaysCheckMediaType")); - - + profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, + post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); + profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post + .get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); + profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, + post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); + profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post + .get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); + profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch); + profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch); + profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, + post.getBoolean("crawlerAlwaysCheckMediaType")); + handle = ASCII.getBytes(profile.handle()); // before we fire up a new crawl, we make sure that another crawl with the same name is not running @@ -658,14 +660,12 @@ public class Crawler_p { profile = null; handle = null; } - // start 
the crawl - if(hasCrawlstartDataOK) { - - final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false); - - if ("url".equals(crawlingMode)) { + if (hasCrawlstartDataOK) { + final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false); + + if ("url".equals(crawlingMode)) { // stack requests sb.crawler.putActive(handle, profile); final Set successurls = new HashSet(); @@ -703,65 +703,64 @@ public class Crawler_p { sb.crawlQueues.errorURL.push(failure.getKey(), 0, null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1); fr.append(failure.getValue()).append('/'); } - + prop.put("info", "5"); //Crawling failed prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); prop.putHTML("info_reasonString", fr.toString()); } if (successurls.size() > 0) { - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - - prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); + } + } else if ("sitemap".equals(crawlingMode)) { + try { + final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway + sb.crawler.putActive(handle, profile); + final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile); + importer.start(); + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", sitemapURLStr); + prop.putHTML("info_error", e.getMessage()); + ConcurrentLog.logException(e); } - } else if ("sitemap".equals(crawlingMode)) { - try { - final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? 
new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway - sb.crawler.putActive(handle, profile); - final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile); - importer.start(); - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); - } catch (final Exception e) { - // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", sitemapURLStr); - prop.putHTML("info_error", e.getMessage()); - ConcurrentLog.logException(e); - } - } else if ("file".equals(crawlingMode)) { - if (post.containsKey("crawlingFile") && crawlingFile != null) { - try { - if(newcrawlingdepth > 0 && (fullDomain || subPath)) { - /* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */ - if(hyperlinks_from_file != null) { - sb.crawler.putActive(handle, profile); - sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset()); - } - } else { - /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */ - final String crawlingFileContent = post.get("crawlingFile$file", ""); - final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, - new HashSet(), new VocabularyScraper(), profile.timezoneOffset()); - FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile, - sb.crawlStacker, sb.peers.mySeed().hash.getBytes()); - sb.crawler.putActive(handle, profile); - crawlStarterTask.start(); - } - } catch (final PatternSyntaxException e) { - prop.put("info", "4"); // crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_error", e.getMessage()); - } catch (final Exception e) { - // mist - prop.put("info", "7"); // Error with file - prop.putHTML("info_crawlingStart", crawlingFileName); - prop.putHTML("info_error", e.getMessage()); - ConcurrentLog.logException(e); - } - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); - } - } + } else if ("file".equals(crawlingMode)) { + if (post.containsKey("crawlingFile") && crawlingFile != null) { + try { + if(newcrawlingdepth > 0 && (fullDomain || subPath)) { + /* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */ + if(hyperlinks_from_file != null) { + sb.crawler.putActive(handle, profile); + sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset()); + } + } else { + /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */ + final String crawlingFileContent = post.get("crawlingFile$file", ""); + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, + new HashSet(), new VocabularyScraper(), profile.timezoneOffset()); + FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile, + sb.crawlStacker, sb.peers.mySeed().hash.getBytes()); + sb.crawler.putActive(handle, profile); + crawlStarterTask.start(); + } + } catch (final PatternSyntaxException e) { + prop.put("info", "4"); // crawlfilter 
does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_error", e.getMessage()); + } catch (final Exception e) { + // mist + prop.put("info", "7"); // Error with file + prop.putHTML("info_crawlingStart", crawlingFileName); + prop.putHTML("info_error", e.getMessage()); + ConcurrentLog.logException(e); + } + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); + } + } } } } @@ -783,7 +782,7 @@ public class Crawler_p { } catch (final NumberFormatException e) {} if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10; if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000; - + int wPPM = wantedPPM; if ( wPPM <= 0 ) { wPPM = 1; @@ -793,9 +792,9 @@ public class Crawler_p { } final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 final float loadprereq = wantedPPM <= 10 ? 1.0f : wantedPPM <= 100 ? 2.0f : wantedPPM >= 1000 ? 8.0f : 3.0f; - + BusyThread thread; - + thread = sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); if ( thread != null ) { sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, thread.setBusySleep(newBusySleep)); @@ -826,7 +825,7 @@ public class Crawler_p { String hosts = ""; for (final byte[] h: sb.crawler.getActive()) { profile = sb.crawler.getActive(h); - if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue; + if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue; profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength); prop.put("crawlProfilesShow_list_" + count + "_debug", debug ? 1 : 0); if (debug) { @@ -877,47 +876,47 @@ public class Crawler_p { * @throws IOException * @throws FileNotFoundException */ - private static List crawlingFileStart(final File crawlingFile, int timezoneOffset, - final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException { - List hyperlinks_from_file; - // check if the crawl filter works correctly - final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet(), new VocabularyScraper(), timezoneOffset); - final Writer writer = new TransformerWriter(null, null, scraper, false); - if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) { - /* Let's report here detailed error to help user when he selected a wrong file */ - if(!crawlingFile.exists()) { - throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists"); - } - if(!crawlingFile.isFile()) { - throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file"); - } - if(!crawlingFile.canRead()) { - throw new IOException("Can not read : " + crawlingFile.getAbsolutePath()); - } - } - if (crawlingFile != null) { - FileInputStream inStream = null; - try { - inStream = new FileInputStream(crawlingFile); - FileUtils.copy(inStream, writer); - } finally { - if(inStream != null) { - try { - inStream.close(); - } catch(IOException ignoredException) { - ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath()); - } - } - } - } else { - FileUtils.copy(crawlingFileContent, writer); - } - writer.close(); - - // get links and generate filter - hyperlinks_from_file = scraper.getAnchors(); - return hyperlinks_from_file; - } + private static List crawlingFileStart(final File crawlingFile, int 
timezoneOffset, + final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException { + List hyperlinks_from_file; + // check if the crawl filter works correctly + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet(), new VocabularyScraper(), timezoneOffset); + final Writer writer = new TransformerWriter(null, null, scraper, false); + if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) { + /* Let's report here detailed error to help user when he selected a wrong file */ + if(!crawlingFile.exists()) { + throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists"); + } + if(!crawlingFile.isFile()) { + throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file"); + } + if(!crawlingFile.canRead()) { + throw new IOException("Can not read : " + crawlingFile.getAbsolutePath()); + } + } + if (crawlingFile != null) { + FileInputStream inStream = null; + try { + inStream = new FileInputStream(crawlingFile); + FileUtils.copy(inStream, writer); + } finally { + if(inStream != null) { + try { + inStream.close(); + } catch(IOException ignoredException) { + ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath()); + } + } + } + } else { + FileUtils.copy(crawlingFileContent, writer); + } + writer.close(); + + // get links and generate filter + hyperlinks_from_file = scraper.getAnchors(); + return hyperlinks_from_file; + } private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) { if (!recrawlIfOlderCheck) return null; diff --git a/source/net/yacy/http/Jetty9HttpServerImpl.java b/source/net/yacy/http/Jetty9HttpServerImpl.java index 5163dd8d3..83b8ee15a 100644 --- a/source/net/yacy/http/Jetty9HttpServerImpl.java +++ b/source/net/yacy/http/Jetty9HttpServerImpl.java @@ -85,7 +85,9 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer { connector.setName("httpd:"+Integer.toString(port)); connector.setIdleTimeout(9000); // timout in ms when no bytes send / received connector.setAcceptQueueSize(128); + server.addConnector(connector); + // add ssl/https connector boolean useSSL = sb.getConfigBool("server.https", false); @@ -202,6 +204,7 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer { context.setServer(server); context.setContextPath("/"); context.setHandler(handlers); + context.setMaxFormContentSize(1024 * 1024 * 10); // allow 10MB, large forms may be required during crawl starts with long lists // make YaCy handlers (in context) and servlet context handlers available (both contain root context "/") // logic: 1. YaCy handlers are called if request not handled (e.g. 
proxy) then servlets handle it
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index abd827622..cb9383dd9 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -3757,7 +3757,8 @@ public final class Switchboard extends serverSwitch {
             if ((failreason = Switchboard.this.stackUrl(profile, turl)) == null) successurls.add(turl); else failurls.put(turl, failreason);
             return;
         }
-        final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+        final ArrayList<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+        int maxthreads = 5 * Runtime.getRuntime().availableProcessors();
         for (DigestURL url: rootURLs) {
             final DigestURL turl = url;
             Thread t = new Thread("Switchboard.stackURLs") {
@@ -3769,7 +3770,13 @@ public final class Switchboard extends serverSwitch {
             };
             t.start();
             stackthreads.add(t);
-            try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent that this fires more than 10 connections pre second!
+            if (stackthreads.size() > maxthreads) {
+                Thread w = stackthreads.get(0);
+                while (w.isAlive()) {
+                    try {Thread.sleep(100);} catch (final InterruptedException e) {}
+                }
+                stackthreads.remove(0);
+            }
         }
         final long waitingtime = 10 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
         for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
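
Note (illustrative sketch, not part of the patch): the Switchboard hunk above replaces the fixed 100 ms sleep per start URL with a bound on the number of live stacker threads (5 per available processor), which is what keeps a crawl start with roughly 0.5 million root URLs from stalling before the first URL is stacked. A minimal, self-contained sketch of that throttling pattern follows; the class name, the sample items and the println body are hypothetical stand-ins for the real stackUrl call.

import java.util.ArrayList;
import java.util.List;

// Sketch of the thread-throttling pattern used in Switchboard.stackURLs:
// start one worker per item, but never keep more than maxthreads workers alive;
// when the bound is reached, wait for the oldest worker before starting the next one.
public class BoundedThreadStarterSketch {

    public static void main(String[] args) throws InterruptedException {
        final List<String> items = List.of("https://example.org/a", "https://example.org/b", "https://example.org/c");
        final ArrayList<Thread> workers = new ArrayList<>();
        final int maxthreads = 5 * Runtime.getRuntime().availableProcessors();

        for (final String item : items) {
            Thread t = new Thread("sketch.worker") {
                @Override
                public void run() {
                    System.out.println("processing " + item); // stand-in for Switchboard.this.stackUrl(profile, url)
                }
            };
            t.start();
            workers.add(t);

            // throttle: when too many workers are alive, block until the oldest one has finished
            if (workers.size() > maxthreads) {
                Thread oldest = workers.get(0);
                while (oldest.isAlive()) {
                    Thread.sleep(100);
                }
                workers.remove(0);
            }
        }

        // bounded join, so a very large start list cannot stall the caller indefinitely
        final long waitingtime = 10 + (30000 / Math.max(1, items.size()));
        for (Thread t : workers) t.join(waitingtime);
    }
}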
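
Also illustrative (not part of the patch): the Jetty9HttpServerImpl hunk raises the container's form-size limit with setMaxFormContentSize, because Jetty 9 otherwise rejects POSTed forms above its default limit (about 200 KB), so a multi-megabyte crawl-start URL list would never reach Crawler_p. A standalone sketch of the same setting, with a hypothetical port and an otherwise empty context:

import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;

// Standalone sketch of the Jetty 9 setting added in the patch: raise the maximum
// accepted form content size so that very large crawl-start forms (e.g. a ~10MB
// URL list with approx. 0.5 million start points) are not rejected by the container.
public class MaxFormContentSizeSketch {
    public static void main(String[] args) throws Exception {
        Server server = new Server(8090);                 // hypothetical port
        ServletContextHandler context = new ServletContextHandler();
        context.setContextPath("/");
        context.setMaxFormContentSize(1024 * 1024 * 10);  // allow 10MB form posts
        server.setHandler(context);
        server.start();
        server.join();
    }
}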
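
Finally, also illustrative (not part of the patch): the Crawler_p hunks replace one index delete per root URL with a single bulk remove of all collected URL hashes followed by one commit, and cap the mustmatch self-check at 100 root URLs. A schematic sketch of the batching idea; the Index interface here is a hypothetical stand-in, the real call in the patch is sb.index.fulltext().remove(deleteIDs).

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

// Schematic sketch of the batching change in Crawler_p: instead of one delete call
// per root URL (expensive for ~0.5 million start points), collect the ids first and
// issue a single bulk remove plus a single commit.
public class BulkDeleteSketch {

    /** Hypothetical stand-in for YaCy's fulltext index. */
    interface Index {
        void remove(Collection<String> ids); // bulk delete
        void commit(boolean softCommit);     // flush once at the end
    }

    static void deleteRootUrls(Index index, List<String> rootUrlHashes) {
        List<String> deleteIDs = new ArrayList<>(rootUrlHashes); // gather everything first
        index.remove(deleteIDs);  // one call instead of rootUrlHashes.size() calls
        index.commit(true);       // single commit, as in the patch
    }
}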