From e81b770f791091fe90040d44c5a5acaa4d6d561e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 30 Jun 2021 10:45:58 +0200 Subject: [PATCH] enabled crawl starts with very large sets of start urls i.e. 10MB large url list with approx 0.5 million start points --- htroot/Crawler_p.java | 421 +++++++++--------- .../net/yacy/http/Jetty9HttpServerImpl.java | 3 + source/net/yacy/search/Switchboard.java | 11 +- 3 files changed, 222 insertions(+), 213 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index aa442ea20..5ee2bbd05 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -25,6 +25,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.Writer; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -96,7 +97,7 @@ public class Crawler_p { // inital values for AJAX Elements (without JavaScript) final serverObjects prop = new serverObjects(); prop.put("rejected", 0); - + // check for JSONP if (post != null && post.containsKey("callback") ) { final String jsonp = post.get("callback") + "(["; @@ -122,18 +123,18 @@ public class Crawler_p { prop.putNum("citationSegmentCount", segment.citationSegmentCount()); prop.putNum("rwipublictextSize", segment.RWICount()); prop.putNum("rwipublictextSegmentCount", segment.RWISegmentCount()); - + prop.put("list", "0"); prop.put("loaderSize", 0); prop.put("loaderMax", 0); prop.put("list-loader", 0); - + int coreCrawlJobSize = sb.crawlQueues.coreCrawlJobSize(); int limitCrawlJobSize = sb.crawlQueues.limitCrawlJobSize(); int remoteTriggeredCrawlJobSize = sb.crawlQueues.remoteTriggeredCrawlJobSize(); int noloadCrawlJobSize = sb.crawlQueues.noloadCrawlJobSize(); int allsize = coreCrawlJobSize + limitCrawlJobSize + remoteTriggeredCrawlJobSize + noloadCrawlJobSize; - + prop.put("localCrawlSize", coreCrawlJobSize); prop.put("localCrawlState", ""); prop.put("limitCrawlSize", limitCrawlJobSize); @@ -148,7 +149,7 @@ public class Crawler_p { prop.put("info", "0"); boolean debug = (post != null && post.containsKey("debug")); - + if (post != null) { String c = post.toString(); if (c.length() < 1000) ConcurrentLog.info("Crawl Start", c); @@ -165,7 +166,7 @@ public class Crawler_p { sb.crawler.removePassive(h); try {sb.crawlQueues.noticeURL.removeByProfileHandle(p.handle(), 10000);} catch (SpaceExceededException e) {} } - + // clear stacks for (StackType stackType: StackType.values()) sb.crawlQueues.noticeURL.clear(stackType); try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */} @@ -206,8 +207,8 @@ public class Crawler_p { prop.put("info-queue", 1); prop.putHTML("info-queue_message", "pause reason: " + queuemessage); } - - if (post != null && post.containsKey("terminate")) try { + + if (post != null && post.containsKey("terminate")) try { final String handle = post.get("handle", ""); // termination of a crawl: shift the crawl from active to passive final CrawlProfile p = sb.crawler.getActive(handle.getBytes()); @@ -225,13 +226,13 @@ public class Crawler_p { if (sb.peers == null) { prop.put("info", "3"); } else { - - if(post.getBoolean("cleanSearchCache")) { - // clean up all search events - SearchEventCache.cleanupEvents(true); - sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings - } - + + if(post.getBoolean("cleanSearchCache")) { + // clean up all search events + SearchEventCache.cleanupEvents(true); + sb.index.clearCaches(); // every 
time the ranking is changed we need to remove old orderings + } + // remove crawlingFileContent before we record the call String crawlingFileName = post.get("crawlingFile"); final File crawlingFile; @@ -244,7 +245,7 @@ public class Crawler_p { if (crawlingFile != null && crawlingFile.exists()) { post.remove("crawlingFile$file"); } - + // prepare some filter that are adjusted in case that this is wanted boolean storeHTCache = "on".equals(post.get("storeHTCache", "off")); String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); @@ -267,6 +268,7 @@ public class Crawler_p { Set rootURLs = new HashSet(); String crawlName = ""; if (crawlingFile == null) for (String crawlingStart: rootURLs0) { + StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large if (crawlingStart == null || crawlingStart.length() == 0) continue; // add the prefix http:// if necessary int pos = crawlingStart.indexOf("://",0); @@ -276,14 +278,14 @@ public class Crawler_p { try { DigestURL crawlingStartURL = new DigestURL(crawlingStart); rootURLs.add(crawlingStartURL); - crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ','; + crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(','); if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false; - } catch (final MalformedURLException e) { ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage()); } + crawlName = crawlNameBuilder.toString(); } else { - crawlName = crawlingFile.getName(); + crawlName = crawlingFile.getName(); } if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1); if (crawlName.length() > 64) { @@ -296,7 +298,7 @@ public class Crawler_p { if (fullDomain) { for (DigestURL u: rootURLs) if (u.isFile()) {fullDomain = false; subPath = true; break;} } - + // delete old robots entries for (DigestURL ru : rootURLs) { sb.robots.delete(ru); @@ -307,7 +309,7 @@ public class Crawler_p { } catch (IOException e) {} } try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all. - + // set the crawl filter String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING); final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING); @@ -327,7 +329,7 @@ public class Crawler_p { env.setConfig("crawlOrder", crawlOrder); if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work - + int newcrawlingdepth = post.getInt("crawlingDepth", 8); env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; @@ -355,10 +357,10 @@ public class Crawler_p { boolean followFrames = "on".equals(post.get("followFrames", "false")); env.setConfig("followFrames", followFrames); - + boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "false")); env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex); - + boolean obeyHtmlRobotsNofollow = "on".equals(post.get("obeyHtmlRobotsNofollow", "false")); env.setConfig("obeyHtmlRobotsNofollow", obeyHtmlRobotsNofollow); @@ -369,7 +371,7 @@ public class Crawler_p { env.setConfig("indexMedia", indexMedia); env.setConfig("storeHTCache", storeHTCache); - + String defaultAgentName = sb.isIntranetMode() ? 
ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName; String agentName = post.get("agentName", defaultAgentName); ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName); @@ -379,19 +381,19 @@ public class Crawler_p { if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH; String crawlingMode = post.get("crawlingMode","url"); - + if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) { newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; directDocByURL = false; } - + if ("sitemap".equals(crawlingMode)) { newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; newcrawlingdepth = 0; directDocByURL = false; } - + if ("sitelist".equals(crawlingMode)) { newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; Set newRootURLs = new HashSet(); @@ -415,19 +417,21 @@ public class Crawler_p { // delete all error urls for that domain // and all urls for that host from the crawl queue + List deleteIDs = new ArrayList<>(); Set hosthashes = new HashSet(); boolean anysmbftporpdf = false; for (DigestURL u : rootURLs) { - sb.index.fulltext().remove(u.hash()); + deleteIDs.add(new String(u.hash())); hosthashes.add(u.hosthash()); if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true; } + sb.index.fulltext().remove(deleteIDs); sb.crawlQueues.removeHosts(hosthashes); sb.index.fulltext().commit(true); - + boolean crawlingQ = anysmbftporpdf || "on".equals(post.get("crawlingQ", "off")) || "sitemap".equals(crawlingMode); env.setConfig("crawlingQ", crawlingQ); - + // compute mustmatch filter according to rootURLs if ((fullDomain || subPath) && newcrawlingdepth > 0) { String siteFilter = ".*"; @@ -454,19 +458,21 @@ public class Crawler_p { newcrawlingMustMatch = "(" + newcrawlingMustMatch + ")|(" + siteFilter + ")"; } } - + // check if the crawl filter works correctly try { Pattern mmp = Pattern.compile(newcrawlingMustMatch); + int maxcheck = 100; for (DigestURL u: rootURLs) { assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true); + if (maxcheck-- <= 0) break; } } catch (final PatternSyntaxException e) { prop.put("info", "4"); // crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_error", e.getMessage()); } - + boolean hasCrawlstartDataOK = !crawlName.isEmpty(); if (hasCrawlstartDataOK) { // check crawlurl was given in sitecrawl @@ -474,25 +480,25 @@ public class Crawler_p { prop.put("info", "5"); //Crawling failed prop.putHTML("info_crawlingURL", "(no url given)"); prop.putHTML("info_reasonString", "you must submit at least one crawl url"); - hasCrawlstartDataOK = false; + hasCrawlstartDataOK = false; } } - + String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1"); int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString); boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage"); boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld"); String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", ""); - + String ignoreclassname_s = post.get("ignoreclassname"); Set ignoreclassname = new HashSet<>(); if (ignoreclassname_s != null) { - String[] ignoreclassname_a = ignoreclassname_s.trim().split(","); - for (int i = 0; i < ignoreclassname_a.length; i++) { - ignoreclassname.add(ignoreclassname_a[i].trim()); - } 
+ String[] ignoreclassname_a = ignoreclassname_s.trim().split(","); + for (int i = 0; i < ignoreclassname_a.length; i++) { + ignoreclassname.add(ignoreclassname_a[i].trim()); + } } - + // get vocabulary scraper info JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context for (String key: post.keySet()) { @@ -518,9 +524,9 @@ public class Crawler_p { } } } - + int timezoneOffset = post.getInt("timezoneOffset", 0); - + // in case that we crawl from a file, load that file and re-compute mustmatch pattern List hyperlinks_from_file = null; if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) { @@ -528,13 +534,13 @@ public class Crawler_p { try { if (newcrawlingdepth > 0) { if (fullDomain) { - /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now. - * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */ + /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now. + * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */ hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent); newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file); } else if (subPath) { - /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now. - * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */ + /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now. + * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */ hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent); newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file); } @@ -548,53 +554,50 @@ public class Crawler_p { } sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); } - + /* If a solr query filter is defined, verify now its syntax and that the embedded Solr schema is available */ - final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim(); - final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim(); - if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { - - final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance(); - final SolrCore embeddedCore = embeddedSolr != null ? 
embeddedSolr.getDefaultCore() : null; - final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null; - prop.put("noEmbeddedSolr", !embeddedSolrConnected); - if (embeddedSolrConnected) { - if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) { - try { - SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore); - } catch(final SyntaxError | SolrException e) { - hasCrawlstartDataOK = false; - prop.put("info", "10"); - prop.put("info_solrQuery", solrQueryMustMatch); - } catch(final RuntimeException e) { - hasCrawlstartDataOK = false; - prop.put("info", "11"); - prop.put("info_solrQuery", solrQueryMustMatch); - } - } - - if(!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { - try { - SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore); - } catch(final SyntaxError | SolrException e) { - hasCrawlstartDataOK = false; - prop.put("info", "10"); - prop.put("info_solrQuery", solrQueryMustNotMatch); - } catch(final RuntimeException e) { - hasCrawlstartDataOK = false; - prop.put("info", "11"); - prop.put("info_solrQuery", solrQueryMustNotMatch); - } - } - } else { - hasCrawlstartDataOK = false; - prop.put("info", "9"); - } - - - - } - + final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim(); + final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim(); + if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { + + final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance(); + final SolrCore embeddedCore = embeddedSolr != null ? 
embeddedSolr.getDefaultCore() : null; + final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null; + prop.put("noEmbeddedSolr", !embeddedSolrConnected); + if (embeddedSolrConnected) { + if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) { + try { + SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore); + } catch(final SyntaxError | SolrException e) { + hasCrawlstartDataOK = false; + prop.put("info", "10"); + prop.put("info_solrQuery", solrQueryMustMatch); + } catch(final RuntimeException e) { + hasCrawlstartDataOK = false; + prop.put("info", "11"); + prop.put("info_solrQuery", solrQueryMustMatch); + } + } + + if(!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { + try { + SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore); + } catch(final SyntaxError | SolrException e) { + hasCrawlstartDataOK = false; + prop.put("info", "10"); + prop.put("info_solrQuery", solrQueryMustNotMatch); + } catch(final RuntimeException e) { + hasCrawlstartDataOK = false; + prop.put("info", "11"); + prop.put("info_solrQuery", solrQueryMustNotMatch); + } + } + } else { + hasCrawlstartDataOK = false; + prop.put("info", "9"); + } + } + // prepare a new crawling profile final CrawlProfile profile; byte[] handle; @@ -632,20 +635,19 @@ public class Crawler_p { new VocabularyScraper(vocabulary_scraper), timezoneOffset); - profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, - post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); - profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post - .get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); - profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, - post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); - profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post - .get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); - profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch); - profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch); - profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, - post.getBoolean("crawlerAlwaysCheckMediaType")); - - + profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, + post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); + profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post + .get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); + profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, + post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); + profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post + .get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); + profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch); + profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch); + profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, + post.getBoolean("crawlerAlwaysCheckMediaType")); + handle = ASCII.getBytes(profile.handle()); // before we fire up a new crawl, we make sure that another crawl with the same name is not running @@ -658,14 +660,12 @@ public class Crawler_p { profile = null; handle = null; } - // start 
the crawl - if(hasCrawlstartDataOK) { - - final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false); - - if ("url".equals(crawlingMode)) { + if (hasCrawlstartDataOK) { + final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false); + + if ("url".equals(crawlingMode)) { // stack requests sb.crawler.putActive(handle, profile); final Set successurls = new HashSet(); @@ -703,65 +703,64 @@ public class Crawler_p { sb.crawlQueues.errorURL.push(failure.getKey(), 0, null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1); fr.append(failure.getValue()).append('/'); } - + prop.put("info", "5"); //Crawling failed prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); prop.putHTML("info_reasonString", fr.toString()); } if (successurls.size() > 0) { - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - - prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); + } + } else if ("sitemap".equals(crawlingMode)) { + try { + final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway + sb.crawler.putActive(handle, profile); + final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile); + importer.start(); + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", sitemapURLStr); + prop.putHTML("info_error", e.getMessage()); + ConcurrentLog.logException(e); } - } else if ("sitemap".equals(crawlingMode)) { - try { - final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? 
new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway - sb.crawler.putActive(handle, profile); - final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile); - importer.start(); - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); - } catch (final Exception e) { - // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", sitemapURLStr); - prop.putHTML("info_error", e.getMessage()); - ConcurrentLog.logException(e); - } - } else if ("file".equals(crawlingMode)) { - if (post.containsKey("crawlingFile") && crawlingFile != null) { - try { - if(newcrawlingdepth > 0 && (fullDomain || subPath)) { - /* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */ - if(hyperlinks_from_file != null) { - sb.crawler.putActive(handle, profile); - sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset()); - } - } else { - /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */ - final String crawlingFileContent = post.get("crawlingFile$file", ""); - final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, - new HashSet(), new VocabularyScraper(), profile.timezoneOffset()); - FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile, - sb.crawlStacker, sb.peers.mySeed().hash.getBytes()); - sb.crawler.putActive(handle, profile); - crawlStarterTask.start(); - } - } catch (final PatternSyntaxException e) { - prop.put("info", "4"); // crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_error", e.getMessage()); - } catch (final Exception e) { - // mist - prop.put("info", "7"); // Error with file - prop.putHTML("info_crawlingStart", crawlingFileName); - prop.putHTML("info_error", e.getMessage()); - ConcurrentLog.logException(e); - } - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); - } - } + } else if ("file".equals(crawlingMode)) { + if (post.containsKey("crawlingFile") && crawlingFile != null) { + try { + if(newcrawlingdepth > 0 && (fullDomain || subPath)) { + /* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */ + if(hyperlinks_from_file != null) { + sb.crawler.putActive(handle, profile); + sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset()); + } + } else { + /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */ + final String crawlingFileContent = post.get("crawlingFile$file", ""); + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, + new HashSet(), new VocabularyScraper(), profile.timezoneOffset()); + FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile, + sb.crawlStacker, sb.peers.mySeed().hash.getBytes()); + sb.crawler.putActive(handle, profile); + crawlStarterTask.start(); + } + } catch (final PatternSyntaxException e) { + prop.put("info", "4"); // crawlfilter 
does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_error", e.getMessage()); + } catch (final Exception e) { + // mist + prop.put("info", "7"); // Error with file + prop.putHTML("info_crawlingStart", crawlingFileName); + prop.putHTML("info_error", e.getMessage()); + ConcurrentLog.logException(e); + } + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults); + } + } } } } @@ -783,7 +782,7 @@ public class Crawler_p { } catch (final NumberFormatException e) {} if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10; if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000; - + int wPPM = wantedPPM; if ( wPPM <= 0 ) { wPPM = 1; @@ -793,9 +792,9 @@ public class Crawler_p { } final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 final float loadprereq = wantedPPM <= 10 ? 1.0f : wantedPPM <= 100 ? 2.0f : wantedPPM >= 1000 ? 8.0f : 3.0f; - + BusyThread thread; - + thread = sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); if ( thread != null ) { sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, thread.setBusySleep(newBusySleep)); @@ -826,7 +825,7 @@ public class Crawler_p { String hosts = ""; for (final byte[] h: sb.crawler.getActive()) { profile = sb.crawler.getActive(h); - if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue; + if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue; profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength); prop.put("crawlProfilesShow_list_" + count + "_debug", debug ? 1 : 0); if (debug) { @@ -877,47 +876,47 @@ public class Crawler_p { * @throws IOException * @throws FileNotFoundException */ - private static List crawlingFileStart(final File crawlingFile, int timezoneOffset, - final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException { - List hyperlinks_from_file; - // check if the crawl filter works correctly - final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet(), new VocabularyScraper(), timezoneOffset); - final Writer writer = new TransformerWriter(null, null, scraper, false); - if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) { - /* Let's report here detailed error to help user when he selected a wrong file */ - if(!crawlingFile.exists()) { - throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists"); - } - if(!crawlingFile.isFile()) { - throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file"); - } - if(!crawlingFile.canRead()) { - throw new IOException("Can not read : " + crawlingFile.getAbsolutePath()); - } - } - if (crawlingFile != null) { - FileInputStream inStream = null; - try { - inStream = new FileInputStream(crawlingFile); - FileUtils.copy(inStream, writer); - } finally { - if(inStream != null) { - try { - inStream.close(); - } catch(IOException ignoredException) { - ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath()); - } - } - } - } else { - FileUtils.copy(crawlingFileContent, writer); - } - writer.close(); - - // get links and generate filter - hyperlinks_from_file = scraper.getAnchors(); - return hyperlinks_from_file; - } + private static List crawlingFileStart(final File crawlingFile, int 
timezoneOffset, + final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException { + List hyperlinks_from_file; + // check if the crawl filter works correctly + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet(), new VocabularyScraper(), timezoneOffset); + final Writer writer = new TransformerWriter(null, null, scraper, false); + if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) { + /* Let's report here detailed error to help user when he selected a wrong file */ + if(!crawlingFile.exists()) { + throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists"); + } + if(!crawlingFile.isFile()) { + throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file"); + } + if(!crawlingFile.canRead()) { + throw new IOException("Can not read : " + crawlingFile.getAbsolutePath()); + } + } + if (crawlingFile != null) { + FileInputStream inStream = null; + try { + inStream = new FileInputStream(crawlingFile); + FileUtils.copy(inStream, writer); + } finally { + if(inStream != null) { + try { + inStream.close(); + } catch(IOException ignoredException) { + ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath()); + } + } + } + } else { + FileUtils.copy(crawlingFileContent, writer); + } + writer.close(); + + // get links and generate filter + hyperlinks_from_file = scraper.getAnchors(); + return hyperlinks_from_file; + } private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) { if (!recrawlIfOlderCheck) return null; diff --git a/source/net/yacy/http/Jetty9HttpServerImpl.java b/source/net/yacy/http/Jetty9HttpServerImpl.java index 5163dd8d3..83b8ee15a 100644 --- a/source/net/yacy/http/Jetty9HttpServerImpl.java +++ b/source/net/yacy/http/Jetty9HttpServerImpl.java @@ -85,7 +85,9 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer { connector.setName("httpd:"+Integer.toString(port)); connector.setIdleTimeout(9000); // timout in ms when no bytes send / received connector.setAcceptQueueSize(128); + server.addConnector(connector); + // add ssl/https connector boolean useSSL = sb.getConfigBool("server.https", false); @@ -202,6 +204,7 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer { context.setServer(server); context.setContextPath("/"); context.setHandler(handlers); + context.setMaxFormContentSize(1024 * 1024 * 10); // allow 10MB, large forms may be required during crawl starts with long lists // make YaCy handlers (in context) and servlet context handlers available (both contain root context "/") // logic: 1. YaCy handlers are called if request not handled (e.g. 
proxy) then servlets handle it
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index abd827622..cb9383dd9 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -3757,7 +3757,8 @@ public final class Switchboard extends serverSwitch {
             if ((failreason = Switchboard.this.stackUrl(profile, turl)) == null) successurls.add(turl); else failurls.put(turl, failreason);
             return;
         }
-        final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+        final ArrayList<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+        int maxthreads = 5 * Runtime.getRuntime().availableProcessors();
         for (DigestURL url: rootURLs) {
             final DigestURL turl = url;
             Thread t = new Thread("Switchboard.stackURLs") {
@@ -3769,7 +3770,13 @@ public final class Switchboard extends serverSwitch {
             };
             t.start();
             stackthreads.add(t);
-            try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent that this fires more than 10 connections pre second!
+            if (stackthreads.size() > maxthreads) {
+                Thread w = stackthreads.get(0);
+                while (w.isAlive()) {
+                    try {Thread.sleep(100);} catch (final InterruptedException e) {}
+                }
+                stackthreads.remove(0);
+            }
         }
         final long waitingtime = 10 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
         for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
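
Note (illustrative sketch, not part of the patch): the Switchboard hunk above replaces the fixed 100 ms sleep per start URL with a bound on the number of live stacker threads (5 per available processor), which is what keeps a crawl start with roughly 0.5 million root URLs from stalling before the first URL is stacked. A minimal, self-contained sketch of that throttling pattern follows; the class name, the sample items and the println body are hypothetical stand-ins for the real stackUrl call.

import java.util.ArrayList;
import java.util.List;

// Sketch of the thread-throttling pattern used in Switchboard.stackURLs:
// start one worker per item, but never keep more than maxthreads workers alive;
// when the bound is reached, wait for the oldest worker before starting the next one.
public class BoundedThreadStarterSketch {

    public static void main(String[] args) throws InterruptedException {
        final List<String> items = List.of("https://example.org/a", "https://example.org/b", "https://example.org/c");
        final ArrayList<Thread> workers = new ArrayList<>();
        final int maxthreads = 5 * Runtime.getRuntime().availableProcessors();

        for (final String item : items) {
            Thread t = new Thread("sketch.worker") {
                @Override
                public void run() {
                    System.out.println("processing " + item); // stand-in for Switchboard.this.stackUrl(profile, url)
                }
            };
            t.start();
            workers.add(t);

            // throttle: when too many workers are alive, block until the oldest one has finished
            if (workers.size() > maxthreads) {
                Thread oldest = workers.get(0);
                while (oldest.isAlive()) {
                    Thread.sleep(100);
                }
                workers.remove(0);
            }
        }

        // bounded join, so a very large start list cannot stall the caller indefinitely
        final long waitingtime = 10 + (30000 / Math.max(1, items.size()));
        for (Thread t : workers) t.join(waitingtime);
    }
}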
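
Also illustrative (not part of the patch): the Jetty9HttpServerImpl hunk raises the container's form-size limit with setMaxFormContentSize, because Jetty 9 otherwise rejects POSTed forms above its default limit (about 200 KB), so a multi-megabyte crawl-start URL list would never reach Crawler_p. A standalone sketch of the same setting, with a hypothetical port and an otherwise empty context:

import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;

// Standalone sketch of the Jetty 9 setting added in the patch: raise the maximum
// accepted form content size so that very large crawl-start forms (e.g. a ~10MB
// URL list with approx. 0.5 million start points) are not rejected by the container.
public class MaxFormContentSizeSketch {
    public static void main(String[] args) throws Exception {
        Server server = new Server(8090);                 // hypothetical port
        ServletContextHandler context = new ServletContextHandler();
        context.setContextPath("/");
        context.setMaxFormContentSize(1024 * 1024 * 10);  // allow 10MB form posts
        server.setHandler(context);
        server.start();
        server.join();
    }
}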
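
Finally, also illustrative (not part of the patch): the Crawler_p hunks replace one index delete per root URL with a single bulk remove of all collected URL hashes followed by one commit, and cap the mustmatch self-check at 100 root URLs. A schematic sketch of the batching idea; the Index interface here is a hypothetical stand-in, the real call in the patch is sb.index.fulltext().remove(deleteIDs).

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

// Schematic sketch of the batching change in Crawler_p: instead of one delete call
// per root URL (expensive for ~0.5 million start points), collect the ids first and
// issue a single bulk remove plus a single commit.
public class BulkDeleteSketch {

    /** Hypothetical stand-in for YaCy's fulltext index. */
    interface Index {
        void remove(Collection<String> ids); // bulk delete
        void commit(boolean softCommit);     // flush once at the end
    }

    static void deleteRootUrls(Index index, List<String> rootUrlHashes) {
        List<String> deleteIDs = new ArrayList<>(rootUrlHashes); // gather everything first
        index.remove(deleteIDs);  // one call instead of rootUrlHashes.size() calls
        index.commit(true);       // single commit, as in the patch
    }
}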