Enabled crawl starts with very large sets of start URLs

e.g. a 10MB URL list with approx. 0.5 million start points
pull/419/head
Michael Peter Christen 4 years ago
parent c623a3252e
commit e81b770f79

@ -25,6 +25,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@ -96,7 +97,7 @@ public class Crawler_p {
// initial values for AJAX Elements (without JavaScript)
final serverObjects prop = new serverObjects();
prop.put("rejected", 0);
// check for JSONP
if (post != null && post.containsKey("callback") ) {
final String jsonp = post.get("callback") + "([";
@ -122,18 +123,18 @@ public class Crawler_p {
prop.putNum("citationSegmentCount", segment.citationSegmentCount());
prop.putNum("rwipublictextSize", segment.RWICount());
prop.putNum("rwipublictextSegmentCount", segment.RWISegmentCount());
prop.put("list", "0");
prop.put("loaderSize", 0);
prop.put("loaderMax", 0);
prop.put("list-loader", 0);
int coreCrawlJobSize = sb.crawlQueues.coreCrawlJobSize();
int limitCrawlJobSize = sb.crawlQueues.limitCrawlJobSize();
int remoteTriggeredCrawlJobSize = sb.crawlQueues.remoteTriggeredCrawlJobSize();
int noloadCrawlJobSize = sb.crawlQueues.noloadCrawlJobSize();
int allsize = coreCrawlJobSize + limitCrawlJobSize + remoteTriggeredCrawlJobSize + noloadCrawlJobSize;
prop.put("localCrawlSize", coreCrawlJobSize);
prop.put("localCrawlState", "");
prop.put("limitCrawlSize", limitCrawlJobSize);
@ -148,7 +149,7 @@ public class Crawler_p {
prop.put("info", "0");
boolean debug = (post != null && post.containsKey("debug"));
if (post != null) {
String c = post.toString();
if (c.length() < 1000) ConcurrentLog.info("Crawl Start", c);
@ -165,7 +166,7 @@ public class Crawler_p {
sb.crawler.removePassive(h);
try {sb.crawlQueues.noticeURL.removeByProfileHandle(p.handle(), 10000);} catch (SpaceExceededException e) {}
}
// clear stacks
for (StackType stackType: StackType.values()) sb.crawlQueues.noticeURL.clear(stackType);
try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */}
@ -206,8 +207,8 @@ public class Crawler_p {
prop.put("info-queue", 1);
prop.putHTML("info-queue_message", "pause reason: " + queuemessage);
}
if (post != null && post.containsKey("terminate")) try {
if (post != null && post.containsKey("terminate")) try {
final String handle = post.get("handle", "");
// termination of a crawl: shift the crawl from active to passive
final CrawlProfile p = sb.crawler.getActive(handle.getBytes());
@ -225,13 +226,13 @@ public class Crawler_p {
if (sb.peers == null) {
prop.put("info", "3");
} else {
if(post.getBoolean("cleanSearchCache")) {
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
}
if(post.getBoolean("cleanSearchCache")) {
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
}
// remove crawlingFileContent before we record the call
String crawlingFileName = post.get("crawlingFile");
final File crawlingFile;
@ -244,7 +245,7 @@ public class Crawler_p {
if (crawlingFile != null && crawlingFile.exists()) {
post.remove("crawlingFile$file");
}
// prepare some filter that are adjusted in case that this is wanted
boolean storeHTCache = "on".equals(post.get("storeHTCache", "off"));
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
@ -267,6 +268,7 @@ public class Crawler_p {
Set<DigestURL> rootURLs = new HashSet<DigestURL>();
String crawlName = "";
if (crawlingFile == null) for (String crawlingStart: rootURLs0) {
StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large
if (crawlingStart == null || crawlingStart.length() == 0) continue;
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://",0);
@ -276,14 +278,14 @@ public class Crawler_p {
try {
DigestURL crawlingStartURL = new DigestURL(crawlingStart);
rootURLs.add(crawlingStartURL);
crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ',';
crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(',');
if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
} catch (final MalformedURLException e) {
ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage());
}
crawlName = crawlNameBuilder.toString();
} else {
crawlName = crawlingFile.getName();
crawlName = crawlingFile.getName();
}
if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
if (crawlName.length() > 64) {
@ -296,7 +298,7 @@ public class Crawler_p {
if (fullDomain) {
for (DigestURL u: rootURLs) if (u.isFile()) {fullDomain = false; subPath = true; break;}
}
// delete old robots entries
for (DigestURL ru : rootURLs) {
sb.robots.delete(ru);
@ -307,7 +309,7 @@ public class Crawler_p {
} catch (IOException e) {}
}
try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
// set the crawl filter
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
@ -327,7 +329,7 @@ public class Crawler_p {
env.setConfig("crawlOrder", crawlOrder);
if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work
int newcrawlingdepth = post.getInt("crawlingDepth", 8);
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
@ -355,10 +357,10 @@ public class Crawler_p {
boolean followFrames = "on".equals(post.get("followFrames", "false"));
env.setConfig("followFrames", followFrames);
boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "false"));
env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex);
boolean obeyHtmlRobotsNofollow = "on".equals(post.get("obeyHtmlRobotsNofollow", "false"));
env.setConfig("obeyHtmlRobotsNofollow", obeyHtmlRobotsNofollow);
@ -369,7 +371,7 @@ public class Crawler_p {
env.setConfig("indexMedia", indexMedia);
env.setConfig("storeHTCache", storeHTCache);
String defaultAgentName = sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName;
String agentName = post.get("agentName", defaultAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
@ -379,19 +381,19 @@ public class Crawler_p {
if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;
String crawlingMode = post.get("crawlingMode","url");
if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) {
newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
directDocByURL = false;
}
if ("sitemap".equals(crawlingMode)) {
newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING;
newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
newcrawlingdepth = 0;
directDocByURL = false;
}
if ("sitelist".equals(crawlingMode)) {
newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
Set<DigestURL> newRootURLs = new HashSet<DigestURL>();
@ -415,19 +417,21 @@ public class Crawler_p {
// delete all error urls for that domain
// and all urls for that host from the crawl queue
List<String> deleteIDs = new ArrayList<>();
Set<String> hosthashes = new HashSet<String>();
boolean anysmbftporpdf = false;
for (DigestURL u : rootURLs) {
sb.index.fulltext().remove(u.hash());
deleteIDs.add(new String(u.hash()));
hosthashes.add(u.hosthash());
if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
}
sb.index.fulltext().remove(deleteIDs);
sb.crawlQueues.removeHosts(hosthashes);
sb.index.fulltext().commit(true);
boolean crawlingQ = anysmbftporpdf || "on".equals(post.get("crawlingQ", "off")) || "sitemap".equals(crawlingMode);
env.setConfig("crawlingQ", crawlingQ);
// compute mustmatch filter according to rootURLs
if ((fullDomain || subPath) && newcrawlingdepth > 0) {
String siteFilter = ".*";
@ -454,19 +458,21 @@ public class Crawler_p {
newcrawlingMustMatch = "(" + newcrawlingMustMatch + ")|(" + siteFilter + ")";
}
}
// check if the crawl filter works correctly
try {
Pattern mmp = Pattern.compile(newcrawlingMustMatch);
int maxcheck = 100;
for (DigestURL u: rootURLs) {
assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true);
if (maxcheck-- <= 0) break;
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
}
boolean hasCrawlstartDataOK = !crawlName.isEmpty();
if (hasCrawlstartDataOK) {
// check crawlurl was given in sitecrawl
@ -474,25 +480,25 @@ public class Crawler_p {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", "(no url given)");
prop.putHTML("info_reasonString", "you must submit at least one crawl url");
hasCrawlstartDataOK = false;
hasCrawlstartDataOK = false;
}
}
String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1");
int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
String ignoreclassname_s = post.get("ignoreclassname");
Set<String> ignoreclassname = new HashSet<>();
if (ignoreclassname_s != null) {
String[] ignoreclassname_a = ignoreclassname_s.trim().split(",");
for (int i = 0; i < ignoreclassname_a.length; i++) {
ignoreclassname.add(ignoreclassname_a[i].trim());
}
String[] ignoreclassname_a = ignoreclassname_s.trim().split(",");
for (int i = 0; i < ignoreclassname_a.length; i++) {
ignoreclassname.add(ignoreclassname_a[i].trim());
}
}
// get vocabulary scraper info
JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
for (String key: post.keySet()) {
@ -518,9 +524,9 @@ public class Crawler_p {
}
}
}
int timezoneOffset = post.getInt("timezoneOffset", 0);
// in case that we crawl from a file, load that file and re-compute mustmatch pattern
List<AnchorURL> hyperlinks_from_file = null;
if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) {
@ -528,13 +534,13 @@ public class Crawler_p {
try {
if (newcrawlingdepth > 0) {
if (fullDomain) {
/* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
* Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */
/* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
* Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */
hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
} else if (subPath) {
/* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
* Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */
/* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
* Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */
hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
}
@ -548,53 +554,50 @@ public class Crawler_p {
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
/* If a solr query filter is defined, verify now its syntax and that the embedded Solr schema is available */
final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim();
final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim();
if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) {
final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance();
final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
prop.put("noEmbeddedSolr", !embeddedSolrConnected);
if (embeddedSolrConnected) {
if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) {
try {
SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore);
} catch(final SyntaxError | SolrException e) {
hasCrawlstartDataOK = false;
prop.put("info", "10");
prop.put("info_solrQuery", solrQueryMustMatch);
} catch(final RuntimeException e) {
hasCrawlstartDataOK = false;
prop.put("info", "11");
prop.put("info_solrQuery", solrQueryMustMatch);
}
}
if(!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) {
try {
SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore);
} catch(final SyntaxError | SolrException e) {
hasCrawlstartDataOK = false;
prop.put("info", "10");
prop.put("info_solrQuery", solrQueryMustNotMatch);
} catch(final RuntimeException e) {
hasCrawlstartDataOK = false;
prop.put("info", "11");
prop.put("info_solrQuery", solrQueryMustNotMatch);
}
}
} else {
hasCrawlstartDataOK = false;
prop.put("info", "9");
}
}
final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim();
final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim();
if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) {
final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance();
final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
prop.put("noEmbeddedSolr", !embeddedSolrConnected);
if (embeddedSolrConnected) {
if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) {
try {
SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore);
} catch(final SyntaxError | SolrException e) {
hasCrawlstartDataOK = false;
prop.put("info", "10");
prop.put("info_solrQuery", solrQueryMustMatch);
} catch(final RuntimeException e) {
hasCrawlstartDataOK = false;
prop.put("info", "11");
prop.put("info_solrQuery", solrQueryMustMatch);
}
}
if(!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) {
try {
SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore);
} catch(final SyntaxError | SolrException e) {
hasCrawlstartDataOK = false;
prop.put("info", "10");
prop.put("info_solrQuery", solrQueryMustNotMatch);
} catch(final RuntimeException e) {
hasCrawlstartDataOK = false;
prop.put("info", "11");
prop.put("info_solrQuery", solrQueryMustNotMatch);
}
}
} else {
hasCrawlstartDataOK = false;
prop.put("info", "9");
}
}
// prepare a new crawling profile
final CrawlProfile profile;
byte[] handle;
@ -632,20 +635,19 @@ public class Crawler_p {
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post
.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
post.getBoolean("crawlerAlwaysCheckMediaType"));
profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post
.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
post.getBoolean("crawlerAlwaysCheckMediaType"));
handle = ASCII.getBytes(profile.handle());
// before we fire up a new crawl, we make sure that another crawl with the same name is not running
@ -658,14 +660,12 @@ public class Crawler_p {
profile = null;
handle = null;
}
// start the crawl
if(hasCrawlstartDataOK) {
final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false);
if ("url".equals(crawlingMode)) {
if (hasCrawlstartDataOK) {
final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false);
if ("url".equals(crawlingMode)) {
// stack requests
sb.crawler.putActive(handle, profile);
final Set<DigestURL> successurls = new HashSet<DigestURL>();
@ -703,65 +703,64 @@ public class Crawler_p {
sb.crawlQueues.errorURL.push(failure.getKey(), 0, null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1);
fr.append(failure.getValue()).append('/');
}
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", fr.toString());
}
if (successurls.size() > 0) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
}
} else if ("sitemap".equals(crawlingMode)) {
try {
final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway
sb.crawler.putActive(handle, profile);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
importer.start();
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
ConcurrentLog.logException(e);
}
} else if ("sitemap".equals(crawlingMode)) {
try {
final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway
sb.crawler.putActive(handle, profile);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
importer.start();
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
ConcurrentLog.logException(e);
}
} else if ("file".equals(crawlingMode)) {
if (post.containsKey("crawlingFile") && crawlingFile != null) {
try {
if(newcrawlingdepth > 0 && (fullDomain || subPath)) {
/* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */
if(hyperlinks_from_file != null) {
sb.crawler.putActive(handle, profile);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
}
} else {
/* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
crawlStarterTask.start();
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "7"); // Error with file
prop.putHTML("info_crawlingStart", crawlingFileName);
prop.putHTML("info_error", e.getMessage());
ConcurrentLog.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
}
}
} else if ("file".equals(crawlingMode)) {
if (post.containsKey("crawlingFile") && crawlingFile != null) {
try {
if(newcrawlingdepth > 0 && (fullDomain || subPath)) {
/* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */
if(hyperlinks_from_file != null) {
sb.crawler.putActive(handle, profile);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
}
} else {
/* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
crawlStarterTask.start();
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "7"); // Error with file
prop.putHTML("info_crawlingStart", crawlingFileName);
prop.putHTML("info_error", e.getMessage());
ConcurrentLog.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
}
}
}
}
}
@ -783,7 +782,7 @@ public class Crawler_p {
} catch (final NumberFormatException e) {}
if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000;
int wPPM = wantedPPM;
if ( wPPM <= 0 ) {
wPPM = 1;
@ -793,9 +792,9 @@ public class Crawler_p {
}
final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60
final float loadprereq = wantedPPM <= 10 ? 1.0f : wantedPPM <= 100 ? 2.0f : wantedPPM >= 1000 ? 8.0f : 3.0f;
BusyThread thread;
thread = sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
if ( thread != null ) {
sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, thread.setBusySleep(newBusySleep));
@ -826,7 +825,7 @@ public class Crawler_p {
String hosts = "";
for (final byte[] h: sb.crawler.getActive()) {
profile = sb.crawler.getActive(h);
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue;
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue;
profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength);
prop.put("crawlProfilesShow_list_" + count + "_debug", debug ? 1 : 0);
if (debug) {
@ -877,47 +876,47 @@ public class Crawler_p {
* @throws IOException
* @throws FileNotFoundException
*/
private static List<AnchorURL> crawlingFileStart(final File crawlingFile, int timezoneOffset,
final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Let's report here detailed error to help user when he selected a wrong file */
if(!crawlingFile.exists()) {
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists");
}
if(!crawlingFile.isFile()) {
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
}
if(!crawlingFile.canRead()) {
throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
}
}
if (crawlingFile != null) {
FileInputStream inStream = null;
try {
inStream = new FileInputStream(crawlingFile);
FileUtils.copy(inStream, writer);
} finally {
if(inStream != null) {
try {
inStream.close();
} catch(IOException ignoredException) {
ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath());
}
}
}
} else {
FileUtils.copy(crawlingFileContent, writer);
}
writer.close();
// get links and generate filter
hyperlinks_from_file = scraper.getAnchors();
return hyperlinks_from_file;
}
private static List<AnchorURL> crawlingFileStart(final File crawlingFile, int timezoneOffset,
        final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
    // Parse a crawl start file (or its already posted in-memory content) and return the anchors found in it.
    // The TransformerWriter feeds everything written to it into the scraper, which collects the links.
    final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
    final Writer writer = new TransformerWriter(null, null, scraper, false);
    try {
        if ((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
            /* Let's report here detailed error to help user when he selected a wrong file */
            if (!crawlingFile.exists()) {
                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exist");
            }
            if (!crawlingFile.isFile()) {
                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
            }
            if (!crawlingFile.canRead()) {
                throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
            }
        }
        // NOTE(review): when a file is given it is preferred over crawlingFileContent — confirm intended
        if (crawlingFile != null) {
            FileInputStream inStream = null;
            try {
                inStream = new FileInputStream(crawlingFile);
                FileUtils.copy(inStream, writer);
            } finally {
                // best-effort close; a failure here is only logged, not propagated
                if (inStream != null) {
                    try {
                        inStream.close();
                    } catch (IOException ignoredException) {
                        ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath());
                    }
                }
            }
        } else {
            // no file on disk: parse the content that was posted with the form directly
            FileUtils.copy(crawlingFileContent, writer);
        }
    } finally {
        // fix: release the scraper pipeline even when the checks or the copy above throw
        writer.close();
    }
    // get links and generate filter
    return scraper.getAnchors();
}
private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
if (!recrawlIfOlderCheck) return null;

@ -85,7 +85,9 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer {
connector.setName("httpd:"+Integer.toString(port));
connector.setIdleTimeout(9000); // timeout in ms when no bytes sent / received
connector.setAcceptQueueSize(128);
server.addConnector(connector);
// add ssl/https connector
boolean useSSL = sb.getConfigBool("server.https", false);
@ -202,6 +204,7 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer {
context.setServer(server);
context.setContextPath("/");
context.setHandler(handlers);
context.setMaxFormContentSize(1024 * 1024 * 10); // allow 10MB, large forms may be required during crawl starts with long lists
// make YaCy handlers (in context) and servlet context handlers available (both contain root context "/")
// logic: 1. YaCy handlers are called if request not handled (e.g. proxy) then servlets handle it

@ -3757,7 +3757,8 @@ public final class Switchboard extends serverSwitch {
if ((failreason = Switchboard.this.stackUrl(profile, turl)) == null) successurls.add(turl); else failurls.put(turl, failreason);
return;
}
final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
final ArrayList<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
int maxthreads = 5 * Runtime.getRuntime().availableProcessors();
for (DigestURL url: rootURLs) {
final DigestURL turl = url;
Thread t = new Thread("Switchboard.stackURLs") {
@ -3769,7 +3770,13 @@ public final class Switchboard extends serverSwitch {
};
t.start();
stackthreads.add(t);
try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent that this fires more than 10 connections per second!
if (stackthreads.size() > maxthreads) {
Thread w = stackthreads.get(0);
while (w.isAlive()) {
try {Thread.sleep(100);} catch (final InterruptedException e) {}
}
stackthreads.remove(0);
}
}
final long waitingtime = 10 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}

Loading…
Cancel
Save