enabled crawl starts with very large sets of start urls

e.g. a 10 MB URL list with approx. 0.5 million start points
pull/419/head
Michael Peter Christen 4 years ago
parent c623a3252e
commit e81b770f79

@@ -25,6 +25,7 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.Writer;
 import java.net.MalformedURLException;
+import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -267,6 +268,7 @@ public class Crawler_p {
 Set<DigestURL> rootURLs = new HashSet<DigestURL>();
 String crawlName = "";
 if (crawlingFile == null) for (String crawlingStart: rootURLs0) {
+    StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large
     if (crawlingStart == null || crawlingStart.length() == 0) continue;
     // add the prefix http:// if necessary
     int pos = crawlingStart.indexOf("://",0);
@@ -276,12 +278,12 @@ public class Crawler_p {
     try {
         DigestURL crawlingStartURL = new DigestURL(crawlingStart);
         rootURLs.add(crawlingStartURL);
-        crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ',';
+        crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(',');
         if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
     } catch (final MalformedURLException e) {
         ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage());
     }
+    crawlName = crawlNameBuilder.toString();
 } else {
     crawlName = crawlingFile.getName();
 }
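Note on the hunk above: building the crawl name with repeated String concatenation (crawlName += host + ',') re-copies the whole accumulated string on every iteration, which becomes expensive when roughly half a million start URLs each contribute a host; a StringBuilder keeps the accumulation linear. A minimal standalone sketch of that pattern, assuming a plain list of host names (the trailing-comma trim is illustrative, not taken from the commit):

import java.util.List;

public class CrawlNameSketch {

    // Build a comma-separated crawl name from many hosts.
    // String += inside the loop would be quadratic in the total length;
    // a StringBuilder keeps it linear.
    static String buildCrawlName(List<String> hosts) {
        StringBuilder crawlNameBuilder = new StringBuilder(hosts.size() * 16);
        for (String host : hosts) {
            if (host == null || host.isEmpty()) continue;
            crawlNameBuilder.append(host).append(',');
        }
        // drop the trailing comma, if any
        int len = crawlNameBuilder.length();
        if (len > 0 && crawlNameBuilder.charAt(len - 1) == ',') crawlNameBuilder.setLength(len - 1);
        return crawlNameBuilder.toString();
    }

    public static void main(String[] args) {
        System.out.println(buildCrawlName(List.of("example.org", "example.net")));
        // prints: example.org,example.net
    }
}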
@@ -415,13 +417,15 @@
 // delete all error urls for that domain
 // and all urls for that host from the crawl queue
+List<String> deleteIDs = new ArrayList<>();
 Set<String> hosthashes = new HashSet<String>();
 boolean anysmbftporpdf = false;
 for (DigestURL u : rootURLs) {
-    sb.index.fulltext().remove(u.hash());
+    deleteIDs.add(new String(u.hash()));
     hosthashes.add(u.hosthash());
     if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
 }
+sb.index.fulltext().remove(deleteIDs);
 sb.crawlQueues.removeHosts(hosthashes);
 sb.index.fulltext().commit(true);
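Note on the deletion hunk: instead of calling remove() on the fulltext index once per start URL, the url hashes are first collected into deleteIDs and removed with a single bulk call, so half a million start points no longer turn into half a million individual index operations. A rough sketch of the collect-then-batch idea, with a hypothetical Fulltext interface standing in for YaCy's sb.index.fulltext() API:

import java.util.ArrayList;
import java.util.List;

public class BatchDeleteSketch {

    // Hypothetical stand-in for YaCy's fulltext index API.
    interface Fulltext {
        void remove(List<String> ids); // one bulk delete instead of many single removes
        void commit(boolean softCommit);
    }

    // Collect all url hashes first, then issue a single batched remove.
    static void deleteAll(Fulltext index, List<byte[]> urlHashes) {
        List<String> deleteIDs = new ArrayList<>(urlHashes.size());
        for (byte[] hash : urlHashes) {
            deleteIDs.add(new String(hash));
        }
        index.remove(deleteIDs);
        index.commit(true);
    }

    public static void main(String[] args) {
        Fulltext dummy = new Fulltext() {
            public void remove(List<String> ids) { System.out.println("removing " + ids.size() + " ids in one call"); }
            public void commit(boolean softCommit) { System.out.println("commit(soft=" + softCommit + ")"); }
        };
        deleteAll(dummy, List.of("abc123".getBytes(), "def456".getBytes()));
    }
}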
@@ -458,8 +462,10 @@
 // check if the crawl filter works correctly
 try {
     Pattern mmp = Pattern.compile(newcrawlingMustMatch);
+    int maxcheck = 100;
     for (DigestURL u: rootURLs) {
         assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true);
+        if (maxcheck-- <= 0) break;
     }
 } catch (final PatternSyntaxException e) {
     prop.put("info", "4"); // crawlfilter does not match url
@@ -590,9 +596,6 @@
 hasCrawlstartDataOK = false;
 prop.put("info", "9");
 }
 }
 // prepare a new crawling profile
@@ -645,7 +648,6 @@
 profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
         post.getBoolean("crawlerAlwaysCheckMediaType"));
 handle = ASCII.getBytes(profile.handle());
 // before we fire up a new crawl, we make sure that another crawl with the same name is not running
@@ -659,10 +661,8 @@
 handle = null;
 }
 // start the crawl
-if(hasCrawlstartDataOK) {
+if (hasCrawlstartDataOK) {
     final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false);
     if ("url".equals(crawlingMode)) {
@@ -710,7 +710,6 @@
 }
 if (successurls.size() > 0) {
     sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
     prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
 }
 } else if ("sitemap".equals(crawlingMode)) {

@@ -85,8 +85,10 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer {
 connector.setName("httpd:"+Integer.toString(port));
 connector.setIdleTimeout(9000); // timout in ms when no bytes send / received
 connector.setAcceptQueueSize(128);
 server.addConnector(connector);
 // add ssl/https connector
 boolean useSSL = sb.getConfigBool("server.https", false);
@@ -202,6 +204,7 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer {
 context.setServer(server);
 context.setContextPath("/");
 context.setHandler(handlers);
+context.setMaxFormContentSize(1024 * 1024 * 10); // allow 10MB, large forms may be required during crawl starts with long lists
 // make YaCy handlers (in context) and servlet context handlers available (both contain root context "/")
 // logic: 1. YaCy handlers are called if request not handled (e.g. proxy) then servlets handle it
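Note on the Jetty change: by default Jetty rejects form POSTs larger than roughly 200 KB, so a crawl start form carrying a multi-megabyte URL list would fail before it ever reached Crawler_p; setMaxFormContentSize on the context raises that ceiling to 10 MB. A minimal standalone Jetty 9 sketch of the same setting (server setup and port are illustrative, not YaCy's actual configuration):

import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.handler.ContextHandler;

public class JettyFormSizeSketch {
    public static void main(String[] args) throws Exception {
        Server server = new Server(8090);                 // port is arbitrary for this sketch
        ContextHandler context = new ContextHandler("/");
        // Jetty's default form limit (~200 KB) is too small for a request
        // that carries a long start-URL list; allow up to 10 MB.
        context.setMaxFormContentSize(1024 * 1024 * 10);
        server.setHandler(context);
        server.start();
        server.join();
    }
}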

@@ -3757,7 +3757,8 @@ public final class Switchboard extends serverSwitch {
 if ((failreason = Switchboard.this.stackUrl(profile, turl)) == null) successurls.add(turl); else failurls.put(turl, failreason);
 return;
 }
-final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+final ArrayList<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+int maxthreads = 5 * Runtime.getRuntime().availableProcessors();
 for (DigestURL url: rootURLs) {
     final DigestURL turl = url;
     Thread t = new Thread("Switchboard.stackURLs") {
@@ -3769,7 +3770,13 @@ public final class Switchboard extends serverSwitch {
     };
     t.start();
     stackthreads.add(t);
-    try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent that this fires more than 10 connections pre second!
+    if (stackthreads.size() > maxthreads) {
+        Thread w = stackthreads.get(0);
+        while (w.isAlive()) {
+            try {Thread.sleep(100);} catch (final InterruptedException e) {}
+        }
+        stackthreads.remove(0);
+    }
 }
 final long waitingtime = 10 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
 for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
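Note on the stacking loop: the old code slept 100 ms after starting every thread, which limits crawl-start stacking to roughly ten URLs per second and is hopeless for half a million start points. The new code keeps up to 5 × availableProcessors() stacking threads alive and, once that window is full, waits for the oldest one to finish before starting the next. A self-contained sketch of that bounded-window pattern (the per-URL work is a placeholder):

import java.util.ArrayList;
import java.util.List;

public class BoundedStackingSketch {
    public static void main(String[] args) {
        List<String> rootURLs = List.of("http://example.org/", "http://example.net/", "http://example.com/");

        final ArrayList<Thread> stackthreads = new ArrayList<>();
        int maxthreads = 5 * Runtime.getRuntime().availableProcessors();

        for (String url : rootURLs) {
            final String turl = url;
            Thread t = new Thread("stackURLs-sketch") {
                @Override
                public void run() {
                    System.out.println("stacking " + turl); // placeholder for the real per-URL work
                }
            };
            t.start();
            stackthreads.add(t);
            // once more than maxthreads are in flight, block until the oldest one has finished
            if (stackthreads.size() > maxthreads) {
                Thread w = stackthreads.get(0);
                while (w.isAlive()) {
                    try { Thread.sleep(100); } catch (final InterruptedException e) {}
                }
                stackthreads.remove(0);
            }
        }

        // give the remaining threads a bounded time to finish, as in the patched code
        final long waitingtime = 10 + (30000 / rootURLs.size());
        for (Thread t : stackthreads) {
            try { t.join(waitingtime); } catch (final InterruptedException e) {}
        }
    }
}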
