some dns-timing changes:

Since the httpclient uses the domain cache, it is useful not to clear
the domain cache while a crawl is running (domains are filled into this
cache during crawling); it is now cleared only once the crawl queues
are empty.
On huge crawl starts (e.g. from a file) my DNS could not keep up with
the high request rate, so I reduced the stacking rate and allow a
longer lookup timeout.
branch: pull/1/head
author: sixcooler (12 years ago)
parent: 15b1bb2513
commit: 0cae420d8e

@@ -74,8 +74,8 @@ public class Domains {
     private static final String PRESENT = "";
     private static final Pattern LOCAL_PATTERNS = Pattern.compile("(10\\..*)|(127\\..*)|(172\\.(1[6-9]|2[0-9]|3[0-1])\\..*)|(169\\.254\\..*)|(192\\.168\\..*)|(localhost)|(\\[?\\:\\:1/.*)|(\\[?fc.*)|(\\[?fd.*)|(\\[?(fe80|0)\\:0\\:0\\:0\\:0\\:0\\:0\\:1.*)");
-    private static final int MAX_NAME_CACHE_HIT_SIZE = 100000;
-    private static final int MAX_NAME_CACHE_MISS_SIZE = 100000;
+    private static final int MAX_NAME_CACHE_HIT_SIZE = 10000;
+    private static final int MAX_NAME_CACHE_MISS_SIZE = 1000;
     private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() * 2;

     // a dns cache
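
The two constants above bound the DNS name caches (resolved hosts and failed lookups). As a rough illustration of what such a size-bounded cache looks like, here is a minimal sketch using a LinkedHashMap in access order; the class name and structure are assumptions for illustration, not the cache implementation the Domains class actually uses:

    import java.net.InetAddress;
    import java.util.LinkedHashMap;
    import java.util.Map;

    // Sketch of a size-bounded DNS hit cache that evicts the least-recently-used
    // entry once the limit is reached. The limit mirrors MAX_NAME_CACHE_HIT_SIZE
    // above; the class itself is illustrative, not YaCy's actual cache.
    public class BoundedNameCache {
        private static final int MAX_SIZE = 10000;

        private final Map<String, InetAddress> cache =
            new LinkedHashMap<String, InetAddress>(1024, 0.75f, true) {
                @Override
                protected boolean removeEldestEntry(final Map.Entry<String, InetAddress> eldest) {
                    return size() > MAX_SIZE; // drop the oldest entry beyond the limit
                }
            };

        public synchronized void put(final String host, final InetAddress ip) {
            this.cache.put(host, ip);
        }

        public synchronized InetAddress get(final String host) {
            return this.cache.get(host);
        }
    }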
@@ -782,7 +782,7 @@ public class Domains {
                 public InetAddress call() throws Exception {
                     return InetAddress.getByName(host);
                 }
-            }, 1000L, TimeUnit.MILLISECONDS, false);
+            }, 3000L, TimeUnit.MILLISECONDS, false);
             //ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
         }
         //.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms");
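
The change above only raises the DNS lookup timeout from 1000 ms to 3000 ms; the Callable is handed to a concurrent request helper that enforces the timeout. For readers unfamiliar with the pattern, here is a minimal self-contained sketch using only java.util.concurrent (the class and method names are illustrative, not YaCy's API):

    import java.net.InetAddress;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    // Sketch: run the blocking DNS lookup on a worker thread and give up after a
    // configurable timeout (3 s after this commit, instead of 1 s before).
    public class DnsLookupWithTimeout {
        private static final ExecutorService EXECUTOR = Executors.newCachedThreadPool();

        public static InetAddress resolve(final String host, final long timeoutMillis) {
            final Future<InetAddress> future = EXECUTOR.submit(new Callable<InetAddress>() {
                @Override
                public InetAddress call() throws Exception {
                    return InetAddress.getByName(host); // the blocking DNS request
                }
            });
            try {
                return future.get(timeoutMillis, TimeUnit.MILLISECONDS);
            } catch (final TimeoutException e) {
                future.cancel(true); // interrupt the lookup thread
                return null;         // caller treats null as "not resolved in time"
            } catch (final Exception e) {
                return null;
            }
        }

        public static void main(final String[] args) {
            System.out.println(resolve("example.org", 3000L));
            EXECUTOR.shutdown();
        }
    }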

@@ -2009,7 +2009,7 @@ public final class Switchboard extends serverSwitch {
             // clear caches
             if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
             Word.clearCache();
-            Domains.clear();
+            // Domains.clear();

             // clean up image stack
             ResultImages.clearQueues();
@@ -2274,7 +2274,10 @@ public final class Switchboard extends serverSwitch {
             // if no crawl is running and processing is activated:
             // execute the (post-) processing steps for all entries that have a process tag assigned
             if (this.crawlQueues.coreCrawlJobSize() == 0) {
-                if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches
+                if (this.crawlQueues.noticeURL.isEmpty()) {
+                    Domains.clear();
+                    this.crawlQueues.noticeURL.clear(); // flushes more caches
+                }
                 postprocessingRunning = true;
                 int proccount = 0;
                 proccount += index.fulltext().getDefaultConfiguration().postprocessing(index);
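
Taken together, the two Switchboard hunks above move the Domains.clear() call out of the periodic cleanup job and into the branch that only runs once no crawl job is left in the queues. A minimal sketch of that guard pattern follows; CrawlQueue and DomainCache are hypothetical stand-ins for YaCy's crawlQueues/noticeURL and the Domains cache:

    // Sketch of the "clear caches only when idle" pattern from the hunks above.
    // CrawlQueue and DomainCache are hypothetical stand-ins, not YaCy classes.
    public class CleanupSketch {

        interface CrawlQueue {
            int coreCrawlJobSize(); // number of local crawl jobs still queued
            boolean isEmpty();
            void clear();           // also flushes internal stacks/caches
        }

        interface DomainCache {
            void clear();
        }

        private final CrawlQueue crawlQueue;
        private final DomainCache domainCache;

        public CleanupSketch(final CrawlQueue crawlQueue, final DomainCache domainCache) {
            this.crawlQueue = crawlQueue;
            this.domainCache = domainCache;
        }

        // periodic cleanup: the domain cache is deliberately left alone here,
        // because the http client still benefits from cached resolutions while
        // a crawl is being stacked or executed
        public void cleanupJob() {
            // ... other cache housekeeping ...
        }

        // post-processing: only entered when no crawl is running, so it is now
        // a safe place to drop cached DNS entries
        public void postprocessing() {
            if (this.crawlQueue.coreCrawlJobSize() == 0 && this.crawlQueue.isEmpty()) {
                this.domainCache.clear();
                this.crawlQueue.clear(); // flushes more caches
            }
        }
    }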
@@ -2827,7 +2830,7 @@ public final class Switchboard extends serverSwitch {
     public void stackURLs(Set<DigestURL> rootURLs, final CrawlProfile profile, final Set<DigestURL> successurls, final Map<DigestURL,String> failurls) {
         if (rootURLs == null || rootURLs.size() == 0) return;
-        List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+        final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
         for (DigestURL url: rootURLs) {
             final DigestURL turl = url;
             Thread t = new Thread() {
@@ -2838,9 +2841,9 @@ public final class Switchboard extends serverSwitch {
             };
             t.start();
             stackthreads.add(t);
-            try {Thread.sleep(10);} catch (final InterruptedException e) {} // to prevent that this fires more than 100 connections pre second!
+            try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent that this fires more than 10 connections pre second!
         }
-        long waitingtime = 1 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
+        final long waitingtime = 10 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
         for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
     }
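
The stackURLs change throttles the crawl-start threads to at most about ten per second (one new thread every 100 ms) and bounds the wait for them to roughly half a minute overall. Here is a minimal sketch of that throttle-and-join pattern; stackUrl() is a placeholder for the actual stacking work, not a YaCy method:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Set;

    // Sketch of the throttled, bounded-wait stacking loop from the hunk above.
    public class ThrottledStacker {

        static void stackUrl(final String url) {
            // placeholder: resolve the host, check robots.txt, enqueue the URL, ...
        }

        public static void stackURLs(final Set<String> rootURLs) {
            if (rootURLs == null || rootURLs.isEmpty()) return;
            final List<Thread> stackthreads = new ArrayList<Thread>();
            for (final String url : rootURLs) {
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        stackUrl(url);
                    }
                };
                t.start();
                stackthreads.add(t);
                // throttle: at most ~10 new stacking threads (and DNS lookups) per second
                try { Thread.sleep(100); } catch (final InterruptedException e) {}
            }
            // wait roughly 30 s in total so a huge start list cannot stall the caller
            final long waitingtime = 10 + (30000 / rootURLs.size());
            for (final Thread t : stackthreads) {
                try { t.join(waitingtime); } catch (final InterruptedException e) {}
            }
        }
    }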
