some dns-timing changes:

since httpclient uses the domain cache, it is useful not to clear the
domain cache while crawling is running (domains are filled into this
cache during a crawl).
On huge crawl starts (e.g. from a file) my DNS could not keep up with
the high request rate, so I reduced the rate and gave the lookups a
longer time-out.
pull/1/head
sixcooler 12 years ago
parent 15b1bb2513
commit 0cae420d8e

@@ -74,8 +74,8 @@ public class Domains {
 private static final String PRESENT = "";
 private static final Pattern LOCAL_PATTERNS = Pattern.compile("(10\\..*)|(127\\..*)|(172\\.(1[6-9]|2[0-9]|3[0-1])\\..*)|(169\\.254\\..*)|(192\\.168\\..*)|(localhost)|(\\[?\\:\\:1/.*)|(\\[?fc.*)|(\\[?fd.*)|(\\[?(fe80|0)\\:0\\:0\\:0\\:0\\:0\\:0\\:1.*)");
-private static final int MAX_NAME_CACHE_HIT_SIZE = 100000;
-private static final int MAX_NAME_CACHE_MISS_SIZE = 100000;
+private static final int MAX_NAME_CACHE_HIT_SIZE = 10000;
+private static final int MAX_NAME_CACHE_MISS_SIZE = 1000;
 private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() * 2;
 // a dns cache
@@ -782,7 +782,7 @@ public class Domains {
 public InetAddress call() throws Exception {
 return InetAddress.getByName(host);
 }
-}, 1000L, TimeUnit.MILLISECONDS, false);
+}, 3000L, TimeUnit.MILLISECONDS, false);
 //ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
 }
 //System.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms");
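The timeout change above applies to YaCy's own timeout wrapper around InetAddress.getByName. As a minimal sketch of the same pattern, assuming a plain ExecutorService and Future.get with a deadline instead of the project's helper (the class and method names here are illustrative only):

import java.net.InetAddress;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class DnsLookupSketch {

    private static final ExecutorService POOL = Executors.newCachedThreadPool();

    // Resolve a host name, giving up after timeoutMillis (3000 ms matches the
    // more generous value introduced by this commit); returns null on failure.
    public static InetAddress lookup(final String host, final long timeoutMillis) {
        final Future<InetAddress> f = POOL.submit(new Callable<InetAddress>() {
            @Override
            public InetAddress call() throws Exception {
                return InetAddress.getByName(host); // the blocking DNS request
            }
        });
        try {
            return f.get(timeoutMillis, TimeUnit.MILLISECONDS);
        } catch (final TimeoutException | InterruptedException | ExecutionException e) {
            f.cancel(true); // stop waiting on a slow or failing resolver
            return null;
        }
    }
}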

@@ -2009,7 +2009,7 @@ public final class Switchboard extends serverSwitch {
 // clear caches
 if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
 Word.clearCache();
-Domains.clear();
+// Domains.clear();
 // clean up image stack
 ResultImages.clearQueues();
@@ -2274,7 +2274,10 @@ public final class Switchboard extends serverSwitch {
 // if no crawl is running and processing is activated:
 // execute the (post-) processing steps for all entries that have a process tag assigned
 if (this.crawlQueues.coreCrawlJobSize() == 0) {
-if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches
+if (this.crawlQueues.noticeURL.isEmpty()) {
+    Domains.clear();
+    this.crawlQueues.noticeURL.clear(); // flushes more caches
+}
 postprocessingRunning = true;
 int proccount = 0;
 proccount += index.fulltext().getDefaultConfiguration().postprocessing(index);
@@ -2827,7 +2830,7 @@ public final class Switchboard extends serverSwitch {
 public void stackURLs(Set<DigestURL> rootURLs, final CrawlProfile profile, final Set<DigestURL> successurls, final Map<DigestURL,String> failurls) {
 if (rootURLs == null || rootURLs.size() == 0) return;
-List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
 for (DigestURL url: rootURLs) {
 final DigestURL turl = url;
 Thread t = new Thread() {
@@ -2838,9 +2841,9 @@ public final class Switchboard extends serverSwitch {
 };
 t.start();
 stackthreads.add(t);
-try {Thread.sleep(10);} catch (final InterruptedException e) {} // to prevent that this fires more than 100 connections pre second!
+try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent that this fires more than 10 connections pre second!
 }
-long waitingtime = 1 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
+final long waitingtime = 10 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
 for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
 }
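To show the throttling idea from the last two hunks in one piece, here is a minimal, self-contained sketch of a rate-limited crawl start: roughly ten stacking threads per second, and a total join budget of about half a minute spread over all threads. The String URLs and the println are stand-ins for YaCy's DigestURL stacking, not the actual Switchboard code.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

public class CrawlStartSketch {

    public static void stackURLs(final Set<String> rootURLs) {
        if (rootURLs == null || rootURLs.isEmpty()) return;
        final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
        for (final String url : rootURLs) {
            final Thread t = new Thread() {
                @Override
                public void run() {
                    // stand-in for stacking the URL, which triggers a DNS lookup for its host
                    System.out.println("stacking " + url);
                }
            };
            t.start();
            stackthreads.add(t);
            // throttle to about 10 new stacking threads per second so the local
            // DNS resolver is not flooded at crawl start
            try { Thread.sleep(100); } catch (final InterruptedException e) {}
        }
        // wait at most about half a minute in total, spread over all threads
        final long waitingtime = 10 + (30000 / rootURLs.size());
        for (final Thread t : stackthreads) {
            try { t.join(waitingtime); } catch (final InterruptedException e) {}
        }
    }
}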
