From 0cae420d8ea189885d574be588489983fe0e47a9 Mon Sep 17 00:00:00 2001 From: sixcooler Date: Wed, 25 Sep 2013 15:01:28 +0200 Subject: [PATCH] some DNS-timing changes: since httpclient uses the domain cache, it is useful not to clear the domain cache while crawling is running (domains are filled into this cache). On huge crawl starts (e.g. from a file) my DNS could not keep up with the high request rates, so I reduced the rate and allowed some more time(-out). --- source/net/yacy/cora/protocol/Domains.java | 6 +++--- source/net/yacy/search/Switchboard.java | 13 ++++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index c1c6b6d18..70a960f5b 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -74,8 +74,8 @@ public class Domains { private static final String PRESENT = ""; private static final Pattern LOCAL_PATTERNS = Pattern.compile("(10\\..*)|(127\\..*)|(172\\.(1[6-9]|2[0-9]|3[0-1])\\..*)|(169\\.254\\..*)|(192\\.168\\..*)|(localhost)|(\\[?\\:\\:1/.*)|(\\[?fc.*)|(\\[?fd.*)|(\\[?(fe80|0)\\:0\\:0\\:0\\:0\\:0\\:0\\:1.*)"); - private static final int MAX_NAME_CACHE_HIT_SIZE = 100000; - private static final int MAX_NAME_CACHE_MISS_SIZE = 100000; + private static final int MAX_NAME_CACHE_HIT_SIZE = 10000; + private static final int MAX_NAME_CACHE_MISS_SIZE = 1000; private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() * 2; // a dns cache @@ -782,7 +782,7 @@ public class Domains { public InetAddress call() throws Exception { return InetAddress.getByName(host); } - }, 1000L, TimeUnit.MILLISECONDS, false); + }, 3000L, TimeUnit.MILLISECONDS, false); //ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone } //.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms"); diff --git a/source/net/yacy/search/Switchboard.java 
b/source/net/yacy/search/Switchboard.java index 36f4930bc..83a321052 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2009,7 +2009,7 @@ public final class Switchboard extends serverSwitch { // clear caches if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords(); Word.clearCache(); - Domains.clear(); + // Domains.clear(); // clean up image stack ResultImages.clearQueues(); @@ -2274,7 +2274,10 @@ public final class Switchboard extends serverSwitch { // if no crawl is running and processing is activated: // execute the (post-) processing steps for all entries that have a process tag assigned if (this.crawlQueues.coreCrawlJobSize() == 0) { - if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches + if (this.crawlQueues.noticeURL.isEmpty()) { + Domains.clear(); + this.crawlQueues.noticeURL.clear(); // flushes more caches + } postprocessingRunning = true; int proccount = 0; proccount += index.fulltext().getDefaultConfiguration().postprocessing(index); @@ -2827,7 +2830,7 @@ public final class Switchboard extends serverSwitch { public void stackURLs(Set rootURLs, final CrawlProfile profile, final Set successurls, final Map failurls) { if (rootURLs == null || rootURLs.size() == 0) return; - List stackthreads = new ArrayList(); // do this concurrently + final List stackthreads = new ArrayList(); // do this concurrently for (DigestURL url: rootURLs) { final DigestURL turl = url; Thread t = new Thread() { @@ -2838,9 +2841,9 @@ public final class Switchboard extends serverSwitch { }; t.start(); stackthreads.add(t); - try {Thread.sleep(10);} catch (final InterruptedException e) {} // to prevent that this fires more than 100 connections pre second! + try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent that this fires more than 10 connections per second! 
} - long waitingtime = 1 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out + final long waitingtime = 10 + (30000 / rootURLs.size()); // wait at most half a minute to prevent that the crawl start runs into a time-out for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {} }