From c1c1be8f02f143dabd5560f80a1d072944e0cbbf Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Tue, 29 Apr 2014 19:50:33 +0200
Subject: [PATCH] fix for slow crawling and better logging in balancer

---
 source/net/yacy/crawler/HostBalancer.java | 19 +++++++++++++------
 source/net/yacy/crawler/HostQueue.java    | 12 +++++++-----
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/source/net/yacy/crawler/HostBalancer.java b/source/net/yacy/crawler/HostBalancer.java
index 91dc41c20..1ba50ce8c 100644
--- a/source/net/yacy/crawler/HostBalancer.java
+++ b/source/net/yacy/crawler/HostBalancer.java
@@ -60,6 +60,7 @@ import net.yacy.kelondro.index.RowHandleSet;
  */
 public class HostBalancer implements Balancer {
 
+    private final static ConcurrentLog log = new ConcurrentLog("HostBalancer");
     public final static HandleMap depthCache = new RowHandleMap(Word.commonHashLength, Word.commonHashOrder, 2, 8 * 1024 * 1024, "HostBalancer.DepthCache");
 
     private final File hostsPath;
@@ -258,17 +259,23 @@ public class HostBalancer implements Balancer {
                             String s = i.next();
                             HostQueue hq = this.queues.get(s);
                             if (hq == null) {i.remove(); continue smallstacks;}
+                            int delta = Latency.waitingRemainingGuessed(hq.getHost(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
+                            if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things
+                            // to protect all small stacks which have a fast throughput, remove all with long waiting time
+                            if (delta >= 1000) {i.remove(); continue smallstacks;}
                             int size = hq.size();
                             if (singletonStacksExist) {
-                                if (size != 1) {i.remove(); continue smallstacks;}
-                            } else {
-                                if (size > 10) {i.remove(); continue smallstacks;}
+                                if (size != 1) {i.remove(); continue smallstacks;} // remove all non-singletons
+                            } else /*smallStacksExist*/ {
+                                if (size > 10) {i.remove(); continue smallstacks;} // remove all large stacks
                             }
-                            // to protect all small stacks which have a fast throughput, remove all with long wainting time
-                            int delta = Latency.waitingRemainingGuessed(hq.getHost(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
-                            if (delta >= 1000) {i.remove();}
                         }
                     }
+                    if (this.roundRobinHostHashes.size() == 1) {
+                        if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host");
+                    } else {
+                        log.info("(re-)initialized the round-robin queue; " + this.roundRobinHostHashes.size() + " hosts.");
+                    }
                 }
                 if (this.roundRobinHostHashes.size() == 0) return null;
 
diff --git a/source/net/yacy/crawler/HostQueue.java b/source/net/yacy/crawler/HostQueue.java
index 72636333b..66cca95fd 100644
--- a/source/net/yacy/crawler/HostQueue.java
+++ b/source/net/yacy/crawler/HostQueue.java
@@ -55,6 +55,8 @@ import net.yacy.search.Switchboard;
 
 public class HostQueue implements Balancer {
 
+    private final static ConcurrentLog log = new ConcurrentLog("HostQueue");
+
     public static final String indexSuffix = ".stack";
     private static final int EcoFSBufferSize = 1000;
     private static final int objectIndexBufferSize = 1000;
@@ -105,7 +107,7 @@ public class HostQueue implements Balancer {
         if (!(this.hostPath.exists())) this.hostPath.mkdirs();
         this.depthStacks = new TreeMap();
         int size = openAllStacks();
-        ConcurrentLog.info("Balancer", "opened HostQueue " + this.hostPath.getAbsolutePath() + " with " + size + " urls.");
+        if (log.isInfo()) log.info("opened HostQueue " + this.hostPath.getAbsolutePath() + " with " + size + " urls.");
     }
 
     public String getHost() {
@@ -406,7 +408,7 @@ public class HostQueue implements Balancer {
 
             // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
             if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
-                ConcurrentLog.fine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
+                if (log.isFine()) log.fine("URL '" + crawlEntry.url() + "' is in blacklist.");
                 continue mainloop;
             }
 
@@ -414,7 +416,7 @@ public class HostQueue implements Balancer {
             // if not: return null. A calling method must handle the null value and try again
             profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
             if (profileEntry == null) {
-                ConcurrentLog.fine("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
+                if (log.isFine()) log.fine("no profile entry for handle " + crawlEntry.profileHandle());
                 continue mainloop;
             }
 
@@ -432,7 +434,7 @@ public class HostQueue implements Balancer {
                 // in best case, this should never happen if the balancer works properly
                 // this is only to protection against the worst case, where the crawler could
                 // behave in a DoS-manner
-                ConcurrentLog.info("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
+                if (log.isInfo()) log.info("forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
                 long loops = sleeptime / 1000;
                 long rest = sleeptime % 1000;
                 if (loops < 3) {
@@ -444,7 +446,7 @@ public class HostQueue implements Balancer {
                     // must be synchronized here to avoid 'takeover' moves from other threads which then idle the same time which would not be enough
                     if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
                     for (int i = 0; i < loops; i++) {
-                        ConcurrentLog.info("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
+                        if (log.isInfo()) log.info("waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                         try {this.wait(1000); } catch (final InterruptedException e) {}
                     }
                 }
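Note on the logging part of this patch: every call site now wraps message construction in a level check (`if (log.isFine()) log.fine(...)`, `if (log.isInfo()) log.info(...)`), so the string concatenation for the message is skipped entirely when that level is disabled, and the per-class logger name replaces the old hard-coded "Balancer"/"BALANCER" tags. The sketch below illustrates the same guarded-logging idiom as a self-contained example; it uses `java.util.logging` as a stand-in for YaCy's `ConcurrentLog`, and the class and method names in it are illustrative only, not part of the patch.

```java
import java.util.logging.Level;
import java.util.logging.Logger;

public class GuardedLoggingSketch {

    // Per-class logger, analogous to the per-class ConcurrentLog instances the patch adds.
    private static final Logger LOG = Logger.getLogger("HostQueue");

    static void reportDelay(String host, long sleeptime) {
        // Guard the call: the message string (and its concatenation cost) is only
        // built when INFO is actually enabled for this logger.
        if (LOG.isLoggable(Level.INFO)) {
            LOG.info("forcing crawl-delay of " + sleeptime + " milliseconds for " + host);
        }
    }

    public static void main(String[] args) {
        reportDelay("example.org", 1500L);
    }
}
```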