From 5bb52f79be6a6858a1977b32af3df67ec753e748 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 23 Nov 2014 20:09:32 +0100 Subject: [PATCH] reduce number of calls to queue.size() because that may be a bottleneck during crawling --- htroot/yacy/urls.java | 2 +- source/net/yacy/crawler/HostBalancer.java | 17 ++++++---- source/net/yacy/crawler/data/CrawlQueues.java | 4 +-- source/net/yacy/crawler/data/NoticedURL.java | 31 +++++++++++++------ 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 0054ea452..02232c845 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -66,7 +66,7 @@ public class urls { DigestURL referrer; while ((maxCount > 0) && (System.currentTimeMillis() < timeout) && - (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) { + (!sb.crawlQueues.noticeURL.isEmpty(stackType))) { try { entry = sb.crawlQueues.noticeURL.pop(stackType, false, sb.crawler, sb.robots); } catch (final IOException e) { diff --git a/source/net/yacy/crawler/HostBalancer.java b/source/net/yacy/crawler/HostBalancer.java index d09f718be..f3421ddb6 100644 --- a/source/net/yacy/crawler/HostBalancer.java +++ b/source/net/yacy/crawler/HostBalancer.java @@ -86,7 +86,7 @@ public class HostBalancer implements Balancer { for (String address: list) try { File queuePath = new File(this.hostsPath, address); HostQueue queue = new HostQueue(queuePath, this.queues.size() > this.onDemandLimit, this.exceed134217727); - if (queue.size() == 0) { + if (queue.isEmpty()) { queue.close(); FileUtils.deletedelete(queuePath); } else { @@ -127,7 +127,9 @@ public class HostBalancer implements Balancer { @Override public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException { int c = 0; - for (HostQueue queue: this.queues.values()) c += queue.removeAllByProfileHandle(profileHandle, timeout); + for (HostQueue queue: this.queues.values()) { + c += 
queue.removeAllByProfileHandle(profileHandle, timeout); + } return c; } @@ -187,13 +189,17 @@ public class HostBalancer implements Balancer { @Override public int size() { int c = 0; - for (HostQueue queue: this.queues.values()) c += queue.size(); + for (HostQueue queue: this.queues.values()) { + c += queue.size(); + } return c; } @Override public boolean isEmpty() { - for (HostQueue queue: this.queues.values()) if (!queue.isEmpty()) return false; + for (HostQueue queue: this.queues.values()) { + if (!queue.isEmpty()) return false; + } return true; } @@ -401,8 +407,7 @@ public class HostBalancer implements Balancer { } } - int size = rhq.size(); - if (size == 0) { + if (rhq.isEmpty()) { synchronized (this) { this.queues.remove(rhh); } diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 803959c91..8ff9ff6c3 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -265,14 +265,14 @@ public class CrawlQueues { // do a local crawl Request urlEntry; - while (this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { + while (!this.noticeURL.isEmpty(NoticedURL.StackType.LOCAL) || !this.noticeURL.isEmpty(NoticedURL.StackType.NOLOAD)) { final String stats = "LOCALCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]"; try { - if (this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { + if (!this.noticeURL.isEmpty(NoticedURL.StackType.NOLOAD)) { // get one entry that will not be loaded, just indexed urlEntry = this.noticeURL.pop(NoticedURL.StackType.NOLOAD, true, this.sb.crawler, this.sb.robots); if (urlEntry == null) { diff --git a/source/net/yacy/crawler/data/NoticedURL.java 
b/source/net/yacy/crawler/data/NoticedURL.java index c49f6aa85..f2b552941 100644 --- a/source/net/yacy/crawler/data/NoticedURL.java +++ b/source/net/yacy/crawler/data/NoticedURL.java @@ -295,17 +295,28 @@ public class NoticedURL { int s; Request entry; int errors = 0; - while ((s = balancer.size()) > 0) { + while (!balancer.isEmpty()) { entry = balancer.pop(delay, cs, robots); - if (entry == null) { - if (s > balancer.size()) continue; - errors++; - if (errors < 100) continue; - final int aftersize = balancer.size(); - balancer.clear(); // the balancer is broken and cannot shrink - ConcurrentLog.warn("BALANCER", "entry is null, balancer cannot shrink (bevore pop = " + s + ", after pop = " + aftersize + "); reset of balancer"); - } - return entry; + if (entry != null) return entry; + + // the balancer was supposed to be non-empty. Check this again + // because another process may have taken all entries in the meantime + s = balancer.size(); // this time read the size to find errors + if (s == 0) return null; // the balancer is actually empty! + + // if the balancer is not empty, try again + entry = balancer.pop(delay, cs, robots); + if (entry != null) return entry; + + if (s > balancer.size()) continue; // the balancer has shrunk, that's good, it will terminate + errors++; // bad, if the size does not shrink we are in danger of not terminating + if (errors < 100) continue; // there is the possibility that it is not a bug but concurrency, so just ignore it for some time + + // at this point we consider the balancer to be broken + final int aftersize = balancer.size(); // get the amount of data that we lose + balancer.clear(); // the balancer is broken and cannot shrink + ConcurrentLog.warn("BALANCER", "balancer cannot shrink (bevore pop = " + s + ", after pop = " + aftersize + "); reset of balancer"); + return null; } return null; }