Reduce the number of calls to queue.size(), because that may be a bottleneck during crawling.
pull/1/head
Michael Peter Christen 10 years ago
parent 4920ab7b76
commit 5bb52f79be

@ -66,7 +66,7 @@ public class urls {
DigestURL referrer;
while ((maxCount > 0) &&
(System.currentTimeMillis() < timeout) &&
(sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
(!sb.crawlQueues.noticeURL.isEmpty(stackType))) {
try {
entry = sb.crawlQueues.noticeURL.pop(stackType, false, sb.crawler, sb.robots);
} catch (final IOException e) {

@ -86,7 +86,7 @@ public class HostBalancer implements Balancer {
for (String address: list) try {
File queuePath = new File(this.hostsPath, address);
HostQueue queue = new HostQueue(queuePath, this.queues.size() > this.onDemandLimit, this.exceed134217727);
if (queue.size() == 0) {
if (queue.isEmpty()) {
queue.close();
FileUtils.deletedelete(queuePath);
} else {
@ -127,7 +127,9 @@ public class HostBalancer implements Balancer {
/**
 * Delegates {@code removeAllByProfileHandle} to every queue in {@code this.queues}
 * and returns the sum of the per-queue results.
 *
 * NOTE(review): this listing looks like a commit diff whose +/- markers were
 * stripped, so the same loop appears twice below: once as a one-liner and once
 * in braced form. Presumably only the braced form exists after the commit;
 * confirm against the repository before treating this text as compilable source.
 *
 * @param profileHandle passed through unchanged to each queue
 * @param timeout passed through unchanged to each queue
 * @return the accumulated value of all per-queue calls
 * @throws IOException declared here; presumably propagated from the per-queue call
 * @throws SpaceExceededException declared here; presumably propagated from the per-queue call
 */
@Override
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
int c = 0;
// one-line form of the loop (appears to be the pre-commit line)
for (HostQueue queue: this.queues.values()) c += queue.removeAllByProfileHandle(profileHandle, timeout);
// braced form of the same loop (appears to be the post-commit replacement)
for (HostQueue queue: this.queues.values()) {
c += queue.removeAllByProfileHandle(profileHandle, timeout);
}
return c;
}
@ -187,13 +189,17 @@ public class HostBalancer implements Balancer {
/**
 * Returns the total number of entries, computed by calling {@code size()} on
 * every queue in {@code this.queues} and adding the results.
 *
 * NOTE(review): this listing looks like a commit diff whose +/- markers were
 * stripped, so the summing loop appears twice below (one-line and braced form).
 * Presumably only the braced form exists after the commit; if both were really
 * present the total would be counted twice. Confirm against the repository.
 *
 * @return the sum of the sizes of all queues
 */
@Override
public int size() {
int c = 0;
// one-line form of the loop (appears to be the pre-commit line)
for (HostQueue queue: this.queues.values()) c += queue.size();
// braced form of the same loop (appears to be the post-commit replacement)
for (HostQueue queue: this.queues.values()) {
c += queue.size();
}
return c;
}
/**
 * Reports whether all queues in {@code this.queues} are empty.
 * Returns {@code false} as soon as one queue reports that it is not empty,
 * without asking the remaining queues; no sizes are summed.
 *
 * NOTE(review): this listing looks like a commit diff whose +/- markers were
 * stripped, so the check loop appears twice below (one-line and braced form).
 * Presumably only the braced form exists after the commit; confirm against
 * the repository.
 *
 * @return {@code true} if no queue reports a non-empty state, {@code false} otherwise
 */
@Override
public boolean isEmpty() {
// one-line form of the loop (appears to be the pre-commit line)
for (HostQueue queue: this.queues.values()) if (!queue.isEmpty()) return false;
// braced form of the same loop (appears to be the post-commit replacement)
for (HostQueue queue: this.queues.values()) {
if (!queue.isEmpty()) return false;
}
return true;
}
@ -401,8 +407,7 @@ public class HostBalancer implements Balancer {
}
}
int size = rhq.size();
if (size == 0) {
if (rhq.isEmpty()) {
synchronized (this) {
this.queues.remove(rhh);
}

@ -265,14 +265,14 @@ public class CrawlQueues {
// do a local crawl
Request urlEntry;
while (this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
while (!this.noticeURL.isEmpty(NoticedURL.StackType.LOCAL) || !this.noticeURL.isEmpty(NoticedURL.StackType.NOLOAD)) {
final String stats = "LOCALCRAWL[" +
this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) +
", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
try {
if (this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
if (!this.noticeURL.isEmpty(NoticedURL.StackType.NOLOAD)) {
// get one entry that will not be loaded, just indexed
urlEntry = this.noticeURL.pop(NoticedURL.StackType.NOLOAD, true, this.sb.crawler, this.sb.robots);
if (urlEntry == null) {

@ -295,17 +295,28 @@ public class NoticedURL {
int s;
Request entry;
int errors = 0;
while ((s = balancer.size()) > 0) {
while (!balancer.isEmpty()) {
entry = balancer.pop(delay, cs, robots);
if (entry == null) {
if (s > balancer.size()) continue;
errors++;
if (errors < 100) continue;
final int aftersize = balancer.size();
balancer.clear(); // the balancer is broken and cannot shrink
ConcurrentLog.warn("BALANCER", "entry is null, balancer cannot shrink (bevore pop = " + s + ", after pop = " + aftersize + "); reset of balancer");
}
return entry;
if (entry != null) return entry;
// the balancer was supposed to be not empty. Check this again
// it may be possible that another process has taken all
s = balancer.size(); // this time read the size to find errors
if (s == 0) return null; // the balancer is actually empty!
// if the balancer is not empty, try again
entry = balancer.pop(delay, cs, robots);
if (entry != null) return entry;
if (s > balancer.size()) continue; // the balancer has shrinked, thats good, it will terminate
errors++; // bad, if the size does not shrink we are in danger to not terminate
if (errors < 100) continue; // there is the possibility that it is not a bug but concurrency, so just ignore it for some time
// at this point we consider the balancer to be broken
final int aftersize = balancer.size(); // get the amount of data that we loose
balancer.clear(); // the balancer is broken and cannot shrink
ConcurrentLog.warn("BALANCER", "balancer cannot shrink (bevore pop = " + s + ", after pop = " + aftersize + "); reset of balancer");
return null;
}
return null;
}

Loading…
Cancel
Save