fix for deadlocks in crawler

pull/1/head
Michael Peter Christen 11 years ago
parent 7a2f3e2353
commit 9c6228d948

@ -201,10 +201,11 @@ public class HostBalancer implements Balancer {
* @throws SpaceExceededException
*/
@Override
public synchronized String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
if (this.has(entry.url().hash())) return "double occurrence";
depthCache.put(entry.url().hash(), entry.depth());
String hosthash = ASCII.String(entry.url().hash(), 6, 6);
synchronized (this) {
HostQueue queue = this.queues.get(hosthash);
if (queue == null) {
queue = new HostQueue(this.hostsPath, entry.url().getHost(), entry.url().getPort(), this.queues.size() > 100, this.exceed134217727);
@ -213,6 +214,7 @@ public class HostBalancer implements Balancer {
}
return queue.push(entry, profile, robots);
}
}
/**
* get the next entry in this crawl queue in such a way that the domain access time delta is maximized
@ -227,8 +229,12 @@ public class HostBalancer implements Balancer {
* @throws SpaceExceededException
*/
@Override
public synchronized Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws IOException {
public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws IOException {
tryagain: while (true) try {
HostQueue rhq = null;
String rhh = null;
synchronized (this) {
if (this.roundRobinHostHashes.size() == 0) {
// refresh the round-robin cache
this.roundRobinHostHashes.addAll(this.queues.keySet());
@ -266,41 +272,50 @@ public class HostBalancer implements Balancer {
if (this.roundRobinHostHashes.size() == 0) return null;
// first strategy: get one entry which does not need sleep time
for (String nextHH: this.roundRobinHostHashes) {
HostQueue hq = this.queues.get(nextHH);
int delta = Latency.waitingRemainingGuessed(hq.getHost(), DigestURL.hosthash(hq.getHost(), hq.getPort()), robots, ClientIdentification.yacyInternetCrawlerAgent);
Iterator<String> nhhi = this.roundRobinHostHashes.iterator();
nosleep: while (nhhi.hasNext()) {
rhh = nhhi.next();
rhq = this.queues.get(rhh);
if (rhq == null) {
nhhi.remove();
continue nosleep;
}
int delta = Latency.waitingRemainingGuessed(rhq.getHost(), DigestURL.hosthash(rhq.getHost(), rhq.getPort()), robots, ClientIdentification.yacyInternetCrawlerAgent);
if (delta <= 10 || this.roundRobinHostHashes.size() == 1) {
this.roundRobinHostHashes.remove(nextHH);
Request request = hq == null ? null : hq.pop(delay, cs, robots);
int size = hq == null ? 0 : hq.size();
if (size == 0) {
hq.close();
this.queues.remove(nextHH);
}
if (request != null) return request;
nhhi.remove();
break nosleep;
}
}
if (rhq == null) {
// second strategy: take from the largest stack and clean round robin cache
int largest = Integer.MIN_VALUE;
String nextHH = null;
for (String h: this.roundRobinHostHashes) {
HostQueue hq = this.queues.get(h);
if (hq != null) {
int s = hq.size();
if (s > largest) {
largest = s;
nextHH = h;
rhh = h;
}
}
}
this.roundRobinHostHashes.clear(); // start from the beginning next time
HostQueue hq = this.queues.get(nextHH);
Request request = hq == null ? null : hq.pop(delay, cs, robots);
if (hq != null && hq.size() == 0) {
hq.close();
this.queues.remove(nextHH);
rhq = this.queues.get(rhh);
}
}
if (rhq == null) continue tryagain;
Request request = rhq.pop(delay, cs, robots); // this pop is outside of synchronization to prevent blocking of pushes
int size = rhq.size();
if (size == 0) {
synchronized (this) {
this.queues.remove(rhh);
}
rhq.close();
}
if (request == null) continue tryagain;
return request;
} catch (ConcurrentModificationException e) {
continue tryagain;

@ -253,7 +253,7 @@ public class HostQueue implements Balancer {
}
this.depthStacks.clear();
String[] l = this.hostPath.list();
for (String s: l) {
if (l != null) for (String s: l) {
new File(this.hostPath, s).delete();
}
this.hostPath.delete();

@ -393,7 +393,7 @@ public class CrawlQueues {
*/
private String loadIsPossible(final StackType stackType) {
//System.out.println("stacksize = " + noticeURL.stackSize(stackType));
if (this.noticeURL.stackSize(stackType) == 0) {
if (this.noticeURL.isEmpty(stackType)) {
//log.logDebug("GlobalCrawl: queue is empty");
return "stack is empty";
}

@ -126,6 +126,16 @@ public class NoticedURL {
return true;
}
/**
 * Tells whether the stack selected by the given stack type holds no entries.
 * A stack reference that is {@code null} counts as empty, and so does any
 * stack type that is not one of the four handled cases.
 *
 * @param stackType selects which stack (NOLOAD, LOCAL, GLOBAL or REMOTE) to inspect
 * @return {@code true} if the selected stack is {@code null} or reports itself
 *         as empty, or if the stack type is not handled; {@code false} otherwise
 */
public boolean isEmpty(final StackType stackType) {
    switch (stackType) {
        case NOLOAD:
            return this.noloadStack == null || this.noloadStack.isEmpty();
        case LOCAL:
            return this.coreStack == null || this.coreStack.isEmpty();
        case GLOBAL:
            return this.limitStack == null || this.limitStack.isEmpty();
        case REMOTE:
            return this.remoteStack == null || this.remoteStack.isEmpty();
        default:
            // unknown stack types are reported as empty
            return true;
    }
}
public int stackSize(final StackType stackType) {
switch (stackType) {
case NOLOAD: return (this.noloadStack == null) ? 0 : this.noloadStack.size();

Loading…
Cancel
Save