protection against crawl balancer failure:

a minimum of 500 milliseconds distance between two accesses
to the same domain is now ensured

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3354 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 1f1f398bfa
commit 8c1d2e0227

@ -58,10 +58,12 @@ public class plasmaCrawlBalancer {
private kelondroStack stack;
private HashMap domainStacks;
private HashMap domainAccess;
public plasmaCrawlBalancer(File stackFile) {
stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0));
domainStacks = new HashMap();
domainAccess = new HashMap();
}
public void close() {
@ -135,17 +137,36 @@ public class plasmaCrawlBalancer {
}
}
public byte[] get() throws IOException {
public String get(long minimumDelta) throws IOException {
// returns an url-hash from the stack
synchronized (domainStacks) {
String entry = null;
if (stack.size() > 0) {
return stack.pop().getColBytes(0);
entry = new String(stack.pop().getColBytes(0));
} else if (domainStacks.size() > 0) {
flushOnce();
return stack.pop().getColBytes(0);
} else {
return null;
entry = new String(stack.pop().getColBytes(0));
}
if ((minimumDelta > 0) && (entry != null)) {
// check if the time after retrieval of last hash from same
// domain is not shorter than the minimumDelta
String domhash = entry.substring(6);
Long lastAccess = (Long) domainAccess.get(domhash);
if (lastAccess != null) {
// this is not the first access of the same domain
long la = lastAccess.longValue();
if (System.currentTimeMillis() - la > minimumDelta) {
// force a busy waiting here
// in best case, this should never happen if the balancer works propertly
// this is only to protect against the worst case, where the crawler could
// behave in a DoS-manner
long sleeptime = System.currentTimeMillis() - la - minimumDelta;
if (sleeptime > 0) try {this.wait(sleeptime);} catch (InterruptedException e) {}
}
}
domainAccess.put(domhash, new Long(System.currentTimeMillis()));
}
return entry;
}
}

@ -76,6 +76,7 @@ public class plasmaCrawlNURL {
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
public static final int STACK_TYPE_MUSIC = 13; // put on music stack
private static final long minimumDelta = 500; // the minimum time difference between access of the same domain
/**
* column length definition for the {@link plasmaURL#urlIndexFile} DB
*/
@ -432,7 +433,7 @@ public class plasmaCrawlNURL {
private Entry pop(plasmaCrawlBalancer balancer) throws IOException {
// this is a filo - pop
if (balancer.size() > 0) {
String hash = new String(balancer.get());
String hash = balancer.get(minimumDelta);
if (hash == null) throw new IOException("hash is null");
Entry e = new Entry(hash);
stackIndex.remove(e.hash);

Loading…
Cancel
Save