enhanced crawler/balancer: better remaining waiting-time guessing

pull/1/head
Michael Peter Christen 13 years ago
parent f150bc218b
commit 70505107ca

@ -375,7 +375,7 @@ public class Balancer {
synchronized (this) { synchronized (this) {
byte[] failhash = null; byte[] failhash = null;
while (!this.urlFileIndex.isEmpty()) { while (!this.urlFileIndex.isEmpty()) {
byte[] nexthash = getbest(); byte[] nexthash = getbest(robots);
if (nexthash == null) return null; if (nexthash == null) return null;
// check minimumDelta and if necessary force a sleep // check minimumDelta and if necessary force a sleep
@ -442,7 +442,7 @@ public class Balancer {
return crawlEntry; return crawlEntry;
} }
private byte[] getbest() { private byte[] getbest(final RobotsTxt robots) {
// check if we need to get entries from the file index // check if we need to get entries from the file index
try { try {
@ -469,7 +469,23 @@ public class Balancer {
final byte[] n = entry.getValue().removeOne(); final byte[] n = entry.getValue().removeOne();
if (n == null) continue; if (n == null) continue;
final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
long w;
Row.Entry rowEntry;
try {
rowEntry=(n == null) ? null : this.urlFileIndex.get(n, false);
if (rowEntry == null) {
w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
} else {
Request crawlEntry = new Request(rowEntry);
w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
//System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
//System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
}
} catch (IOException e1) {
w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
}
if (w < smallestWaiting) { if (w < smallestWaiting) {
smallestWaiting = w; smallestWaiting = w;
besturlhash = n; besturlhash = n;

@ -36,6 +36,8 @@ import net.yacy.kelondro.util.MemoryControl;
public class Latency { public class Latency {
private final static int DEFAULT_AVERAGE = 300;
// the map is a mapping from host names to host configurations // the map is a mapping from host names to host configurations
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>(); private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
@ -57,7 +59,7 @@ public class Latency {
if (host == null) return; if (host == null) return;
Host h = map.get(host); Host h = map.get(host);
if (h == null) { if (h == null) {
h = new Host(host, 3000); h = new Host(host, DEFAULT_AVERAGE);
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear(); if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
map.put(host, h); map.put(host, h);
} else { } else {
@ -70,7 +72,7 @@ public class Latency {
if (host == null) return; if (host == null) return;
Host h = map.get(host); Host h = map.get(host);
if (h == null) { if (h == null) {
h = new Host(host, 3000); h = new Host(host, DEFAULT_AVERAGE);
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear(); if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
map.put(host, h); map.put(host, h);
} else { } else {
@ -121,31 +123,28 @@ public class Latency {
* which expresses how long the time is over the minimum waiting time. * which expresses how long the time is over the minimum waiting time.
*/ */
public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) { public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) {
if (hostname == null) return 0; if (hostname == null) return Long.MIN_VALUE;
final Host host = map.get(hostname);
if (host == null) return 0;
// the time since last access to the domain is the basis of the remaining calculation // first check if the domain was _ever_ accessed before
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); final Host host = map.get(hostname);
if (host == null) return Long.MIN_VALUE; // no delay if host is new
// find the minimum waiting time based on the network domain (local or global) // find the minimum waiting time based on the network domain (local or global)
final boolean local = Domains.isLocal(hostname, null); final boolean local = Domains.isLocal(hostname, null);
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; if (local) return minimumLocalDelta;
long waiting = minimumGlobalDelta;
// if we have accessed the domain many times, get slower (the flux factor) // if we have accessed the domain many times, get slower (the flux factor)
if (!local) waiting += host.flux(waiting); waiting += host.flux(waiting);
// use the access latency as rule how fast we can access the server // use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to // this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses // consider so many external accesses
waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2); waiting = Math.max(waiting, host.average() * 2);
// prevent a robots file from stopping our indexer completely // the time since last access to the domain is the basis of the remaining calculation
waiting = Math.min(60000, waiting); final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
return Math.max(0, Math.min(60000, waiting) - timeSinceLastAccess);
// return time that is remaining
//System.out.println("Latency: " + (waiting - timeSinceLastAccess));
return Math.max(0, waiting - timeSinceLastAccess);
} }
/** /**
@ -169,10 +168,7 @@ public class Latency {
// find the minimum waiting time based on the network domain (local or global) // find the minimum waiting time based on the network domain (local or global)
final boolean local = url.isLocal(); final boolean local = url.isLocal();
if (local) return minimumLocalDelta; if (local) return minimumLocalDelta;
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; long waiting = minimumGlobalDelta;
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
// for CGI accesses, we double the minimum time // for CGI accesses, we double the minimum time
// mostly there is a database access in the background // mostly there is a database access in the background
@ -180,33 +176,29 @@ public class Latency {
if (url.isCGI()) waiting = waiting * 2; if (url.isCGI()) waiting = waiting * 2;
// if we have accessed the domain many times, get slower (the flux factor) // if we have accessed the domain many times, get slower (the flux factor)
if (!local && host != null) waiting += host.flux(waiting); waiting += host.flux(waiting);
// find the delay as given by robots.txt on target site
long robotsDelay = 0;
if (!local) {
RobotsTxtEntry robotsEntry;
try {
robotsEntry = robots.getEntry(url, thisAgents);
} catch (final IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return 0; // no limits if granted exclusively for this peer
}
waiting = Math.max(waiting, robotsDelay);
// use the access latency as rule how fast we can access the server // use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to // this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses // consider so many external accesses
waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2); waiting = Math.max(waiting, host.average() * 2);
// prevent a robots file from stopping our indexer completely // find the delay as given by robots.txt on target site
waiting = Math.min(60000, waiting); long robotsDelay = 0;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = robots.getEntry(url, thisAgents);
} catch (final IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return 0; // no limits if granted exclusively for this peer
// return time that is remaining waiting = Math.max(waiting, robotsDelay);
//System.out.println("Latency: " + (waiting - timeSinceLastAccess));
return Math.max(0, waiting - timeSinceLastAccess); // the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
return Math.max(0, Math.min(60000, waiting) - timeSinceLastAccess);
} }
@ -214,46 +206,51 @@ public class Latency {
// first check if the domain was _ever_ accessed before // first check if the domain was _ever_ accessed before
final Host host = host(url); final Host host = host(url);
if (host == null) return "host " + host + " never accessed before -> 0"; // no delay if host is new if (host == null) return "host " + host + " never accessed before -> Long.MIN_VALUE"; // no delay if host is new
final StringBuilder s = new StringBuilder(50); final StringBuilder s = new StringBuilder(50);
// find the minimum waiting time based on the network domain (local or global) // find the minimum waiting time based on the network domain (local or global)
final boolean local = url.isLocal(); final boolean local = url.isLocal();
final long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; if (local) return "local host -> minimum local: " + minimumLocalDelta;
long waiting = minimumGlobalDelta;
s.append("minimumDelta = ").append(waiting); s.append("minimumDelta = ").append(waiting);
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc();
s.append(", timeSinceLastAccess = ").append(timeSinceLastAccess);
// for CGI accesses, we double the minimum time // for CGI accesses, we double the minimum time
// mostly there is a database access in the background // mostly there is a database access in the background
// which creates a lot of unwanted IO on target site // which creates a lot of unwanted IO on target site
if (url.isCGI()) s.append(", isCGI = true -> double"); if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
// if we have accessed the domain many times, get slower (the flux factor) // if we have accessed the domain many times, get slower (the flux factor)
if (!local && host != null) s.append(", flux = ").append(host.flux(waiting)); long flux = host.flux(waiting);
waiting += flux;
s.append(", flux = ").append(flux);
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
s.append(", host.average = ").append(host.average());
waiting = Math.max(waiting, host.average() * 2);
// find the delay as given by robots.txt on target site // find the delay as given by robots.txt on target site
long robotsDelay = 0; long robotsDelay = 0;
if (!local) { RobotsTxtEntry robotsEntry;
RobotsTxtEntry robotsEntry; try {
try { robotsEntry = robots.getEntry(url, thisAgents);
robotsEntry = robots.getEntry(url, thisAgents); } catch (final IOException e) {
} catch (final IOException e) { robotsEntry = null;
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
} }
s.append(", robots.delay = ").append(robotsDelay); robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
// use the access latency as rule how fast we can access the server waiting = Math.max(waiting, robotsDelay);
// this applies also to localhost, but differently, because it is not necessary to s.append(", robots.delay = ").append(robotsDelay);
// consider so many external accesses
if (host != null) s.append(", host.average = ").append(host.average());
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
s.append(", ((waitig = ").append(waiting);
s.append(") - (timeSinceLastAccess = ").append(timeSinceLastAccess).append(")) = ");
s.append(waiting - timeSinceLastAccess);
return s.toString(); return s.toString();
} }

Loading…
Cancel
Save