|
|
|
@ -41,17 +41,9 @@ import net.yacy.search.Switchboard;
|
|
|
|
|
|
|
|
|
|
public class Latency {
|
|
|
|
|
|
|
|
|
|
// initial value of defaultAverageLatency (presumably milliseconds -- TODO confirm the unit)
private final static int DEFAULT_AVERAGE_LATENCY = 500;
|
|
|
|
|
// initial value of MaxSameHostInQueue
private final static int DEFAULT_MAX_SAME_HOST_IN_QUEUE = 20;
|
|
|
|
|
// initial value of latencyFactor
private final static float DEFAULT_LATENCY_FACTOR = 0.5f;
|
|
|
|
|
|
|
|
|
|
// the map is a mapping from host hashes (url.hosthash()) to host configurations
|
|
|
|
|
// size limit of the host map: when map.size() exceeds this value (or memory is short),
// the whole map is cleared before a new host entry is inserted
private static final int mapMaxSize = 1000;
|
|
|
|
|
// host entries; entries are looked up and stored using url.hosthash() as the key
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
|
|
|
|
|
|
|
|
|
|
// value handed to the Host constructor when an entry is created for a host that is not yet in the map;
// public and non-final, so it can be reassigned from outside this class
public static int defaultAverageLatency = DEFAULT_AVERAGE_LATENCY;
|
|
|
|
|
// threshold for crawlQueues.hostcount(host): when the count is greater than this value,
// the computed waiting time is increased by 5000
public static int MaxSameHostInQueue = DEFAULT_MAX_SAME_HOST_IN_QUEUE;
|
|
|
|
|
// multiplier applied to host.average(); the product is used as a lower bound for the waiting time
public static float latencyFactor = DEFAULT_LATENCY_FACTOR;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* update the latency entry after a host was selected for queueing into the loader
|
|
|
|
@ -64,7 +56,7 @@ public class Latency {
|
|
|
|
|
String hosthash = url.hosthash();
|
|
|
|
|
Host h = map.get(hosthash);
|
|
|
|
|
if (h == null) {
|
|
|
|
|
h = new Host(host, defaultAverageLatency, robotsCrawlDelay);
|
|
|
|
|
h = new Host(host, Switchboard.getSwitchboard().getConfigInt("crawler.defaultAverageLatency", 500), robotsCrawlDelay);
|
|
|
|
|
if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
|
|
|
|
|
map.put(hosthash, h);
|
|
|
|
|
}
|
|
|
|
@ -168,10 +160,10 @@ public class Latency {
|
|
|
|
|
// use the access latency as a rule for how fast we can access the server
|
|
|
|
|
// this applies also to localhost, but differently, because it is not necessary to
|
|
|
|
|
// consider so many external accesses
|
|
|
|
|
waiting = Math.max(waiting, (int) (host.average() * latencyFactor));
|
|
|
|
|
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
|
|
|
|
|
|
|
|
|
|
// if the loading queue holds more than MaxSameHostInQueue entries for the host of the url, then increase waiting
|
|
|
|
|
if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > MaxSameHostInQueue) waiting += 5000;
|
|
|
|
|
if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000;
|
|
|
|
|
|
|
|
|
|
// the time since last access to the domain is the basis of the remaining calculation
|
|
|
|
|
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
|
|
|
|
@ -211,10 +203,10 @@ public class Latency {
|
|
|
|
|
if (!local) waiting += host.flux(waiting);
|
|
|
|
|
|
|
|
|
|
// use the access latency as a rule for how fast we can access the server
|
|
|
|
|
waiting = Math.max(waiting, (int) (host.average() * latencyFactor));
|
|
|
|
|
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
|
|
|
|
|
|
|
|
|
|
// if the loading queue holds more than MaxSameHostInQueue entries for the host of the url, then increase waiting
|
|
|
|
|
if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > MaxSameHostInQueue) waiting += 5000;
|
|
|
|
|
if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000;
|
|
|
|
|
|
|
|
|
|
// the time since last access to the domain is the basis of the remaining calculation
|
|
|
|
|
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
|
|
|
|
@ -252,11 +244,11 @@ public class Latency {
|
|
|
|
|
// this applies also to localhost, but differently, because it is not necessary to
|
|
|
|
|
// consider so many external accesses
|
|
|
|
|
s.append(", host.average = ").append(host.average());
|
|
|
|
|
waiting = Math.max(waiting, (int) (host.average() * latencyFactor));
|
|
|
|
|
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
|
|
|
|
|
|
|
|
|
|
// if the loading queue holds more than MaxSameHostInQueue entries for the host of the url, then increase waiting
|
|
|
|
|
int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost());
|
|
|
|
|
if (hostcount > MaxSameHostInQueue) {
|
|
|
|
|
if (hostcount > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) {
|
|
|
|
|
s.append(", hostcount = ").append(hostcount);
|
|
|
|
|
waiting += 5000;
|
|
|
|
|
}
|
|
|
|
|