The new crawling factors can now be changed at runtime

pull/1/head
Michael Peter Christen 11 years ago
parent be5e808236
commit 0168f80c28

@ -41,18 +41,10 @@ import net.yacy.search.Switchboard;
public class Latency { public class Latency {
private final static int DEFAULT_AVERAGE_LATENCY = 500;
private final static int DEFAULT_MAX_SAME_HOST_IN_QUEUE = 20;
private final static float DEFAULT_LATENCY_FACTOR = 0.5f;
// the map is a mapping from host names to host configurations // the map is a mapping from host names to host configurations
private static final int mapMaxSize = 1000; private static final int mapMaxSize = 1000;
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>(); private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
public static int defaultAverageLatency = DEFAULT_AVERAGE_LATENCY;
public static int MaxSameHostInQueue = DEFAULT_MAX_SAME_HOST_IN_QUEUE;
public static float latencyFactor = DEFAULT_LATENCY_FACTOR;
/** /**
* update the latency entry after a host was selected for queueing into the loader * update the latency entry after a host was selected for queueing into the loader
* @param url * @param url
@ -64,7 +56,7 @@ public class Latency {
String hosthash = url.hosthash(); String hosthash = url.hosthash();
Host h = map.get(hosthash); Host h = map.get(hosthash);
if (h == null) { if (h == null) {
h = new Host(host, defaultAverageLatency, robotsCrawlDelay); h = new Host(host, Switchboard.getSwitchboard().getConfigInt("crawler.defaultAverageLatency", 500), robotsCrawlDelay);
if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear(); if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
map.put(hosthash, h); map.put(hosthash, h);
} }
@ -168,10 +160,10 @@ public class Latency {
// use the access latency as rule how fast we can access the server // use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to // this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses // consider so many external accesses
waiting = Math.max(waiting, (int) (host.average() * latencyFactor)); waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > MaxSameHostInQueue) waiting += 5000; if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000;
// the time since last access to the domain is the basis of the remaining calculation // the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
@ -211,10 +203,10 @@ public class Latency {
if (!local) waiting += host.flux(waiting); if (!local) waiting += host.flux(waiting);
// use the access latency as rule how fast we can access the server // use the access latency as rule how fast we can access the server
waiting = Math.max(waiting, (int) (host.average() * latencyFactor)); waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > MaxSameHostInQueue) waiting += 5000; if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000;
// the time since last access to the domain is the basis of the remaining calculation // the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
@ -252,11 +244,11 @@ public class Latency {
// this applies also to localhost, but differently, because it is not necessary to // this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses // consider so many external accesses
s.append(", host.average = ").append(host.average()); s.append(", host.average = ").append(host.average());
waiting = Math.max(waiting, (int) (host.average() * latencyFactor)); waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()); int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost());
if (hostcount > MaxSameHostInQueue) { if (hostcount > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) {
s.append(", hostcount = ").append(hostcount); s.append(", hostcount = ").append(hostcount);
waiting += 5000; waiting += 5000;
} }

@ -825,9 +825,6 @@ public final class Switchboard extends serverSwitch {
getDataPath()); getDataPath());
OAIListFriendsLoader.init(this.loader, oaiFriends, ClientIdentification.yacyInternetCrawlerAgent); OAIListFriendsLoader.init(this.loader, oaiFriends, ClientIdentification.yacyInternetCrawlerAgent);
this.crawlQueues = new CrawlQueues(this, this.queuesRoot); this.crawlQueues = new CrawlQueues(this, this.queuesRoot);
Latency.defaultAverageLatency = this.getConfigInt("crawler.defaultAverageLatency", 500);
Latency.latencyFactor = this.getConfigFloat("crawler.latencyFactor", 0.5f);
Latency.MaxSameHostInQueue = this.getConfigInt("crawler.MaxSameHostInQueue", 20);
// on startup, resume all crawls // on startup, resume all crawls
setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused", "false"); setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused", "false");

Loading…
Cancel
Save