From 0168f80c283345e8e78cfb8b3576a35d4f8cf93b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 21 Jan 2014 17:52:16 +0100 Subject: [PATCH] new crawling factors can now be changed during runtime --- source/net/yacy/crawler/data/Latency.java | 22 +++++++--------------- source/net/yacy/search/Switchboard.java | 3 --- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java index e9cd84b8a..c99c06c98 100644 --- a/source/net/yacy/crawler/data/Latency.java +++ b/source/net/yacy/crawler/data/Latency.java @@ -41,17 +41,9 @@ import net.yacy.search.Switchboard; public class Latency { - private final static int DEFAULT_AVERAGE_LATENCY = 500; - private final static int DEFAULT_MAX_SAME_HOST_IN_QUEUE = 20; - private final static float DEFAULT_LATENCY_FACTOR = 0.5f; - // the map is a mapping from host names to host configurations private static final int mapMaxSize = 1000; private static final ConcurrentHashMap map = new ConcurrentHashMap(); - - public static int defaultAverageLatency = DEFAULT_AVERAGE_LATENCY; - public static int MaxSameHostInQueue = DEFAULT_MAX_SAME_HOST_IN_QUEUE; - public static float latencyFactor = DEFAULT_LATENCY_FACTOR; /** * update the latency entry after a host was selected for queueing into the loader @@ -64,7 +56,7 @@ public class Latency { String hosthash = url.hosthash(); Host h = map.get(hosthash); if (h == null) { - h = new Host(host, defaultAverageLatency, robotsCrawlDelay); + h = new Host(host, Switchboard.getSwitchboard().getConfigInt("crawler.defaultAverageLatency", 500), robotsCrawlDelay); if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear(); map.put(hosthash, h); } @@ -168,10 +160,10 @@ public class Latency { // use the access latency as rule how fast we can access the server // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses - waiting = Math.max(waiting, (int) (host.average() * latencyFactor)); + waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f))); // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting - if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > MaxSameHostInQueue) waiting += 5000; + if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000; // the time since last access to the domain is the basis of the remaining calculation final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); @@ -211,10 +203,10 @@ public class Latency { if (!local) waiting += host.flux(waiting); // use the access latency as rule how fast we can access the server - waiting = Math.max(waiting, (int) (host.average() * latencyFactor)); + waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f))); // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting - if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > MaxSameHostInQueue) waiting += 5000; + if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000; // the time since last access to the domain is the basis of the remaining calculation final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); @@ -252,11 +244,11 @@ public class Latency { // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses s.append(", host.average = ").append(host.average()); - waiting = Math.max(waiting, (int) (host.average() * latencyFactor)); + waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f))); // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()); - if (hostcount > MaxSameHostInQueue) { + if (hostcount > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) { s.append(", hostcount = ").append(hostcount); waiting += 5000; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 734c8a114..b319fc3f0 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -825,9 +825,6 @@ public final class Switchboard extends serverSwitch { getDataPath()); OAIListFriendsLoader.init(this.loader, oaiFriends, ClientIdentification.yacyInternetCrawlerAgent); this.crawlQueues = new CrawlQueues(this, this.queuesRoot); - Latency.defaultAverageLatency = this.getConfigInt("crawler.defaultAverageLatency", 500); - Latency.latencyFactor = this.getConfigFloat("crawler.latencyFactor", 0.5f); - Latency.MaxSameHostInQueue = this.getConfigInt("crawler.MaxSameHostInQueue", 20); // on startup, resume all crawls setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused", "false");