From 77531850b5087db37ec98b199962539e0b937db8 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 21 Jan 2014 16:05:55 +0100 Subject: [PATCH] reverted crawling strategy from latest commit. --- defaults/yacy.init | 10 +++ htroot/PerformanceQueues_p.java | 2 +- htroot/api/latency_p.java | 7 +- source/net/yacy/crawler/data/CrawlQueues.java | 25 ++++++- source/net/yacy/crawler/data/Latency.java | 73 ++++++++++++------- source/net/yacy/search/Switchboard.java | 4 + .../net/yacy/search/SwitchboardConstants.java | 1 + 7 files changed, 90 insertions(+), 32 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 393fcaf48..f24c158ee 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -741,6 +741,16 @@ crawler.file.maxFileSize=100000000 # maximum number of crawler threads crawler.MaxActiveThreads = 200 +# maximum number of same hosts in crawler threads +crawler.MaxSameHostInQueue = 20 + +# default latency is the start value of the average of remote server response time +crawler.defaultAverageLatency = 500 + +# the latency factor is a factor that is applied to the average remote server latency. 
+# The result is the minimum remote server access delay time +crawler.latencyFactor = 0.5 + # maximum size of indexing queue indexer.slots = 100 diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index e78f501c3..58d555029 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -306,7 +306,7 @@ public class PerformanceQueues_p { // table thread pool settings prop.put("pool_0_name","Crawler Pool"); - prop.put("pool_0_maxActive", sb.getConfigLong("crawler.MaxActiveThreads", 0)); + prop.put("pool_0_maxActive", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 0)); prop.put("pool_0_numActive",sb.crawlQueues.workerSize()); final YaCyHttpServer httpd = sb.getHttpServer(); diff --git a/htroot/api/latency_p.java b/htroot/api/latency_p.java index cf308580f..7636e6332 100644 --- a/htroot/api/latency_p.java +++ b/htroot/api/latency_p.java @@ -26,6 +26,7 @@ import java.util.Iterator; import java.util.Map; import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.Latency; import net.yacy.crawler.data.Latency.Host; @@ -34,7 +35,7 @@ import net.yacy.server.serverSwitch; public class latency_p { - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) { + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) { final serverObjects prop = new serverObjects(); //final plasmaSwitchboard sb = (plasmaSwitchboard) env; @@ -42,7 +43,7 @@ public class latency_p { Map.Entry e; int c = 0; Latency.Host host; - //ClientIdentification.Agent agent = post == null ? 
ClientIdentification.yacyInternetCrawlerAgent : ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); + ClientIdentification.Agent agent = post == null ? ClientIdentification.yacyInternetCrawlerAgent : ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); while (i.hasNext()) { e = i.next(); host = e.getValue(); @@ -52,7 +53,7 @@ public class latency_p { prop.put("domains_" + c + "_count", host.count()); prop.put("domains_" + c + "_average", host.average()); prop.put("domains_" + c + "_robots", host.robotsDelay()); - prop.put("domains_" + c + "_flux", 0); + prop.put("domains_" + c + "_flux", host.flux(agent.minimumDelta)); c++; } prop.put("domains", c); diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 17f800402..3baf9d61a 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -144,6 +144,27 @@ public class CrawlQueues { } return null; } + + /** + * count the number of same host names in the worker + * @param host + * @return + */ + public int hostcount(final String host) { + if (host == null || host.length() == 0) return 0; + int c = 0; + final int timeout = (int) this.sb.getConfigLong("crawler.clientTimeout", 10000); + for (final Loader worker: this.workers.values()) { + if (worker.isAlive()) { + if (worker.age() > timeout) { + try {worker.interrupt();} catch (Throwable e) {} + } else if (host.equals(worker.request.url().getHost())) { + c++; + } + } + } + return c; + } public void removeURL(final byte[] hash) { assert hash != null && hash.length == 12; @@ -180,8 +201,8 @@ public class CrawlQueues { // wait for all workers to finish final int timeout = (int) this.sb.getConfigLong("crawler.clientTimeout", 10000); for (final Loader w: this.workers.values()) { - if (w.age() > timeout) { - w.interrupt(); + if (w.isAlive() && w.age() > timeout) { + 
try {w.interrupt();} catch (Throwable e) {} } } } diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java index ec25c93e7..e9cd84b8a 100644 --- a/source/net/yacy/crawler/data/Latency.java +++ b/source/net/yacy/crawler/data/Latency.java @@ -36,15 +36,22 @@ import net.yacy.cora.protocol.ClientIdentification; import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxtEntry; import net.yacy.kelondro.util.MemoryControl; +import net.yacy.search.Switchboard; public class Latency { - private final static int DEFAULT_AVERAGE = 300; + private final static int DEFAULT_AVERAGE_LATENCY = 500; + private final static int DEFAULT_MAX_SAME_HOST_IN_QUEUE = 20; + private final static float DEFAULT_LATENCY_FACTOR = 0.5f; // the map is a mapping from host names to host configurations private static final int mapMaxSize = 1000; private static final ConcurrentHashMap map = new ConcurrentHashMap(); + + public static int defaultAverageLatency = DEFAULT_AVERAGE_LATENCY; + public static int MaxSameHostInQueue = DEFAULT_MAX_SAME_HOST_IN_QUEUE; + public static float latencyFactor = DEFAULT_LATENCY_FACTOR; /** * update the latency entry after a host was selected for queueing into the loader @@ -57,7 +64,7 @@ public class Latency { String hosthash = url.hosthash(); Host h = map.get(hosthash); if (h == null) { - h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay); + h = new Host(host, defaultAverageLatency, robotsCrawlDelay); if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear(); map.put(hosthash, h); } @@ -139,7 +146,6 @@ public class Latency { /** * guess a minimum waiting time * the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low - * also the 'isCGI' property is missing, because the full text of the domain is unknown here * @param hostname * @param hosthash * @param robots @@ -156,12 +162,16 @@ public class Latency { // find the minimum waiting time 
based on the network domain (local or global) int waiting = agent.minimumDelta; - if (agent.minimumDelta > ClientIdentification.minimumLocalDeltaInit) { - // use the access latency as rule how fast we can access the server - // this applies also to localhost, but differently, because it is not necessary to - // consider so many external accesses - waiting = Math.max(waiting * 3 / 2, host.average() / 2); - } + // if we have accessed the domain many times, get slower (the flux factor) + waiting += host.flux(waiting); + + // use the access latency as rule how fast we can access the server + // this applies also to localhost, but differently, because it is not necessary to + // consider so many external accesses + waiting = Math.max(waiting, (int) (host.average() * latencyFactor)); + + // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting + if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > MaxSameHostInQueue) waiting += 5000; // the time since last access to the domain is the basis of the remaining calculation final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); @@ -197,17 +207,14 @@ public class Latency { boolean local = url.isLocal(); int waiting = agent.minimumDelta; - if (!local && agent.minimumDelta > ClientIdentification.minimumLocalDeltaInit) { - // for CGI accesses, we double the minimum time - // mostly there is a database access in the background - // which creates a lot of unwanted IO on target site - if (MultiProtocolURL.isCGI(url.getFileName())) { - waiting = waiting * 3 / 2; - } else { - // use the access latency as rule how fast we can access the server - waiting = Math.max(waiting, host.average() / 2); - } - } + // if we have accessed the domain many times, get slower (the flux factor) + if (!local) waiting += host.flux(waiting); + + // use the access latency as rule how fast we can access the server + waiting = Math.max(waiting, (int) 
(host.average() * latencyFactor)); + + // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting + if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > MaxSameHostInQueue) waiting += 5000; // the time since last access to the domain is the basis of the remaining calculation final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); @@ -226,22 +233,33 @@ public class Latency { final Host host = host(url); if (host == null) return "host " + host + " never accessed before -> Integer.MIN_VALUE"; // no delay if host is new + // find the minimum waiting time based on the network domain (local or global) + boolean local = url.isLocal(); final StringBuilder s = new StringBuilder(50); // find the minimum waiting time based on the network domain (local or global) int waiting = agent.minimumDelta; s.append("minimumDelta = ").append(waiting); - // for CGI accesses, we double the minimum time - // mostly there is a database access in the background - // which creates a lot of unwanted IO on target site - if (MultiProtocolURL.isCGI(url.getFileName())) { waiting = waiting * 2; s.append(", isCGI = true -> double"); } - + // if we have accessed the domain many times, get slower (the flux factor) + if (!local) { + int flux = host.flux(waiting); + waiting += flux; + s.append(", flux = ").append(flux); + } + // use the access latency as rule how fast we can access the server // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses s.append(", host.average = ").append(host.average()); - waiting = Math.max(waiting, host.average() * 3 / 2); + waiting = Math.max(waiting, (int) (host.average() * latencyFactor)); + + // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting + int hostcount = 
Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()); + if (hostcount > MaxSameHostInQueue) { + s.append(", hostcount = ").append(hostcount); + waiting += 5000; + } // find the delay as given by robots.txt on target site int robotsDelay = waitingRobots(url, robots, agent); @@ -332,6 +350,9 @@ public class Latency { public long robotsDelay() { return this.robotsMinDelay; } + public int flux(final int range) { + return this.count.get() >= 10000 ? range * Math.min(5000, this.count.get()) / 10000 : range / (10000 - this.count.get()); + } } } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 5aa4fb3eb..e05ca0fe0 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -126,6 +126,7 @@ import net.yacy.crawler.HarvestProcess; import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlQueues; +import net.yacy.crawler.data.Latency; import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.data.ResultImages; import net.yacy.crawler.data.ResultURLs; @@ -824,6 +825,9 @@ public final class Switchboard extends serverSwitch { getDataPath()); OAIListFriendsLoader.init(this.loader, oaiFriends, ClientIdentification.yacyInternetCrawlerAgent); this.crawlQueues = new CrawlQueues(this, this.queuesRoot); + Latency.defaultAverageLatency = this.getConfigInt("crawler.defaultAverageLatency", 500); + Latency.latencyFactor = this.getConfigFloat("crawler.latencyFactor", 0.5f); + Latency.MaxSameHostInQueue = this.getConfigInt("crawler.MaxSameHostInQueue", 20); // on startup, resume all crawls setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused", "false"); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 5f0704d3e..cc1630879 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ 
-332,6 +332,7 @@ public final class SwitchboardConstants { *

Name of the setting for the maximum number of crawler threads that may be active at the same time

*/ public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; + public static final String CRAWLER_SAME_HOST_IN_QUEUE = "crawler.MaxSameHostInQueue"; public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store