From 8952153ecfbc368101d4856a8006b7227bff4fab Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sun, 28 Oct 2012 13:24:49 +0100
Subject: [PATCH] update to Balancer algorithm:

- create a load list from the current list of known hosts
- do not create this list for each Balancer.pop access
- create the list from those hosts which have a zero waiting time
- select the 1/3 of that list which has the most URLs waiting
- get hosts from the waiting list in random order
- fixes for some delta-time computations
- always load all URLs from hosts which have never been loaded before
---
 htroot/IndexCreateQueues_p.java               |   2 +-
 source/net/yacy/crawler/Balancer.java         | 227 +++++++++++-------
 source/net/yacy/crawler/data/Latency.java     | 133 +++++-----
 source/net/yacy/crawler/data/NoticedURL.java  |  32 +--
 .../net/yacy/crawler/retrieval/FTPLoader.java |   2 +-
 .../yacy/crawler/retrieval/HTTPLoader.java    |   2 +-
 6 files changed, 202 insertions(+), 196 deletions(-)
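The selection policy summarized above can be read in isolation: collect the hosts whose politeness delay has already elapsed, keep the third of them with the most URLs waiting, and serve that list in random order until it runs dry. A minimal, self-contained sketch of that policy; all names here are invented for illustration and are not the YaCy classes changed below:

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Random;

// Hypothetical stand-in for the balancer's new host selection; only the
// policy is taken from the commit message above.
public class ZeroWaitingSelectionSketch {

    private final List<String> candidates = new ArrayList<String>();
    private final Random random = new Random();

    // Build the load list once, not on every pop: sort the zero-waiting hosts
    // by the number of URLs they have waiting and keep the top third.
    public void refill(final Map<String, Integer> zeroWaitingHosts) {
        final List<Map.Entry<String, Integer>> hosts =
                new ArrayList<Map.Entry<String, Integer>>(zeroWaitingHosts.entrySet());
        Collections.sort(hosts, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(final Map.Entry<String, Integer> a, final Map.Entry<String, Integer> b) {
                return b.getValue().compareTo(a.getValue()); // most waiting URLs first
            }
        });
        int pick = hosts.size() <= 10 ? hosts.size() : Math.max(1, hosts.size() / 3);
        this.candidates.clear();
        for (int i = 0; i < pick; i++) this.candidates.add(hosts.get(i).getKey());
    }

    // Serve the prepared list in random order; the caller refills it only when
    // it runs dry, which is what saves the per-pop scan over all domain stacks.
    public String pop() {
        if (this.candidates.isEmpty()) return null;
        return this.candidates.remove(this.random.nextInt(this.candidates.size()));
    }
}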
diff --git a/htroot/IndexCreateQueues_p.java b/htroot/IndexCreateQueues_p.java
index d3fd4c7ca..025245129 100644
--- a/htroot/IndexCreateQueues_p.java
+++ b/htroot/IndexCreateQueues_p.java
@@ -130,7 +130,7 @@ public class IndexCreateQueues_p {
                 prop.put("crawler_host_" + hc + "_urlsPerHost", urlsPerHost);
                 prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
                 prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
-                prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1]);
+                prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1] == Integer.MIN_VALUE ? "not accessed" : Integer.toString(host.getValue()[1]));
                 List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
 
                 Seed initiator;
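The servlet now renders the Integer.MIN_VALUE sentinel that getDomainStackHosts() emits for hosts that were never accessed. A hedged consumer-side sketch of the same convention; the loop and the sb handle are illustrative, only getDomainStackHosts, StackType and the {count, delta} value layout come from this patch:

Map<String, Integer[]> hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL);
for (Map.Entry<String, Integer[]> host : hosts.entrySet()) {
    int urlsWaiting = host.getValue()[0]; // number of URLs waiting for this host
    int delta = host.getValue()[1];       // guessed remaining wait in milliseconds
    System.out.println(host.getKey() + ": " + urlsWaiting + " urls, delta = "
            + (delta == Integer.MIN_VALUE ? "not accessed" : delta + " ms"));
}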
diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index dee28ce74..e1d5b142d 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -28,11 +28,12 @@ package net.yacy.crawler;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.AbstractMap;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.concurrent.ConcurrentHashMap;
@@ -44,6 +45,7 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.order.CloneableIterator;
 import net.yacy.cora.protocol.Domains;
+import net.yacy.cora.sorting.OrderedScoreMap;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.data.Cache;
@@ -78,6 +80,8 @@ public class Balancer {
     private final HandleSet double_push_check; // for debugging
     private long lastDomainStackFill;
     private int domStackInitSize;
+    private final List<Map.Entry<String, byte[]>> zeroWaitingCandidates;
+    private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting
 
     public Balancer(
             final File cachePath,
@@ -94,7 +98,9 @@ public class Balancer {
         this.myAgentIDs = myAgentIDs;
         this.domStackInitSize = Integer.MAX_VALUE;
         this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
-
+        this.zeroWaitingCandidates = new ArrayList<Map.Entry<String, byte[]>>();
+        this.random = new Random(System.currentTimeMillis());
+
         // create a stack for newly entered entries
         if (!(cachePath.exists())) cachePath.mkdir(); // make the path
         this.cacheStacksPath.mkdirs();
@@ -205,6 +211,12 @@ public class Balancer {
             for (final byte[] handle: urlHashes) stack.remove(handle);
             if (stack.isEmpty()) q.remove();
         }
+
+        // iterate through the zero-waiting candidate list
+        final Iterator<Map.Entry<String, byte[]>> i = this.zeroWaitingCandidates.iterator();
+        while (i.hasNext()) {
+            if (urlHashes.has(i.next().getValue())) i.remove();
+        }
         return removedCounter;
     }
@@ -274,32 +286,35 @@ public class Balancer {
     public Map<String, Integer[]> getDomainStackHosts() {
         Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
         for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
-            map.put(entry.getKey(), new Integer[]{entry.getValue().size(), (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)});
+            int size = entry.getValue().size();
+            int delta = (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
+            map.put(entry.getKey(), new Integer[]{size, delta});
         }
         return map;
     }
 
     /**
-     * compute the current sleep time for a given crawl entry
-     * @param cs
-     * @param crawlEntry
-     * @return
+     * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
+     * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
+     * @param robots
+     * @param profileEntry
+     * @param crawlURL
+     * @return the sleep time in milliseconds; may be negative for no sleep time
      */
-    public long getDomainSleepTime(final CrawlSwitchboard cs, final RobotsTxt robots, Request crawlEntry) {
-        final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
-        return getDomainSleepTime(robots, profileEntry, crawlEntry.url());
-    }
-
     private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
-        if (profileEntry == null) {
-            return 0;
-        }
+        if (profileEntry == null) return 0;
         long sleeptime = (
             profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
             (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
-            ) ? 0 : Latency.waitingRemaining(crawlURL, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+            ) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
         return sleeptime;
     }
+
+    private long getRobotsTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
+        if (profileEntry == null) return 0;
+        long sleeptime = Latency.waitingRobots(crawlURL, robots, this.myAgentIDs); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+        return sleeptime < 0 ? 0 : sleeptime;
+    }
 
     /**
      * get lists of crawl request entries for a specific host
@@ -377,6 +392,7 @@ public class Balancer {
 
         long sleeptime = 0;
         Request crawlEntry = null;
+        CrawlProfile profileEntry = null;
         synchronized (this) {
             byte[] failhash = null;
             while (!this.urlFileIndex.isEmpty()) {
@@ -408,7 +424,7 @@ public class Balancer {
 
                 // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
                 // if not: return null. A calling method must handle the null value and try again
-                final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+                profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
                 if (profileEntry == null) {
                     Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                     return null;
@@ -425,6 +441,8 @@ public class Balancer {
         }
         if (crawlEntry == null) return null;
 
+        long robotsTime = getRobotsTime(robots, profileEntry, crawlEntry.url());
+        Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
         if (delay && sleeptime > 0) {
             // force a busy waiting here
             // in best case, this should never happen if the balancer works properly
@@ -442,96 +460,119 @@ public class Balancer {
                 Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                 try {this.wait(1000); } catch (final InterruptedException e) {}
             }
+            Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
         }
-        Latency.update(crawlEntry.url());
         return crawlEntry;
     }
 
     private byte[] getbest(final RobotsTxt robots) {
-        // check if we need to get entries from the file index
-        try {
-            fillDomainStacks();
-        } catch (final IOException e) {
-            Log.logException(e);
-        }
-
-        // iterate over the domain stacks
-        final Iterator<Map.Entry<String, HandleSet>> i = this.domainStacks.entrySet().iterator();
-        Map.Entry<String, HandleSet> entry;
-        long smallestWaiting = Long.MAX_VALUE;
-        byte[] besturlhash = null;
-        String besthost = null;
-        Map<String, byte[]> zeroWaitingCandidates = new HashMap<String, byte[]>();
-        while (i.hasNext() && zeroWaitingCandidates.size() < 10) {
-            entry = i.next();
-
-            // clean up empty entries
-            if (entry.getValue().isEmpty()) {
-                i.remove();
-                continue;
+        synchronized (this.zeroWaitingCandidates) {
+            if (this.zeroWaitingCandidates.size() > 0) {
+                byte[] urlhash = pickFromZeroWaiting();
+                if (urlhash != null) return urlhash;
             }
-
-            final byte[] n = entry.getValue().removeOne();
-            if (n == null) continue;
-
-            long w;
-            Row.Entry rowEntry;
-            try {
-                rowEntry = this.urlFileIndex.get(n, false);
-                if (rowEntry == null) {
-                    continue;
+            this.zeroWaitingCandidates.clear();
+
+            // check if we need to get entries from the file index
+            try {
+                fillDomainStacks();
+            } catch (final IOException e) {
+                Log.logException(e);
+            }
+
+            // iterate over the domain stacks
+            final Iterator<Map.Entry<String, HandleSet>> i = this.domainStacks.entrySet().iterator();
+            Map.Entry<String, HandleSet> entry;
+            long smallestWaiting = Long.MAX_VALUE;
+            byte[] besturlhash = null;
+            String besthost = null;
+            OrderedScoreMap<Map.Entry<String, byte[]>> nextZeroCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
+            while (i.hasNext() && nextZeroCandidates.size() < 1000) {
+                entry = i.next();
+
+                // clean up empty entries
+                if (entry.getValue().isEmpty()) {
+                    i.remove();
+                    continue;
+                }
+
+                final byte[] urlhash = entry.getValue().getOne(0);
+                if (urlhash == null) continue;
+
+                long w;
+                Row.Entry rowEntry;
+                try {
+                    rowEntry = this.urlFileIndex.get(urlhash, false);
+                    if (rowEntry == null) {
+                        continue;
+                    }
+                    Request crawlEntry = new Request(rowEntry);
+                    w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
+                    //System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
+                    //System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
+                } catch (IOException e1) {
+                    w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
                 }
-                Request crawlEntry = new Request(rowEntry);
-                w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
-                //System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
-                //System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
-            } catch (IOException e1) {
-                w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
-            }
-            if (w < smallestWaiting) {
-                smallestWaiting = w;
-                besturlhash = n;
-                besthost = entry.getKey();
                 if (w <= 0) {
-                    zeroWaitingCandidates.put(besthost, besturlhash);
+                    nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), w == Integer.MIN_VALUE ? 1000 /* get new domains a chance */ : entry.getValue().size());
                 }
-            }
-            try {
-                entry.getValue().put(n); // put entry back, we are checking only
-            } catch (SpaceExceededException e) {
-                e.printStackTrace();
-            }
-        }
+                if (w < smallestWaiting || (w == smallestWaiting && this.random.nextBoolean())) {
+                    smallestWaiting = w;
+                    besturlhash = urlhash;
+                    besthost = entry.getKey();
+                }
+            }
+            Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
+
+            if (besturlhash == null) {
+                Log.logInfo("Balancer", "*** getbest: besturlhash == null");
+                return null; // this should never happen
+            }
+
+            // best case would be, if we have some zeroWaitingCandidates,
+            // then we select that one with the largest stack
+
+            if (nextZeroCandidates.isEmpty()) {
+                // bad luck: just take that one with least waiting
+                removeHashFromDomainStacks(besthost, besturlhash);
+                Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost);
+                return besturlhash;
+            }
+
+            // now take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates
+            int pick = nextZeroCandidates.size() <= 10 ? nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3);
+            Iterator<Map.Entry<String, byte[]>> k = nextZeroCandidates.keys(false);
+            while (k.hasNext() && pick-- > 0) {
+                this.zeroWaitingCandidates.add(k.next());
+            }
+            Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
+
+            return pickFromZeroWaiting();
+        }
+    }
 
-        if (besturlhash == null) return null; // worst case
-
-        // best case would be, if we have some zeroWaitingCandidates,
-        // then we select that one with the largest stack
-        if (!zeroWaitingCandidates.isEmpty()) {
-            int largestStack = -1;
-            String largestStackHost = null;
-            byte[] largestStackHash = null;
-            for (Map.Entry<String, byte[]> z: zeroWaitingCandidates.entrySet()) {
-                HandleSet hs = this.domainStacks.get(z.getKey());
-                if (hs == null || hs.size() <= largestStack) continue;
-                largestStack = hs.size();
-                largestStackHost = z.getKey();
-                largestStackHash = z.getValue();
-            }
-            if (largestStackHost != null && largestStackHash != null) {
-                removeHashFromDomainStacks(largestStackHost, largestStackHash);
-                //Log.logInfo("Balancer", "*** picked one from largest stack");
-                return largestStackHash;
-            }
-        }
+    private byte[] pickFromZeroWaiting() {
+        // by random we choose now either from the largest stack or from any of the other stacks
+        String host = null;
+        byte[] hash = null;
+        while (this.zeroWaitingCandidates.size() > 0) {
+            Map.Entry<String, byte[]> z = this.zeroWaitingCandidates.remove(this.random.nextInt(this.zeroWaitingCandidates.size()));
+            HandleSet hs = this.domainStacks.get(z.getKey());
+            if (hs == null) continue;
+            host = z.getKey(); if (host == null) continue;
+            hash = z.getValue(); if (hash == null) continue;
+            removeHashFromDomainStacks(host, hash);
+            Log.logInfo("Balancer", "*** getbest: picked a random from the zero-waiting stack: " + host + ", zeroWaitingCandidates.size = " + this.zeroWaitingCandidates.size());
+            return hash;
+        }
 
-        // default case: just take that one with least waiting
-        removeHashFromDomainStacks(besthost, besturlhash);
-        return besturlhash;
+        Log.logInfo("Balancer", "*** getbest: picking from zero-waiting stack failed!" + " zeroWaitingCandidates.size = " + this.zeroWaitingCandidates.size());
+        this.zeroWaitingCandidates.clear();
+        return null;
     }
-
+
     private void fillDomainStacks() throws IOException {
         if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
         this.domainStacks.clear();
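One detail worth noting in getbest(): the comparison `w < smallestWaiting || (w == smallestWaiting && this.random.nextBoolean())` breaks ties by coin flip, so iteration order no longer decides which of several equally due hosts becomes the fallback pick. A standalone illustration with hypothetical data, not YaCy code:

import java.util.Random;

// Among equally small waiting times, each later tie has a 50% chance to
// replace the current best, so no host is systematically preferred just
// because the iterator visits it first.
public class TieBreakSketch {
    public static void main(final String[] args) {
        final long[] waiting = {40, 10, 10, 25, 10}; // hypothetical waiting times
        final Random random = new Random();
        long smallestWaiting = Long.MAX_VALUE;
        int best = -1;
        for (int i = 0; i < waiting.length; i++) {
            if (waiting[i] < smallestWaiting || (waiting[i] == smallestWaiting && random.nextBoolean())) {
                smallestWaiting = waiting[i];
                best = i;
            }
        }
        System.out.println("picked index " + best + ", waiting = " + smallestWaiting);
    }
}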
+ " zeroWaitingCandidates.size = " + this.zeroWaitingCandidates.size()); + this.zeroWaitingCandidates.clear(); + return null; } - + private void fillDomainStacks() throws IOException { if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return; this.domainStacks.clear(); diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java index 198b2448f..113ee976b 100644 --- a/source/net/yacy/crawler/data/Latency.java +++ b/source/net/yacy/crawler/data/Latency.java @@ -43,7 +43,12 @@ public class Latency { // the map is a mapping from host names to host configurations private static final ConcurrentHashMap map = new ConcurrentHashMap(); - public static void update(final MultiProtocolURI url, final long time) { + /** + * update the latency entry after a host was accessed to load a file + * @param url + * @param time the time to load the file in milliseconds + */ + public static void updateAfterLoad(final MultiProtocolURI url, final long time) { final String host = url.getHost(); if (host == null) return; Host h = map.get(host); @@ -56,12 +61,17 @@ public class Latency { } } - public static void update(final MultiProtocolURI url) { + /** + * update the latency entry after a host was selected for queueing into the loader + * @param url + * @param robotsCrawlDelay the crawl-delay given by the robots; 0 if not exist + */ + public static void updateAfterSelection(final MultiProtocolURI url, final long robotsCrawlDelay) { final String host = url.getHost(); if (host == null) return; Host h = map.get(host); if (h == null) { - h = new Host(host, DEFAULT_AVERAGE); + h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay); if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear(); map.put(host, h); } else { @@ -69,51 +79,38 @@ public class Latency { } } - public static void slowdown(final MultiProtocolURI url) { - final String host = url.getHost(); - if (host == null) return; - Host h = map.get(host); - if (h == null) { - h = new Host(host, DEFAULT_AVERAGE); - if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear(); - map.put(host, h); - } else { - h.slowdown(); - } - } - - public static Host host(final MultiProtocolURI url) { + private static Host host(final MultiProtocolURI url) { final String host = url.getHost(); if (host == null) return null; return map.get(host); } - public static int average(final MultiProtocolURI url) { - final String host = url.getHost(); - if (host == null) return 0; - final Host h = map.get(host); - if (h == null) return 0; - return h.average(); - } - public static Iterator> iterator() { return map.entrySet().iterator(); } - /** - * calculate the time since the last access of the domain as referenced by the url hash - * @param urlhash - * @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before + * Return the waiting time demanded by the robots.txt file of the target host. + * A special case is, if the remote host has a special crawl-delay assignment for + * this crawler with 0. 
This causes that a -1 is returned + * @param url + * @param robots + * @param thisAgents + * @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights */ - public static long lastAccessDelta(final MultiProtocolURI url) { - final Latency.Host host = Latency.host(url); - if (host == null) return Long.MAX_VALUE; // never accessed - return System.currentTimeMillis() - host.lastacc(); + public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents) { + long robotsDelay = 0; + RobotsTxtEntry robotsEntry; + try { + robotsEntry = robots.getEntry(url, thisAgents); + } catch (final IOException e) { + robotsEntry = null; + } + robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis(); + if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer + return robotsDelay; } - - /** * guess a minimum waiting time * the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low @@ -125,11 +122,11 @@ public class Latency { * which expresses how long the time is over the minimum waiting time. */ public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) { - if (hostname == null) return Long.MIN_VALUE; + if (hostname == null) return Integer.MIN_VALUE; // first check if the domain was _ever_ accessed before final Host host = map.get(hostname); - if (host == null) return Long.MIN_VALUE; // no delay if host is new + if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere // find the minimum waiting time based on the network domain (local or global) final boolean local = Domains.isLocal(hostname, null); @@ -139,14 +136,15 @@ public class Latency { // if we have accessed the domain many times, get slower (the flux factor) waiting += host.flux(waiting); + // the time since last access to the domain is the basis of the remaining calculation + final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); + // use the access latency as rule how fast we can access the server // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses waiting = Math.max(waiting, host.average() * 2); - // the time since last access to the domain is the basis of the remaining calculation - final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); - return Math.max(0, Math.min(60000, waiting) - timeSinceLastAccess); + return Math.min(60000, waiting) - timeSinceLastAccess; } /** @@ -159,13 +157,13 @@ public class Latency { * - and a given minimum access time as given in robots.txt * @param minimumLocalDelta * @param minimumGlobalDelta - * @return the remaining waiting time in milliseconds + * @return the remaining waiting time in milliseconds. 
can be negative to reflect the due-time after a possible nex loading time */ public static long waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) { // first check if the domain was _ever_ accessed before final Host host = host(url); - if (host == null) return Long.MIN_VALUE; // no delay if host is new + if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere // find the minimum waiting time based on the network domain (local or global) final boolean local = url.isLocal(); @@ -185,22 +183,15 @@ public class Latency { // consider so many external accesses waiting = Math.max(waiting, host.average() * 2); + // the time since last access to the domain is the basis of the remaining calculation + final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); + // find the delay as given by robots.txt on target site - long robotsDelay = 0; - RobotsTxtEntry robotsEntry; - try { - robotsEntry = robots.getEntry(url, thisAgents); - } catch (final IOException e) { - robotsEntry = null; - } - robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis(); - if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return 0; // no limits if granted exclusively for this peer + long robotsDelay = waitingRobots(url, robots, thisAgents); + if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer waiting = Math.max(waiting, robotsDelay); - - // the time since last access to the domain is the basis of the remaining calculation - final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); - return Math.max(0, Math.min(60000, waiting) - timeSinceLastAccess); + return Math.min(60000, waiting) - timeSinceLastAccess; } @@ -235,15 +226,8 @@ public class Latency { waiting = Math.max(waiting, host.average() * 2); // find the delay as given by robots.txt on target site - long robotsDelay = 0; - RobotsTxtEntry robotsEntry; - try { - robotsEntry = robots.getEntry(url, thisAgents); - } catch (final IOException e) { - robotsEntry = null; - } - robotsDelay = (robotsEntry == null) ? 
0 : robotsEntry.getCrawlDelayMillis(); - if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer + long robotsDelay = waitingRobots(url, robots, thisAgents); + if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer waiting = Math.max(waiting, robotsDelay); s.append(", robots.delay = ").append(robotsDelay); @@ -262,26 +246,24 @@ public class Latency { private int count; private final String host; private long robotsMinDelay; - public Host(final String host, final long time) { + private Host(final String host, final long time) { + this(host, time, 0); + } + private Host(final String host, final long time, long robotsMinDelay) { this.host = host; this.timeacc = time; this.count = 1; this.lastacc = System.currentTimeMillis(); - this.robotsMinDelay = 0; + this.robotsMinDelay = robotsMinDelay; } - public void update(final long time) { + private void update(final long time) { this.lastacc = System.currentTimeMillis(); this.timeacc += Math.min(30000, time); this.count++; } - public void update() { + private void update() { this.lastacc = System.currentTimeMillis(); } - public void slowdown() { - this.lastacc = System.currentTimeMillis(); - this.timeacc = Math.min(60000, average() * 2); - this.count = 1; - } public int count() { return this.count; } @@ -294,14 +276,11 @@ public class Latency { public String host() { return this.host; } - public void robotsDelay(final long ur) { - this.robotsMinDelay = ur; - } public long robotsDelay() { return this.robotsMinDelay; } public long flux(final long range) { - return this.count >= 1000 ? range * Math.min(5000, this.count) / 1000 : range / (1000 - this.count); + return this.count >= 10000 ? 
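Condensed, the remaining-wait computation after this patch behaves as sketched below. The method is a hypothetical restatement, not the YaCy source, but it preserves the two conventions the patch introduces: Integer.MIN_VALUE for hosts that were never accessed (so they are always loaded immediately), and negative results for hosts that are already past their due time.

public class WaitingRemainingSketch {

    // Hypothetical restatement of Latency.waitingRemaining(); lastAccessTime == 0
    // stands in for "host never seen" (the real code checks for a missing Host
    // entry in its map instead).
    public static long waitingRemaining(final long minimumDelta, final long flux,
            final long averageLatency, final long robotsDelayMillis, final long lastAccessTime) {
        if (lastAccessTime == 0) return Integer.MIN_VALUE;      // never accessed: no limitation at all
        long waiting = minimumDelta + flux;                     // politeness delta plus the flux factor
        waiting = Math.max(waiting, averageLatency * 2);        // adapt to the measured server speed
        waiting = Math.max(waiting, robotsDelayMillis);         // obey the robots.txt crawl-delay
        final long timeSinceLastAccess = System.currentTimeMillis() - lastAccessTime;
        return Math.min(60000, waiting) - timeSinceLastAccess;  // capped at 60s; negative = already past due
    }
}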
diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java
index 592619d92..2c76050b9 100644
--- a/source/net/yacy/crawler/data/NoticedURL.java
+++ b/source/net/yacy/crawler/data/NoticedURL.java
@@ -51,15 +51,15 @@ public class NoticedURL {
         LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
     }
 
-    public static final long minimumLocalDeltaInit  =  10; // the minimum time difference between access of the same local domain
-    public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
+    private static final long minimumLocalDeltaInit =  10; // the minimum time difference between access of the same local domain
+    public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
 
     private Balancer coreStack;      // links found by crawling to depth-1
     private Balancer limitStack;     // links found by crawling at target depth
     private Balancer remoteStack;    // links from remote crawl orders
     private Balancer noloadStack;    // links that are not passed to a loader; the index will be generated from the Request entry
 
-    public NoticedURL(
+    protected NoticedURL(
             final File cachePath,
             final Set<String> myAgentIDs,
             final boolean useTailCache,
@@ -87,7 +87,7 @@ public class NoticedURL {
         this.noloadStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
     }
 
-    public void clear() {
+    protected void clear() {
         Log.logInfo("NoticedURL", "CLEARING ALL STACKS");
         this.coreStack.clear();
         this.limitStack.clear();
@@ -95,7 +95,7 @@ public class NoticedURL {
         this.noloadStack.clear();
     }
 
-    public synchronized void close() {
+    protected synchronized void close() {
         Log.logInfo("NoticedURL", "CLOSING ALL STACKS");
         if (this.coreStack != null) {
             this.coreStack.close();
@@ -158,7 +158,7 @@ public class NoticedURL {
         }
     }
 
-    public boolean existsInStack(final byte[] urlhashb) {
+    protected boolean existsInStack(final byte[] urlhashb) {
         return
             this.coreStack.has(urlhashb) ||
             this.limitStack.has(urlhashb) ||
@@ -193,7 +193,7 @@ public class NoticedURL {
         }
     }
 
-    public Request get(final byte[] urlhash) {
+    protected Request get(final byte[] urlhash) {
         Request entry = null;
         try {if ((entry = this.noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
         try {if ((entry = this.coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
@@ -235,7 +235,7 @@ public class NoticedURL {
 
     /**
      * get a list of domains that are currently maintained as domain stacks
-     * @return a map of clear text strings of host names to the size of the domain stacks
+     * @return a map of clear text strings of host names to two integers: the size of the domain stack and the access delta time
      */
     public Map<String, Integer[]> getDomainStackHosts(final StackType stackType) {
         switch (stackType) {
@@ -247,20 +247,6 @@ public class NoticedURL {
         }
     }
 
-    /**
-     * get a list of domains that are currently maintained as domain stacks
-     * @return a collection of clear text strings of host names
-     */
-    public long getDomainSleepTime(final StackType stackType, final RobotsTxt robots, final CrawlSwitchboard cs, Request crawlEntry) {
-        switch (stackType) {
-            case LOCAL:  return this.coreStack.getDomainSleepTime(cs, robots, crawlEntry);
-            case GLOBAL: return this.limitStack.getDomainSleepTime(cs, robots, crawlEntry);
-            case REMOTE: return this.remoteStack.getDomainSleepTime(cs, robots, crawlEntry);
-            case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, robots, crawlEntry);
-            default: return 0;
-        }
-    }
-
     /**
      * get lists of crawl request entries for a specific host
      * @param host
@@ -287,7 +273,7 @@ public class NoticedURL {
         }
     }
 
-    public void shift(final StackType fromStack, final StackType toStack, final CrawlSwitchboard cs, final RobotsTxt robots) {
+    protected void shift(final StackType fromStack, final StackType toStack, final CrawlSwitchboard cs, final RobotsTxt robots) {
         try {
             final Request entry = pop(fromStack, false, cs, robots);
             if (entry != null) {
diff --git a/source/net/yacy/crawler/retrieval/FTPLoader.java b/source/net/yacy/crawler/retrieval/FTPLoader.java
index 6e292d462..93fb0ee2c 100644
--- a/source/net/yacy/crawler/retrieval/FTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/FTPLoader.java
@@ -158,7 +158,7 @@ public class FTPLoader {
             throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
         }
 
-        Latency.update(request.url(), System.currentTimeMillis() - start);
+        Latency.updateAfterLoad(request.url(), System.currentTimeMillis() - start);
         return response;
     }
 
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index bad4a153b..7846e1533 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -73,7 +73,7 @@ public final class HTTPLoader {
     public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
         final long start = System.currentTimeMillis();
         final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType);
-        Latency.update(entry.url(), System.currentTimeMillis() - start);
+        Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }
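Both loader changes follow the same pattern: measure the wall-clock load time and report it through the renamed updateAfterLoad(), so the per-host averages used by the balancer reflect real server speed. Sketched below under the assumption of a placeholder doLoad() for the protocol-specific download; only Latency.updateAfterLoad and the Request/Response types come from the patch:

// Pattern shared by FTPLoader and HTTPLoader after the rename.
Response loadWithLatencyTracking(final Request request) throws IOException {
    final long start = System.currentTimeMillis();
    final Response response = doLoad(request); // placeholder for the actual download
    Latency.updateAfterLoad(request.url(), System.currentTimeMillis() - start);
    return response;
}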