From 0fe8be79814b4b0b7b77e8b0eebd3a0c2eac509c Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 30 Oct 2012 17:30:24 +0100 Subject: [PATCH] enhanced data structures for balancer and latency computation, which should produce a somewhat better prognosis of forced waiting times. --- htroot/IndexCreateQueues_p.java | 2 +- source/net/yacy/crawler/Balancer.java | 147 ++++++++++-------- source/net/yacy/crawler/data/Latency.java | 59 ++++--- source/net/yacy/crawler/data/NoticedURL.java | 10 +- source/net/yacy/crawler/robots/RobotsTxt.java | 5 +- .../net/yacy/data/ymark/YMarkCrawlStart.java | 2 +- source/net/yacy/peers/RemoteSearch.java | 2 +- 7 files changed, 131 insertions(+), 96 deletions(-) diff --git a/htroot/IndexCreateQueues_p.java b/htroot/IndexCreateQueues_p.java index 025245129..13280211d 100644 --- a/htroot/IndexCreateQueues_p.java +++ b/htroot/IndexCreateQueues_p.java @@ -121,7 +121,7 @@ public class IndexCreateQueues_p { prop.put("crawler_embed_deletepattern", deletepattern); prop.put("crawler_embed_queuename", stackType.name()); - final Map hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType); + final Map hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType, sb.robots); int hc = 0; for (Map.Entry host: hosts.entrySet()) { diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index e54f455c1..8f9ad115f 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -76,13 +76,22 @@ public class Balancer { private BufferedObjectIndex urlFileIndex; // class variables computed during operation - private final ConcurrentMap domainStacks; // a map from host name to lists with url hashs + private final ConcurrentMap domainStacks; // a map from host name to lists with url hashs private final HandleSet double_push_check; // for debugging private long lastDomainStackFill; private int domStackInitSize; private final List> zeroWaitingCandidates; private final Random
random; // used to alternate between choose-from-maxstack or choose from any zero-waiting + private static class HostHandles { + public String hosthash; + public HandleSet handleSet; + public HostHandles(final String hosthash, final HandleSet handleSet) { + this.hosthash = hosthash; + this.handleSet = handleSet; + } + } + public Balancer( final File cachePath, final String stackname, @@ -92,7 +101,7 @@ public class Balancer { final boolean useTailCache, final boolean exceed134217727) { this.cacheStacksPath = cachePath; - this.domainStacks = new ConcurrentHashMap(); + this.domainStacks = new ConcurrentHashMap(); this.minimumLocalDelta = minimumLocalDelta; this.minimumGlobalDelta = minimumGlobalDelta; this.myAgentIDs = myAgentIDs; @@ -204,10 +213,10 @@ public class Balancer { assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s; // iterate through the domain stacks - final Iterator> q = this.domainStacks.entrySet().iterator(); + final Iterator> q = this.domainStacks.entrySet().iterator(); HandleSet stack; while (q.hasNext()) { - stack = q.next().getValue(); + stack = q.next().getValue().handleSet; for (final byte[] handle: urlHashes) stack.remove(handle); if (stack.isEmpty()) q.remove(); } @@ -242,8 +251,8 @@ public class Balancer { private boolean domainStacksNotEmpty() { if (this.domainStacks == null) return false; synchronized (this.domainStacks) { - for (final HandleSet l: this.domainStacks.values()) { - if (!l.isEmpty()) return true; + for (final HostHandles l: this.domainStacks.values()) { + if (!l.handleSet.isEmpty()) return true; } } return false; @@ -285,11 +294,11 @@ public class Balancer { * get a list of domains that are currently maintained as domain stacks * @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time} */ - public Map getDomainStackHosts() { + public Map getDomainStackHosts(RobotsTxt robots) { Map map 
= new TreeMap(); // we use a tree map to get a stable ordering - for (Map.Entry entry: this.domainStacks.entrySet()) { - int size = entry.getValue().size(); - int delta = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); + for (Map.Entry entry: this.domainStacks.entrySet()) { + int size = entry.getValue().handleSet.size(); + int delta = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); map.put(entry.getKey(), new Integer[]{size, delta}); } return map; @@ -333,8 +342,10 @@ public class Balancer { * @return a list of crawl loader requests */ public List getDomainStackReferences(String host, int maxcount) { - HandleSet domainList = this.domainStacks.get(host); - if (domainList == null || domainList.isEmpty()) return new ArrayList(0); + HostHandles hh = this.domainStacks.get(host); + if (hh == null) return new ArrayList(0); + HandleSet domainList = hh.handleSet; + if (domainList.isEmpty()) return new ArrayList(0); ArrayList cel = new ArrayList(maxcount); for (int i = 0; i < maxcount; i++) { if (domainList.size() <= i) break; @@ -358,16 +369,17 @@ public class Balancer { return cel; } - private void pushHashToDomainStacks(String host, final byte[] urlhash) throws SpaceExceededException { + private void pushHashToDomainStacks(String host, String hosthash, final byte[] urlhash) throws SpaceExceededException { // extend domain stack if (host == null) host = Domains.LOCALHOST; - HandleSet domainList = this.domainStacks.get(host); - if (domainList == null) { + HostHandles hh = this.domainStacks.get(host); + if (hh == null) { // create new list - domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1); + HandleSet domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1); domainList.put(urlhash); - this.domainStacks.put(host, domainList); + this.domainStacks.put(host, new HostHandles(hosthash, domainList)); } else { 
+ HandleSet domainList = hh.handleSet; // extend existent domain list domainList.put(urlhash); } @@ -376,11 +388,12 @@ public class Balancer { private void removeHashFromDomainStacks(String host, final byte[] urlhash) { // reduce domain stack if (host == null) host = Domains.LOCALHOST; - final HandleSet domainList = this.domainStacks.get(host); - if (domainList == null) { + HostHandles hh = this.domainStacks.get(host); + if (hh == null) { this.domainStacks.remove(host); return; } + HandleSet domainList = hh.handleSet; domainList.remove(urlhash); if (domainList.isEmpty()) this.domainStacks.remove(host); } @@ -495,26 +508,24 @@ public class Balancer { } // iterate over the domain stacks - final Iterator> i = this.domainStacks.entrySet().iterator(); - Map.Entry entry; - long smallestWaiting = Long.MAX_VALUE; - byte[] besturlhash = null; - String besthost = null; - OrderedScoreMap> nextZeroCandidates = new OrderedScoreMap>(null); - int newCandidatesForward = 10; + final Iterator> i = this.domainStacks.entrySet().iterator(); + Map.Entry entry; + OrderedScoreMap> nextZeroCandidates = new OrderedScoreMap>(null); + OrderedScoreMap> failoverCandidates = new OrderedScoreMap>(null); + int newCandidatesForward = 1; while (i.hasNext() && nextZeroCandidates.size() < 1000) { entry = i.next(); // clean up empty entries - if (entry.getValue().isEmpty()) { + if (entry.getValue().handleSet.isEmpty()) { i.remove(); continue; } - final byte[] urlhash = entry.getValue().getOne(0); + final byte[] urlhash = entry.getValue().handleSet.getOne(0); if (urlhash == null) continue; - long w; + int w; Row.Entry rowEntry; try { rowEntry = this.urlFileIndex.get(urlhash, false); @@ -526,50 +537,55 @@ public class Balancer { //System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)); //System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, 
this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta)); } catch (IOException e1) { - w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); + w = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); } if (w <= 0) { - if (w == Integer.MIN_VALUE && newCandidatesForward > 0) { - // give new domains a chance, but not too much; otherwise a massive downloading of robots.txt from too much domains (dns lock!) will more likely block crawling - newCandidatesForward--; - nextZeroCandidates.set(new AbstractMap.SimpleEntry(entry.getKey(), urlhash), 1000); + if (w == Integer.MIN_VALUE) { + if (newCandidatesForward-- > 0) { + nextZeroCandidates.set(new AbstractMap.SimpleEntry(entry.getKey(), urlhash), 10000); + } else { + failoverCandidates.set(new AbstractMap.SimpleEntry(entry.getKey(), urlhash), 0); + } } else { - nextZeroCandidates.set(new AbstractMap.SimpleEntry(entry.getKey(), urlhash), entry.getValue().size()); + nextZeroCandidates.set(new AbstractMap.SimpleEntry(entry.getKey(), urlhash), entry.getValue().handleSet.size()); } - } - if (w < smallestWaiting || (w == smallestWaiting && this.random.nextBoolean())) { - smallestWaiting = w; - besturlhash = urlhash; - besthost = entry.getKey(); + } else { + failoverCandidates.set(new AbstractMap.SimpleEntry(entry.getKey(), urlhash), w); } } Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size()); - if (besturlhash == null) { - Log.logInfo("Balancer", "*** getbest: besturlhash == null"); - return null; // this should never happen + if (!nextZeroCandidates.isEmpty()) { + // take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates + int pick = nextZeroCandidates.size() <= 10 ? 
nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3); + Iterator> k = nextZeroCandidates.keys(false); + while (k.hasNext() && pick-- > 0) { + this.zeroWaitingCandidates.add(k.next()); + } + Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size()); + + return pickFromZeroWaiting(); } - - // best case would be, if we have some zeroWaitingCandidates, - // then we select that one with the largest stack - - if (nextZeroCandidates.isEmpty()) { + + if (!failoverCandidates.isEmpty()) { // bad luck: just take that one with least waiting - removeHashFromDomainStacks(besthost, besturlhash); - Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost); - return besturlhash; - } + Iterator> k = failoverCandidates.keys(true); + String besthost; + byte[] besturlhash; + Map.Entry hosthash; + while (k.hasNext()) { + hosthash = k.next(); + besthost = hosthash.getKey(); + besturlhash = hosthash.getValue(); + removeHashFromDomainStacks(besthost, besturlhash); + Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost); + return besturlhash; + } + } - // now take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates - int pick = nextZeroCandidates.size() <= 10 ? 
nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3); - Iterator> k = nextZeroCandidates.keys(false); - while (k.hasNext() && pick-- > 0) { - this.zeroWaitingCandidates.add(k.next()); - } - Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size()); - - return pickFromZeroWaiting(); + Log.logInfo("Balancer", "*** getbest: besturlhash == null"); + return null; // this should never happen } } @@ -579,8 +595,8 @@ public class Balancer { byte[] hash = null; while (this.zeroWaitingCandidates.size() > 0) { Map.Entry z = this.zeroWaitingCandidates.remove(this.random.nextInt(this.zeroWaitingCandidates.size())); - HandleSet hs = this.domainStacks.get(z.getKey()); - if (hs == null) continue; + HostHandles hh = this.domainStacks.get(z.getKey()); + if (hh == null) continue; host = z.getKey(); if (host == null) continue; hash = z.getValue(); if (hash == null) continue; removeHashFromDomainStacks(host, hash); @@ -604,6 +620,7 @@ public class Balancer { String host; Request request; int count = 0; + long timeout = System.currentTimeMillis() + 5000; while (i.hasNext()) { handle = i.next(); final Row.Entry entry = this.urlFileIndex.get(handle, false); @@ -611,12 +628,12 @@ public class Balancer { request = new Request(entry); host = request.url().getHost(); try { - pushHashToDomainStacks(host, handle); + pushHashToDomainStacks(host, request.url().hosthash(), handle); } catch (final SpaceExceededException e) { break; } count++; - if (this.domainStacks.size() >= 100 || (!this.domainStacks.isEmpty() && count > 600 * this.domainStacks.size())) break; + if (this.domainStacks.size() >= 1000 || count >= 100000 || System.currentTimeMillis() > timeout) break; } Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + 
(System.currentTimeMillis() - this.lastDomainStackFill) + " ms"); this.domStackInitSize = this.domainStacks.size(); diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java index 34227ce30..a938194ca 100644 --- a/source/net/yacy/crawler/data/Latency.java +++ b/source/net/yacy/crawler/data/Latency.java @@ -32,6 +32,7 @@ import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Domains; import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxtEntry; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.MemoryControl; @@ -47,14 +48,15 @@ public class Latency { * @param url * @param time the time to load the file in milliseconds */ - public static void updateAfterLoad(final MultiProtocolURI url, final long time) { + public static void updateAfterLoad(final DigestURI url, final long time) { final String host = url.getHost(); if (host == null) return; - Host h = map.get(host); + String hosthash = url.hosthash(); + Host h = map.get(hosthash); if (h == null) { h = new Host(host, time); if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear(); - map.put(host, h); + map.put(hosthash, h); } else { h.update(time); } @@ -65,23 +67,24 @@ public class Latency { * @param url * @param robotsCrawlDelay the crawl-delay given by the robots; 0 if not exist */ - public static void updateAfterSelection(final MultiProtocolURI url, final long robotsCrawlDelay) { + public static void updateAfterSelection(final DigestURI url, final long robotsCrawlDelay) { final String host = url.getHost(); if (host == null) return; - Host h = map.get(host); + String hosthash = url.hosthash(); + Host h = map.get(hosthash); if (h == null) { h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay); if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear(); - map.put(host, h); + map.put(hosthash, h); } else { h.update(); } } - private static Host host(final MultiProtocolURI url) { + 
private static Host host(final DigestURI url) { final String host = url.getHost(); if (host == null) return null; - return map.get(host); + return map.get(url.hosthash()); } public static Iterator> iterator() { @@ -104,41 +107,58 @@ public class Latency { if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer return robotsDelay; } + + private static int waitingRobots(final String hostport, final RobotsTxt robots, final Set thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) { + int robotsDelay = 0; + RobotsTxtEntry robotsEntry = robots.getEntry(hostport, thisAgents, fetchOnlineIfNotAvailableOrNotFresh); + robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis(); + if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer + return robotsDelay; + } /** * guess a minimum waiting time * the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low * also the 'isCGI' property is missing, because the full text of the domain is unknown here * @param hostname + * @param hosthash + * @param robots + * @param thisAgents * @param minimumLocalDelta * @param minimumGlobalDelta * @return the remaining waiting time in milliseconds. The return value may be negative * which expresses how long the time is over the minimum waiting time. 
*/ - public static int waitingRemainingGuessed(final String hostname, final int minimumLocalDelta, final int minimumGlobalDelta) { - if (hostname == null) return Integer.MIN_VALUE; + public static int waitingRemainingGuessed(final String hostname, final String hosthash, final RobotsTxt robots, final Set thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) { // first check if the domain was _ever_ accessed before - final Host host = map.get(hostname); + final Host host = map.get(hosthash); if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere // find the minimum waiting time based on the network domain (local or global) int waiting = (Domains.isLocal(hostname, null)) ? minimumLocalDelta : minimumGlobalDelta; - + // if we have accessed the domain many times, get slower (the flux factor) waiting += host.flux(waiting); - // the time since last access to the domain is the basis of the remaining calculation - final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); - // use the access latency as rule how fast we can access the server // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses waiting = Math.max(waiting, host.average() * 3 / 2); + // the time since last access to the domain is the basis of the remaining calculation + final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); + + // find the delay as given by robots.txt on target site + if (robots != null) { + int robotsDelay = waitingRobots(hostname + ":80", robots, thisAgents, false); + if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer + waiting = Math.max(waiting, robotsDelay); + } + return Math.min(60000, waiting) - timeSinceLastAccess; } - + /** * calculates how long should be waited until the domain can be accessed again * this follows from: @@ -151,7 
+171,7 @@ public class Latency { * @param minimumGlobalDelta * @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible nex loading time */ - public static int waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) { + public static int waitingRemaining(final DigestURI url, final RobotsTxt robots, final Set thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) { // first check if the domain was _ever_ accessed before final Host host = host(url); @@ -183,9 +203,8 @@ public class Latency { waiting = Math.max(waiting, robotsDelay); return Math.min(60000, waiting) - timeSinceLastAccess; } - - - public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) { + + public static String waitingRemainingExplain(final DigestURI url, final RobotsTxt robots, final Set thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) { // first check if the domain was _ever_ accessed before final Host host = host(url); diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java index c98c15b17..40084e90e 100644 --- a/source/net/yacy/crawler/data/NoticedURL.java +++ b/source/net/yacy/crawler/data/NoticedURL.java @@ -232,12 +232,12 @@ public class NoticedURL { * get a list of domains that are currently maintained as domain stacks * @return a map of clear text strings of host names to two integers: the size of the domain stacks and the access delta time */ - public Map getDomainStackHosts(final StackType stackType) { + public Map getDomainStackHosts(final StackType stackType, RobotsTxt robots) { switch (stackType) { - case LOCAL: return this.coreStack.getDomainStackHosts(); - case GLOBAL: return this.limitStack.getDomainStackHosts(); - case REMOTE: return 
this.remoteStack.getDomainStackHosts(); - case NOLOAD: return this.noloadStack.getDomainStackHosts(); + case LOCAL: return this.coreStack.getDomainStackHosts(robots); + case GLOBAL: return this.limitStack.getDomainStackHosts(robots); + case REMOTE: return this.remoteStack.getDomainStackHosts(robots); + case NOLOAD: return this.noloadStack.getDomainStackHosts(robots); default: return null; } } diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java index aaeb188cd..aa4b55f65 100644 --- a/source/net/yacy/crawler/robots/RobotsTxt.java +++ b/source/net/yacy/crawler/robots/RobotsTxt.java @@ -93,12 +93,11 @@ public class RobotsTxt { public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents) { if (theURL == null) throw new IllegalArgumentException(); if (!theURL.getProtocol().startsWith("http")) return null; - return getEntry(theURL, thisAgents, true); + return getEntry(getHostPort(theURL), thisAgents, true); } - private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) { + public RobotsTxtEntry getEntry(final String urlHostPort, final Set thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) { // this method will always return a non-null value - final String urlHostPort = getHostPort(theURL); RobotsTxtEntry robotsTxt4Host = null; Map record; BEncodedHeap robotsTable = null; diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index ff50afec5..8b925f726 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -159,7 +159,7 @@ public class YMarkCrawlStart extends HashMap{ } } - public static String crawlStart( + protected static String crawlStart( final Switchboard sb, final DigestURI startURL, final String urlMustMatch, diff --git a/source/net/yacy/peers/RemoteSearch.java 
b/source/net/yacy/peers/RemoteSearch.java index 15fbba552..30056d76f 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -293,7 +293,7 @@ public class RemoteSearch extends Thread { } } }; - solr.start(); + if (targetPeer == null) solr.run(); else solr.start(); return solr; }