From 0833937c1c1cfbeaee28ec2d126e2b4e29bcae6e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 30 Oct 2012 11:28:49 +0100 Subject: [PATCH] better balancing and duetime-computation also for no-delay intranet hosts --- htroot/PerformanceQueues_p.java | 4 +- source/net/yacy/crawler/Balancer.java | 20 +++++----- source/net/yacy/crawler/data/Latency.java | 38 ++++++++----------- source/net/yacy/crawler/data/NoticedURL.java | 10 ++--- .../yacy/crawler/robots/RobotsTxtEntry.java | 6 +-- source/net/yacy/search/Switchboard.java | 4 +- source/net/yacy/server/serverObjects.java | 19 +--------- 7 files changed, 39 insertions(+), 62 deletions(-) diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index e51526dbb..a24cb8adc 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -287,8 +287,8 @@ public class PerformanceQueues_p { } if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) { - final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta()); - final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); + final int minimumLocalDelta = post.getInt("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta()); + final int minimumGlobalDelta = post.getInt("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); sb.setConfig("minimumLocalDelta", minimumLocalDelta); sb.setConfig("minimumGlobalDelta", minimumGlobalDelta); sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index 5389260f3..e54f455c1 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -70,8 +70,8 @@ public class Balancer { // class variables filled with external values private final File cacheStacksPath; - private long 
minimumLocalDelta; - private long minimumGlobalDelta; + private int minimumLocalDelta; + private int minimumGlobalDelta; private final Set myAgentIDs; private BufferedObjectIndex urlFileIndex; @@ -86,8 +86,8 @@ public class Balancer { public Balancer( final File cachePath, final String stackname, - final long minimumLocalDelta, - final long minimumGlobalDelta, + final int minimumLocalDelta, + final int minimumGlobalDelta, final Set myAgentIDs, final boolean useTailCache, final boolean exceed134217727) { @@ -118,15 +118,15 @@ public class Balancer { Log.logInfo("Balancer", "opened balancer file with " + this.urlFileIndex.size() + " entries from " + f.toString()); } - public long getMinimumLocalDelta() { + public int getMinimumLocalDelta() { return this.minimumLocalDelta; } - public long getMinimumGlobalDelta() { + public int getMinimumGlobalDelta() { return this.minimumGlobalDelta; } - public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) { + public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) { this.minimumLocalDelta = minimumLocalDelta; this.minimumGlobalDelta = minimumGlobalDelta; } @@ -289,7 +289,7 @@ public class Balancer { Map map = new TreeMap(); // we use a tree map to get a stable ordering for (Map.Entry entry: this.domainStacks.entrySet()) { int size = entry.getValue().size(); - int delta = (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); + int delta = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); map.put(entry.getKey(), new Integer[]{size, delta}); } return map; @@ -297,7 +297,7 @@ public class Balancer { /** * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access - * The time can be as low as Long.MIN_VALUE to show that there should not be any limitation at all. 
+ * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all. * @param robots * @param profileEntry * @param crawlURL @@ -616,7 +616,7 @@ public class Balancer { break; } count++; - if (!this.domainStacks.isEmpty() && count > 120 * this.domainStacks.size()) break; + if (this.domainStacks.size() >= 100 || (!this.domainStacks.isEmpty() && count > 600 * this.domainStacks.size())) break; } Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms"); this.domStackInitSize = this.domainStacks.size(); diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java index cde93d97b..c152b70b0 100644 --- a/source/net/yacy/crawler/data/Latency.java +++ b/source/net/yacy/crawler/data/Latency.java @@ -97,8 +97,8 @@ public class Latency { * @param thisAgents * @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights */ - public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents) { - long robotsDelay = 0; + public static int waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents) { + int robotsDelay = 0; RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents); robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis(); if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer @@ -115,7 +115,7 @@ public class Latency { * @return the remaining waiting time in milliseconds. The return value may be negative * which expresses how long the time is over the minimum waiting time. 
*/ - public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) { + public static int waitingRemainingGuessed(final String hostname, final int minimumLocalDelta, final int minimumGlobalDelta) { if (hostname == null) return Integer.MIN_VALUE; // first check if the domain was _ever_ accessed before @@ -123,15 +123,13 @@ public class Latency { if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere // find the minimum waiting time based on the network domain (local or global) - final boolean local = Domains.isLocal(hostname, null); - if (local) return minimumLocalDelta; - long waiting = minimumGlobalDelta; - + int waiting = (Domains.isLocal(hostname, null)) ? minimumLocalDelta : minimumGlobalDelta; + // if we have accessed the domain many times, get slower (the flux factor) waiting += host.flux(waiting); // the time since last access to the domain is the basis of the remaining calculation - final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); + final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); // use the access latency as rule how fast we can access the server // this applies also to localhost, but differently, because it is not necessary to @@ -153,16 +151,14 @@ public class Latency { * @param minimumGlobalDelta * @return the remaining waiting time in milliseconds. 
can be negative to reflect the due-time after a possible nex loading time */ - public static long waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) { + public static int waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) { // first check if the domain was _ever_ accessed before final Host host = host(url); if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere // find the minimum waiting time based on the network domain (local or global) - final boolean local = url.isLocal(); - if (local) return minimumLocalDelta; - long waiting = minimumGlobalDelta; + int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta; // for CGI accesses, we double the minimum time // mostly there is a database access in the background @@ -178,10 +174,10 @@ public class Latency { waiting = Math.max(waiting, host.average() * 2); // the time since last access to the domain is the basis of the remaining calculation - final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); + final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); // find the delay as given by robots.txt on target site - long robotsDelay = waitingRobots(url, robots, thisAgents); + int robotsDelay = waitingRobots(url, robots, thisAgents); if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer waiting = Math.max(waiting, robotsDelay); @@ -189,18 +185,16 @@ public class Latency { } - public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) { + public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt 
robots, final Set thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) { // first check if the domain was _ever_ accessed before final Host host = host(url); - if (host == null) return "host " + host + " never accessed before -> Long.MIN_VALUE"; // no delay if host is new + if (host == null) return "host " + host + " never accessed before -> Integer.MIN_VALUE"; // no delay if host is new final StringBuilder s = new StringBuilder(50); // find the minimum waiting time based on the network domain (local or global) - final boolean local = url.isLocal(); - if (local) return "local host -> minimum local: " + minimumLocalDelta; - long waiting = minimumGlobalDelta; + int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta; s.append("minimumDelta = ").append(waiting); // for CGI accesses, we double the minimum time @@ -209,7 +203,7 @@ public class Latency { if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); } // if we have accessed the domain many times, get slower (the flux factor) - long flux = host.flux(waiting); + int flux = host.flux(waiting); waiting += flux; s.append(", flux = ").append(flux); @@ -220,7 +214,7 @@ public class Latency { waiting = Math.max(waiting, host.average() * 2); // find the delay as given by robots.txt on target site - long robotsDelay = waitingRobots(url, robots, thisAgents); + int robotsDelay = waitingRobots(url, robots, thisAgents); if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer waiting = Math.max(waiting, robotsDelay); @@ -273,7 +267,7 @@ public class Latency { public long robotsDelay() { return this.robotsMinDelay; } - public long flux(final long range) { + public int flux(final int range) { return this.count >= 10000 ? 
range * Math.min(5000, this.count) / 10000 : range / (10000 - this.count); } } diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java index b78315c11..c98c15b17 100644 --- a/source/net/yacy/crawler/data/NoticedURL.java +++ b/source/net/yacy/crawler/data/NoticedURL.java @@ -51,8 +51,8 @@ public class NoticedURL { LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD; } - private static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain - public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain + private static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain + public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain private Balancer coreStack; // links found by crawling to depth-1 private Balancer limitStack; // links found by crawling at target depth @@ -72,15 +72,15 @@ public class NoticedURL { this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727); } - public long getMinimumLocalDelta() { + public int getMinimumLocalDelta() { return this.coreStack.getMinimumLocalDelta(); } - public long getMinimumGlobalDelta() { + public int getMinimumGlobalDelta() { return this.coreStack.getMinimumGlobalDelta(); } - public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) { + public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) { this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); diff --git a/source/net/yacy/crawler/robots/RobotsTxtEntry.java 
b/source/net/yacy/crawler/robots/RobotsTxtEntry.java index 014dcf538..bcb640ed1 100644 --- a/source/net/yacy/crawler/robots/RobotsTxtEntry.java +++ b/source/net/yacy/crawler/robots/RobotsTxtEntry.java @@ -205,14 +205,14 @@ public class RobotsTxtEntry { return null; } - public long getCrawlDelayMillis() { + public int getCrawlDelayMillis() { if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try { - return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS)); + return (int) ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS)); } catch (final NumberFormatException e) { return 0; } if (this.mem.containsKey(CRAWL_DELAY)) try { - return 1000 * ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY)); + return 1000 * ((int) ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY))); } catch (final NumberFormatException e) { return 0; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index a0407fbe5..17993c2e3 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -727,8 +727,8 @@ public final class Switchboard extends serverSwitch OAIListFriendsLoader.init(this.loader, oaiFriends); this.crawlQueues = new CrawlQueues(this, this.queuesRoot); this.crawlQueues.noticeURL.setMinimumDelta( - getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()), - getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta())); + getConfigInt("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()), + getConfigInt("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta())); /* * Creating sync objects and loading status for the crawl jobs diff --git a/source/net/yacy/server/serverObjects.java b/source/net/yacy/server/serverObjects.java index 0f288e4a2..52b091ca1 100644 --- a/source/net/yacy/server/serverObjects.java +++ b/source/net/yacy/server/serverObjects.java @@ -92,11 +92,7 @@ public class serverObjects extends HashMap 
implements Cloneable super(); } - public serverObjects(final int initialCapacity) { - super(initialCapacity); - } - - public serverObjects(final Map input) { + protected serverObjects(final Map input) { super(input); } @@ -219,10 +215,6 @@ public class serverObjects extends HashMap implements Cloneable return put(key, toJSON(value)); } - public String putJSON(final String key, final StringBuilder value) { - return put(key, toJSON(value.toString())); - } - public static String toJSON(String value) { // value = value.replaceAll("\\", "\\\\"); value = patternDoublequote.matcher(value).replaceAll("'"); @@ -235,10 +227,6 @@ public class serverObjects extends HashMap implements Cloneable return value; } - public String putJSON(final String key, final byte[] value) { - return putJSON(key, UTF8.String(value)); - } - /** * Add a String to the map. The content of the String is escaped to be usable in HTML output. * @param key key name as String. @@ -386,11 +374,6 @@ public class serverObjects extends HashMap implements Cloneable return s.equals("true") || s.equals("on") || s.equals("1"); } - public boolean hasValue(final String key) { - final String s = super.get(key); - return (s != null && !s.isEmpty()); - } - // returns a set of all values where their key mappes the keyMapper public String[] getAll(final String keyMapper) { // the keyMapper may contain regular expressions as defined in String.matches