better balancing and due-time computation also for no-delay intranet

hosts
pull/1/head
Michael Peter Christen 13 years ago
parent c326aa8f67
commit 0833937c1c

@ -287,8 +287,8 @@ public class PerformanceQueues_p {
} }
if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) { if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) {
final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta()); final int minimumLocalDelta = post.getInt("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); final int minimumGlobalDelta = post.getInt("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
sb.setConfig("minimumLocalDelta", minimumLocalDelta); sb.setConfig("minimumLocalDelta", minimumLocalDelta);
sb.setConfig("minimumGlobalDelta", minimumGlobalDelta); sb.setConfig("minimumGlobalDelta", minimumGlobalDelta);
sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);

@ -70,8 +70,8 @@ public class Balancer {
// class variables filled with external values // class variables filled with external values
private final File cacheStacksPath; private final File cacheStacksPath;
private long minimumLocalDelta; private int minimumLocalDelta;
private long minimumGlobalDelta; private int minimumGlobalDelta;
private final Set<String> myAgentIDs; private final Set<String> myAgentIDs;
private BufferedObjectIndex urlFileIndex; private BufferedObjectIndex urlFileIndex;
@ -86,8 +86,8 @@ public class Balancer {
public Balancer( public Balancer(
final File cachePath, final File cachePath,
final String stackname, final String stackname,
final long minimumLocalDelta, final int minimumLocalDelta,
final long minimumGlobalDelta, final int minimumGlobalDelta,
final Set<String> myAgentIDs, final Set<String> myAgentIDs,
final boolean useTailCache, final boolean useTailCache,
final boolean exceed134217727) { final boolean exceed134217727) {
@ -118,15 +118,15 @@ public class Balancer {
Log.logInfo("Balancer", "opened balancer file with " + this.urlFileIndex.size() + " entries from " + f.toString()); Log.logInfo("Balancer", "opened balancer file with " + this.urlFileIndex.size() + " entries from " + f.toString());
} }
public long getMinimumLocalDelta() { public int getMinimumLocalDelta() {
return this.minimumLocalDelta; return this.minimumLocalDelta;
} }
public long getMinimumGlobalDelta() { public int getMinimumGlobalDelta() {
return this.minimumGlobalDelta; return this.minimumGlobalDelta;
} }
public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) { public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) {
this.minimumLocalDelta = minimumLocalDelta; this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta; this.minimumGlobalDelta = minimumGlobalDelta;
} }
@ -289,7 +289,7 @@ public class Balancer {
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) { for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
int size = entry.getValue().size(); int size = entry.getValue().size();
int delta = (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); int delta = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
map.put(entry.getKey(), new Integer[]{size, delta}); map.put(entry.getKey(), new Integer[]{size, delta});
} }
return map; return map;
@ -297,7 +297,7 @@ public class Balancer {
/** /**
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
* The time can be as low as Long.MIN_VALUE to show that there should not be any limitation at all. * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots * @param robots
* @param profileEntry * @param profileEntry
* @param crawlURL * @param crawlURL
@ -616,7 +616,7 @@ public class Balancer {
break; break;
} }
count++; count++;
if (!this.domainStacks.isEmpty() && count > 120 * this.domainStacks.size()) break; if (this.domainStacks.size() >= 100 || (!this.domainStacks.isEmpty() && count > 600 * this.domainStacks.size())) break;
} }
Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms"); Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
this.domStackInitSize = this.domainStacks.size(); this.domStackInitSize = this.domainStacks.size();

@ -97,8 +97,8 @@ public class Latency {
* @param thisAgents * @param thisAgents
* @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights * @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights
*/ */
public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) { public static int waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
long robotsDelay = 0; int robotsDelay = 0;
RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents); RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents);
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis(); robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
@ -115,7 +115,7 @@ public class Latency {
* @return the remaining waiting time in milliseconds. The return value may be negative * @return the remaining waiting time in milliseconds. The return value may be negative
* which expresses how long the time is over the minimum waiting time. * which expresses how long the time is over the minimum waiting time.
*/ */
public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) { public static int waitingRemainingGuessed(final String hostname, final int minimumLocalDelta, final int minimumGlobalDelta) {
if (hostname == null) return Integer.MIN_VALUE; if (hostname == null) return Integer.MIN_VALUE;
// first check if the domain was _ever_ accessed before // first check if the domain was _ever_ accessed before
@ -123,15 +123,13 @@ public class Latency {
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global) // find the minimum waiting time based on the network domain (local or global)
final boolean local = Domains.isLocal(hostname, null); int waiting = (Domains.isLocal(hostname, null)) ? minimumLocalDelta : minimumGlobalDelta;
if (local) return minimumLocalDelta;
long waiting = minimumGlobalDelta;
// if we have accessed the domain many times, get slower (the flux factor) // if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting); waiting += host.flux(waiting);
// the time since last access to the domain is the basis of the remaining calculation // the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// use the access latency as rule how fast we can access the server // use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to // this applies also to localhost, but differently, because it is not necessary to
@ -153,16 +151,14 @@ public class Latency {
* @param minimumGlobalDelta * @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible next loading time * @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible next loading time
*/ */
public static long waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) { public static int waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before // first check if the domain was _ever_ accessed before
final Host host = host(url); final Host host = host(url);
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global) // find the minimum waiting time based on the network domain (local or global)
final boolean local = url.isLocal(); int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta;
if (local) return minimumLocalDelta;
long waiting = minimumGlobalDelta;
// for CGI accesses, we double the minimum time // for CGI accesses, we double the minimum time
// mostly there is a database access in the background // mostly there is a database access in the background
@ -178,10 +174,10 @@ public class Latency {
waiting = Math.max(waiting, host.average() * 2); waiting = Math.max(waiting, host.average() * 2);
// the time since last access to the domain is the basis of the remaining calculation // the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// find the delay as given by robots.txt on target site // find the delay as given by robots.txt on target site
long robotsDelay = waitingRobots(url, robots, thisAgents); int robotsDelay = waitingRobots(url, robots, thisAgents);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay); waiting = Math.max(waiting, robotsDelay);
@ -189,18 +185,16 @@ public class Latency {
} }
public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) { public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before // first check if the domain was _ever_ accessed before
final Host host = host(url); final Host host = host(url);
if (host == null) return "host " + host + " never accessed before -> Long.MIN_VALUE"; // no delay if host is new if (host == null) return "host " + host + " never accessed before -> Integer.MIN_VALUE"; // no delay if host is new
final StringBuilder s = new StringBuilder(50); final StringBuilder s = new StringBuilder(50);
// find the minimum waiting time based on the network domain (local or global) // find the minimum waiting time based on the network domain (local or global)
final boolean local = url.isLocal(); int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta;
if (local) return "local host -> minimum local: " + minimumLocalDelta;
long waiting = minimumGlobalDelta;
s.append("minimumDelta = ").append(waiting); s.append("minimumDelta = ").append(waiting);
// for CGI accesses, we double the minimum time // for CGI accesses, we double the minimum time
@ -209,7 +203,7 @@ public class Latency {
if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); } if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
// if we have accessed the domain many times, get slower (the flux factor) // if we have accessed the domain many times, get slower (the flux factor)
long flux = host.flux(waiting); int flux = host.flux(waiting);
waiting += flux; waiting += flux;
s.append(", flux = ").append(flux); s.append(", flux = ").append(flux);
@ -220,7 +214,7 @@ public class Latency {
waiting = Math.max(waiting, host.average() * 2); waiting = Math.max(waiting, host.average() * 2);
// find the delay as given by robots.txt on target site // find the delay as given by robots.txt on target site
long robotsDelay = waitingRobots(url, robots, thisAgents); int robotsDelay = waitingRobots(url, robots, thisAgents);
if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay); waiting = Math.max(waiting, robotsDelay);
@ -273,7 +267,7 @@ public class Latency {
public long robotsDelay() { public long robotsDelay() {
return this.robotsMinDelay; return this.robotsMinDelay;
} }
public long flux(final long range) { public int flux(final int range) {
return this.count >= 10000 ? range * Math.min(5000, this.count) / 10000 : range / (10000 - this.count); return this.count >= 10000 ? range * Math.min(5000, this.count) / 10000 : range / (10000 - this.count);
} }
} }

@ -51,8 +51,8 @@ public class NoticedURL {
LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD; LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
} }
private static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain private static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
private Balancer coreStack; // links found by crawling to depth-1 private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth private Balancer limitStack; // links found by crawling at target depth
@ -72,15 +72,15 @@ public class NoticedURL {
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727); this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
} }
public long getMinimumLocalDelta() { public int getMinimumLocalDelta() {
return this.coreStack.getMinimumLocalDelta(); return this.coreStack.getMinimumLocalDelta();
} }
public long getMinimumGlobalDelta() { public int getMinimumGlobalDelta() {
return this.coreStack.getMinimumGlobalDelta(); return this.coreStack.getMinimumGlobalDelta();
} }
public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) { public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) {
this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);

@ -205,14 +205,14 @@ public class RobotsTxtEntry {
return null; return null;
} }
public long getCrawlDelayMillis() { public int getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try { if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS)); return (int) ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
} catch (final NumberFormatException e) { } catch (final NumberFormatException e) {
return 0; return 0;
} }
if (this.mem.containsKey(CRAWL_DELAY)) try { if (this.mem.containsKey(CRAWL_DELAY)) try {
return 1000 * ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY)); return 1000 * ((int) ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY)));
} catch (final NumberFormatException e) { } catch (final NumberFormatException e) {
return 0; return 0;
} }

@ -727,8 +727,8 @@ public final class Switchboard extends serverSwitch
OAIListFriendsLoader.init(this.loader, oaiFriends); OAIListFriendsLoader.init(this.loader, oaiFriends);
this.crawlQueues = new CrawlQueues(this, this.queuesRoot); this.crawlQueues = new CrawlQueues(this, this.queuesRoot);
this.crawlQueues.noticeURL.setMinimumDelta( this.crawlQueues.noticeURL.setMinimumDelta(
getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()), getConfigInt("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta())); getConfigInt("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
/* /*
* Creating sync objects and loading status for the crawl jobs * Creating sync objects and loading status for the crawl jobs

@ -92,11 +92,7 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
super(); super();
} }
public serverObjects(final int initialCapacity) { protected serverObjects(final Map<String, String> input) {
super(initialCapacity);
}
public serverObjects(final Map<String, String> input) {
super(input); super(input);
} }
@ -219,10 +215,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
return put(key, toJSON(value)); return put(key, toJSON(value));
} }
public String putJSON(final String key, final StringBuilder value) {
return put(key, toJSON(value.toString()));
}
public static String toJSON(String value) { public static String toJSON(String value) {
// value = value.replaceAll("\\", "\\\\"); // value = value.replaceAll("\\", "\\\\");
value = patternDoublequote.matcher(value).replaceAll("'"); value = patternDoublequote.matcher(value).replaceAll("'");
@ -235,10 +227,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
return value; return value;
} }
public String putJSON(final String key, final byte[] value) {
return putJSON(key, UTF8.String(value));
}
/** /**
* Add a String to the map. The content of the String is escaped to be usable in HTML output. * Add a String to the map. The content of the String is escaped to be usable in HTML output.
* @param key key name as String. * @param key key name as String.
@ -386,11 +374,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
return s.equals("true") || s.equals("on") || s.equals("1"); return s.equals("true") || s.equals("on") || s.equals("1");
} }
public boolean hasValue(final String key) {
final String s = super.get(key);
return (s != null && !s.isEmpty());
}
// returns a set of all values where their key matches the keyMapper // returns a set of all values where their key matches the keyMapper
public String[] getAll(final String keyMapper) { public String[] getAll(final String keyMapper) {
// the keyMapper may contain regular expressions as defined in String.matches // the keyMapper may contain regular expressions as defined in String.matches

Loading…
Cancel
Save