update to Balancer algorithm:

- create a load list from the current list of known hosts
- do not create this list on each Balancer.pop access
- create the list from those hosts which have a zero waiting time
- select the 1/3 of that list which have the most URLs waiting (see the sketch after this list)
- get hosts from the waiting list in random order
- fixes for some delta-time computations
- always load all URLs from hosts which have never been loaded before
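
The selection policy above, as a minimal stand-alone sketch (hypothetical Java, not the actual Balancer code below; class and field names are illustrative):

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;
    import java.util.Random;

    // Hypothetical sketch of the commit's selection policy; the real logic
    // lives in Balancer.getbest() / Balancer.pickFromZeroWaiting() below.
    class ZeroWaitingSketch {
        static class HostQueue {
            final String host;
            final long waitingMillis; // remaining wait; <= 0 means the host may be accessed now
            final int urlsWaiting;    // number of URLs stacked for this host
            HostQueue(String host, long waitingMillis, int urlsWaiting) {
                this.host = host;
                this.waitingMillis = waitingMillis;
                this.urlsWaiting = urlsWaiting;
            }
        }

        private final Random random = new Random();
        private final List<HostQueue> candidates = new ArrayList<>();

        // rebuild the load list only when it runs empty, not on every pop
        void refill(List<HostQueue> knownHosts) {
            List<HostQueue> zeroWaiting = new ArrayList<>();
            for (HostQueue h : knownHosts) {
                if (h.waitingMillis <= 0) zeroWaiting.add(h); // zero-waiting hosts only
            }
            zeroWaiting.sort(Comparator.comparingInt((HostQueue h) -> h.urlsWaiting).reversed());
            int keep = zeroWaiting.size() <= 10 ? zeroWaiting.size() : Math.max(1, zeroWaiting.size() / 3);
            candidates.clear();
            candidates.addAll(zeroWaiting.subList(0, keep)); // the 1/3 with the most URLs waiting
        }

        // serve hosts from the candidate list in random order
        HostQueue pop() {
            if (candidates.isEmpty()) return null;
            return candidates.remove(random.nextInt(candidates.size()));
        }
    }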
pull/1/head
orbiter 13 years ago
parent 354f0d9acd
commit 8952153ecf

IndexCreateQueues_p.java

@@ -130,7 +130,7 @@ public class IndexCreateQueues_p {
                 prop.put("crawler_host_" + hc + "_urlsPerHost", urlsPerHost);
                 prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
                 prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
-                prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1]);
+                prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1] == Integer.MIN_VALUE ? "not accessed" : Integer.toString(host.getValue()[1]));
                 List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
                 Seed initiator;

Balancer.java

@@ -28,11 +28,12 @@ package net.yacy.crawler;
 import java.io.File;
 import java.io.IOException;
+import java.util.AbstractMap;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.concurrent.ConcurrentHashMap;
@@ -44,6 +45,7 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.order.CloneableIterator;
 import net.yacy.cora.protocol.Domains;
+import net.yacy.cora.sorting.OrderedScoreMap;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.data.Cache;
@@ -78,6 +80,8 @@ public class Balancer {
     private final HandleSet double_push_check; // for debugging
     private long lastDomainStackFill;
     private int domStackInitSize;
+    private final List<Map.Entry<String, byte[]>> zeroWaitingCandidates;
+    private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting

     public Balancer(
             final File cachePath,
@@ -94,7 +98,9 @@ public class Balancer {
         this.myAgentIDs = myAgentIDs;
         this.domStackInitSize = Integer.MAX_VALUE;
         this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+        this.zeroWaitingCandidates = new ArrayList<Map.Entry<String, byte[]>>();
+        this.random = new Random(System.currentTimeMillis());

         // create a stack for newly entered entries
         if (!(cachePath.exists())) cachePath.mkdir(); // make the path
         this.cacheStacksPath.mkdirs();
@@ -205,6 +211,12 @@ public class Balancer {
             for (final byte[] handle: urlHashes) stack.remove(handle);
             if (stack.isEmpty()) q.remove();
         }
+
+        // iterate through zero-waiting map
+        final Iterator<Map.Entry<String, byte[]>> i = this.zeroWaitingCandidates.iterator();
+        while (i.hasNext()) {
+            if (urlHashes.has(i.next().getValue())) i.remove();
+        }
         return removedCounter;
     }
@@ -274,32 +286,35 @@ public class Balancer {
     public Map<String, Integer[]> getDomainStackHosts() {
         Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
         for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
-            map.put(entry.getKey(), new Integer[]{entry.getValue().size(), (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)});
+            int size = entry.getValue().size();
+            int delta = (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
+            map.put(entry.getKey(), new Integer[]{size, delta});
         }
         return map;
     }

     /**
-     * compute the current sleep time for a given crawl entry
-     * @param cs
-     * @param crawlEntry
-     * @return
+     * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
+     * The time can be as low as Long.MIN_VALUE to show that there should not be any limitation at all.
+     * @param robots
+     * @param profileEntry
+     * @param crawlURL
+     * @return the sleep time in milliseconds; may be negative for no sleep time
      */
-    public long getDomainSleepTime(final CrawlSwitchboard cs, final RobotsTxt robots, Request crawlEntry) {
-        final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
-        return getDomainSleepTime(robots, profileEntry, crawlEntry.url());
-    }
-
     private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
-        if (profileEntry == null) {
-            return 0;
-        }
+        if (profileEntry == null) return 0;
         long sleeptime = (
             profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
             (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
-            ) ? 0 : Latency.waitingRemaining(crawlURL, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+            ) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
         return sleeptime;
     }

+    private long getRobotsTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
+        if (profileEntry == null) return 0;
+        long sleeptime = Latency.waitingRobots(crawlURL, robots, this.myAgentIDs); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+        return sleeptime < 0 ? 0 : sleeptime;
+    }

     /**
      * get lists of crawl request entries for a specific host
@@ -377,6 +392,7 @@ public class Balancer {
         long sleeptime = 0;
         Request crawlEntry = null;
+        CrawlProfile profileEntry = null;
         synchronized (this) {
             byte[] failhash = null;
             while (!this.urlFileIndex.isEmpty()) {
@@ -408,7 +424,7 @@ public class Balancer {
                 // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
                 // if not: return null. A calling method must handle the null value and try again
-                final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+                profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
                 if (profileEntry == null) {
                     Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                     return null;
@@ -425,6 +441,8 @@ public class Balancer {
             }
         }
         if (crawlEntry == null) return null;
+        long robotsTime = getRobotsTime(robots, profileEntry, crawlEntry.url());
+        Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
         if (delay && sleeptime > 0) {
             // force a busy waiting here
             // in best case, this should never happen if the balancer works propertly
@@ -442,96 +460,119 @@ public class Balancer {
                 Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                 try {this.wait(1000); } catch (final InterruptedException e) {}
             }
+            Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
         }
-        Latency.update(crawlEntry.url());
         return crawlEntry;
     }

     private byte[] getbest(final RobotsTxt robots) {
-        // check if we need to get entries from the file index
-        try {
-            fillDomainStacks();
-        } catch (final IOException e) {
-            Log.logException(e);
-        }
-
-        // iterate over the domain stacks
-        final Iterator<Map.Entry<String, HandleSet>> i = this.domainStacks.entrySet().iterator();
-        Map.Entry<String, HandleSet> entry;
-        long smallestWaiting = Long.MAX_VALUE;
-        byte[] besturlhash = null;
-        String besthost = null;
-        Map<String, byte[]> zeroWaitingCandidates = new HashMap<String, byte[]>();
-        while (i.hasNext() && zeroWaitingCandidates.size() < 10) {
-            entry = i.next();
-
-            // clean up empty entries
-            if (entry.getValue().isEmpty()) {
-                i.remove();
-                continue;
-            }
-
-            final byte[] n = entry.getValue().removeOne();
-            if (n == null) continue;
-
-            long w;
-            Row.Entry rowEntry;
-            try {
-                rowEntry = this.urlFileIndex.get(n, false);
-                if (rowEntry == null) {
-                    continue;
-                }
-                Request crawlEntry = new Request(rowEntry);
-                w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
-                //System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
-                //System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
-            } catch (IOException e1) {
-                w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
-            }
-
-            if (w < smallestWaiting) {
-                smallestWaiting = w;
-                besturlhash = n;
-                besthost = entry.getKey();
-                if (w <= 0) {
-                    zeroWaitingCandidates.put(besthost, besturlhash);
-                }
-            }
-            try {
-                entry.getValue().put(n); // put entry back, we are checking only
-            } catch (SpaceExceededException e) {
-                e.printStackTrace();
-            }
-        }
-
-        if (besturlhash == null) return null; // worst case
-
-        // best case would be, if we have some zeroWaitingCandidates,
-        // then we select that one with the largest stack
-        if (!zeroWaitingCandidates.isEmpty()) {
-            int largestStack = -1;
-            String largestStackHost = null;
-            byte[] largestStackHash = null;
-            for (Map.Entry<String, byte[]> z: zeroWaitingCandidates.entrySet()) {
-                HandleSet hs = this.domainStacks.get(z.getKey());
-                if (hs == null || hs.size() <= largestStack) continue;
-                largestStack = hs.size();
-                largestStackHost = z.getKey();
-                largestStackHash = z.getValue();
-            }
-            if (largestStackHost != null && largestStackHash != null) {
-                removeHashFromDomainStacks(largestStackHost, largestStackHash);
-                //Log.logInfo("Balancer", "*** picked one from largest stack");
-                return largestStackHash;
-            }
-        }
-
-        // default case: just take that one with least waiting
-        removeHashFromDomainStacks(besthost, besturlhash);
-        return besturlhash;
+        synchronized (this.zeroWaitingCandidates) {
+            if (this.zeroWaitingCandidates.size() > 0) {
+                byte[] urlhash = pickFromZeroWaiting();
+                if (urlhash != null) return urlhash;
+            }
+            this.zeroWaitingCandidates.clear();
+
+            // check if we need to get entries from the file index
+            try {
+                fillDomainStacks();
+            } catch (final IOException e) {
+                Log.logException(e);
+            }
+
+            // iterate over the domain stacks
+            final Iterator<Map.Entry<String, HandleSet>> i = this.domainStacks.entrySet().iterator();
+            Map.Entry<String, HandleSet> entry;
+            long smallestWaiting = Long.MAX_VALUE;
+            byte[] besturlhash = null;
+            String besthost = null;
+            OrderedScoreMap<Map.Entry<String, byte[]>> nextZeroCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
+            while (i.hasNext() && nextZeroCandidates.size() < 1000) {
+                entry = i.next();
+
+                // clean up empty entries
+                if (entry.getValue().isEmpty()) {
+                    i.remove();
+                    continue;
+                }
+
+                final byte[] urlhash = entry.getValue().getOne(0);
+                if (urlhash == null) continue;
+
+                long w;
+                Row.Entry rowEntry;
+                try {
+                    rowEntry = this.urlFileIndex.get(urlhash, false);
+                    if (rowEntry == null) {
+                        continue;
+                    }
+                    Request crawlEntry = new Request(rowEntry);
+                    w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
+                    //System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
+                    //System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
+                } catch (IOException e1) {
+                    w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
+                }
+
+                if (w <= 0) {
+                    nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), w == Integer.MIN_VALUE ? 1000 /* get new domains a chance */ : entry.getValue().size());
+                }
+                if (w < smallestWaiting || (w == smallestWaiting && this.random.nextBoolean())) {
+                    smallestWaiting = w;
+                    besturlhash = urlhash;
+                    besthost = entry.getKey();
+                }
+            }
+            Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
+
+            if (besturlhash == null) {
+                Log.logInfo("Balancer", "*** getbest: besturlhash == null");
+                return null; // this should never happen
+            }
+
+            // best case would be, if we have some zeroWaitingCandidates,
+            // then we select that one with the largest stack
+            if (nextZeroCandidates.isEmpty()) {
+                // bad luck: just take that one with least waiting
+                removeHashFromDomainStacks(besthost, besturlhash);
+                Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost);
+                return besturlhash;
+            }
+
+            // now take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates
+            int pick = nextZeroCandidates.size() <= 10 ? nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3);
+            Iterator<Map.Entry<String, byte[]>> k = nextZeroCandidates.keys(false);
+            while (k.hasNext() && pick-- > 0) {
+                this.zeroWaitingCandidates.add(k.next());
+            }
+            Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
+
+            return pickFromZeroWaiting();
+        }
+    }
+
+    private byte[] pickFromZeroWaiting() {
+        // by random we choose now either from the largest stack or from any of the other stacks
+        String host = null;
+        byte[] hash = null;
+        while (this.zeroWaitingCandidates.size() > 0) {
+            Map.Entry<String, byte[]> z = this.zeroWaitingCandidates.remove(this.random.nextInt(this.zeroWaitingCandidates.size()));
+            HandleSet hs = this.domainStacks.get(z.getKey());
+            if (hs == null) continue;
+            host = z.getKey(); if (host == null) continue;
+            hash = z.getValue(); if (hash == null) continue;
+            removeHashFromDomainStacks(host, hash);
+            Log.logInfo("Balancer", "*** getbest: picked a random from the zero-waiting stack: " + host + ", zeroWaitingCandidates.size = " + this.zeroWaitingCandidates.size());
+            return hash;
+        }
+        Log.logInfo("Balancer", "*** getbest: picking from zero-waiting stack failed!" + " zeroWaitingCandidates.size = " + this.zeroWaitingCandidates.size());
+        this.zeroWaitingCandidates.clear();
+        return null;
     }

     private void fillDomainStacks() throws IOException {
         if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
         this.domainStacks.clear();
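
One contract worth noting from the Balancer hunks above: waiting times are signed. Integer.MIN_VALUE marks a host that has never been accessed (all of its URLs may be loaded immediately, per the commit message), any other value <= 0 means the host is overdue, and a positive value is a remaining sleep time. A hedged sketch of that convention (illustrative helper, not YaCy API):

    // Illustrative only: spells out the sentinel semantics used by
    // getDomainSleepTime() and Latency.waitingRemaining() in this commit.
    final class WaitingSemantics {
        static String explain(long w) {
            if (w == Integer.MIN_VALUE) return "host never accessed: load all of its URLs at once";
            if (w <= 0) return "zero-waiting: overdue by " + (-w) + " ms, may be accessed now";
            return "sleep " + w + " ms before the next access to this host";
        }
    }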

Latency.java

@@ -43,7 +43,12 @@ public class Latency {
     // the map is a mapping from host names to host configurations
     private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();

-    public static void update(final MultiProtocolURI url, final long time) {
+    /**
+     * update the latency entry after a host was accessed to load a file
+     * @param url
+     * @param time the time to load the file in milliseconds
+     */
+    public static void updateAfterLoad(final MultiProtocolURI url, final long time) {
         final String host = url.getHost();
         if (host == null) return;
         Host h = map.get(host);
@@ -56,12 +61,17 @@ public class Latency {
         }
     }

-    public static void update(final MultiProtocolURI url) {
+    /**
+     * update the latency entry after a host was selected for queueing into the loader
+     * @param url
+     * @param robotsCrawlDelay the crawl-delay given by the robots; 0 if not exist
+     */
+    public static void updateAfterSelection(final MultiProtocolURI url, final long robotsCrawlDelay) {
         final String host = url.getHost();
         if (host == null) return;
         Host h = map.get(host);
         if (h == null) {
-            h = new Host(host, DEFAULT_AVERAGE);
+            h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay);
             if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
             map.put(host, h);
         } else {
@@ -69,51 +79,38 @@ public class Latency {
         }
     }

-    public static void slowdown(final MultiProtocolURI url) {
-        final String host = url.getHost();
-        if (host == null) return;
-        Host h = map.get(host);
-        if (h == null) {
-            h = new Host(host, DEFAULT_AVERAGE);
-            if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
-            map.put(host, h);
-        } else {
-            h.slowdown();
-        }
-    }
-
-    public static Host host(final MultiProtocolURI url) {
+    private static Host host(final MultiProtocolURI url) {
         final String host = url.getHost();
         if (host == null) return null;
         return map.get(host);
     }

-    public static int average(final MultiProtocolURI url) {
-        final String host = url.getHost();
-        if (host == null) return 0;
-        final Host h = map.get(host);
-        if (h == null) return 0;
-        return h.average();
-    }
-
     public static Iterator<Map.Entry<String, Host>> iterator() {
         return map.entrySet().iterator();
     }

     /**
-     * calculate the time since the last access of the domain as referenced by the url hash
-     * @param urlhash
-     * @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
+     * Return the waiting time demanded by the robots.txt file of the target host.
+     * A special case is, if the remote host has a special crawl-delay assignment for
+     * this crawler with 0. This causes that a -1 is returned
+     * @param url
+     * @param robots
+     * @param thisAgents
+     * @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights
      */
-    public static long lastAccessDelta(final MultiProtocolURI url) {
-        final Latency.Host host = Latency.host(url);
-        if (host == null) return Long.MAX_VALUE; // never accessed
-        return System.currentTimeMillis() - host.lastacc();
+    public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
+        long robotsDelay = 0;
+        RobotsTxtEntry robotsEntry;
+        try {
+            robotsEntry = robots.getEntry(url, thisAgents);
+        } catch (final IOException e) {
+            robotsEntry = null;
+        }
+        robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
+        if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
+        return robotsDelay;
     }

     /**
      * guess a minimum waiting time
      * the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low
@@ -125,11 +122,11 @@ public class Latency {
      * which expresses how long the time is over the minimum waiting time.
      */
     public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) {
-        if (hostname == null) return Long.MIN_VALUE;
+        if (hostname == null) return Integer.MIN_VALUE;

         // first check if the domain was _ever_ accessed before
         final Host host = map.get(hostname);
-        if (host == null) return Long.MIN_VALUE; // no delay if host is new
+        if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere

         // find the minimum waiting time based on the network domain (local or global)
         final boolean local = Domains.isLocal(hostname, null);
@@ -139,14 +136,15 @@ public class Latency {
         // if we have accessed the domain many times, get slower (the flux factor)
         waiting += host.flux(waiting);

+        // the time since last access to the domain is the basis of the remaining calculation
+        final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
+
         // use the access latency as rule how fast we can access the server
         // this applies also to localhost, but differently, because it is not necessary to
         // consider so many external accesses
         waiting = Math.max(waiting, host.average() * 2);

-        // the time since last access to the domain is the basis of the remaining calculation
-        final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
-
-        return Math.max(0, Math.min(60000, waiting) - timeSinceLastAccess);
+        return Math.min(60000, waiting) - timeSinceLastAccess;
     }

     /**
@@ -159,13 +157,13 @@ public class Latency {
      * - and a given minimum access time as given in robots.txt
      * @param minimumLocalDelta
      * @param minimumGlobalDelta
-     * @return the remaining waiting time in milliseconds
+     * @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible next loading time
      */
     public static long waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {

         // first check if the domain was _ever_ accessed before
         final Host host = host(url);
-        if (host == null) return Long.MIN_VALUE; // no delay if host is new
+        if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere

         // find the minimum waiting time based on the network domain (local or global)
         final boolean local = url.isLocal();
@@ -185,22 +183,15 @@ public class Latency {
         // consider so many external accesses
         waiting = Math.max(waiting, host.average() * 2);

+        // the time since last access to the domain is the basis of the remaining calculation
+        final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
+
         // find the delay as given by robots.txt on target site
-        long robotsDelay = 0;
-        RobotsTxtEntry robotsEntry;
-        try {
-            robotsEntry = robots.getEntry(url, thisAgents);
-        } catch (final IOException e) {
-            robotsEntry = null;
-        }
-        robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
-        if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return 0; // no limits if granted exclusively for this peer
+        long robotsDelay = waitingRobots(url, robots, thisAgents);
+        if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer

         waiting = Math.max(waiting, robotsDelay);
-
-        // the time since last access to the domain is the basis of the remaining calculation
-        final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
-        return Math.max(0, Math.min(60000, waiting) - timeSinceLastAccess);
+        return Math.min(60000, waiting) - timeSinceLastAccess;
     }
@@ -235,15 +226,8 @@ public class Latency {
         waiting = Math.max(waiting, host.average() * 2);

         // find the delay as given by robots.txt on target site
-        long robotsDelay = 0;
-        RobotsTxtEntry robotsEntry;
-        try {
-            robotsEntry = robots.getEntry(url, thisAgents);
-        } catch (final IOException e) {
-            robotsEntry = null;
-        }
-        robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
-        if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
+        long robotsDelay = waitingRobots(url, robots, thisAgents);
+        if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer

         waiting = Math.max(waiting, robotsDelay);
         s.append(", robots.delay = ").append(robotsDelay);
@@ -262,26 +246,24 @@ public class Latency {
         private int count;
         private final String host;
         private long robotsMinDelay;
-        public Host(final String host, final long time) {
+        private Host(final String host, final long time) {
+            this(host, time, 0);
+        }
+        private Host(final String host, final long time, long robotsMinDelay) {
             this.host = host;
             this.timeacc = time;
             this.count = 1;
             this.lastacc = System.currentTimeMillis();
-            this.robotsMinDelay = 0;
+            this.robotsMinDelay = robotsMinDelay;
         }
-        public void update(final long time) {
+        private void update(final long time) {
             this.lastacc = System.currentTimeMillis();
             this.timeacc += Math.min(30000, time);
             this.count++;
         }
-        public void update() {
+        private void update() {
             this.lastacc = System.currentTimeMillis();
         }
-        public void slowdown() {
-            this.lastacc = System.currentTimeMillis();
-            this.timeacc = Math.min(60000, average() * 2);
-            this.count = 1;
-        }
         public int count() {
             return this.count;
         }
@@ -294,14 +276,11 @@ public class Latency {
         public String host() {
             return this.host;
         }
-        public void robotsDelay(final long ur) {
-            this.robotsMinDelay = ur;
-        }
         public long robotsDelay() {
             return this.robotsMinDelay;
         }
         public long flux(final long range) {
-            return this.count >= 1000 ? range * Math.min(5000, this.count) / 1000 : range / (1000 - this.count);
+            return this.count >= 10000 ? range * Math.min(5000, this.count) / 10000 : range / (10000 - this.count);
        }
     }
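
The flux() change above raises the slowdown threshold from 1000 to 10000 accesses per host, one of the delta-time fixes from the commit message. A quick worked check, with the new formula copied for illustration:

    final class FluxCheck {
        // formula as changed in the diff: the flux penalty now ramps up over 10000 accesses
        static long flux(long range, int count) {
            return count >= 10000 ? range * Math.min(5000, count) / 10000 : range / (10000 - count);
        }
        public static void main(String[] args) {
            System.out.println(flux(500, 5000));  // 0   (the old 1000-access threshold gave 2500 here)
            System.out.println(flux(500, 20000)); // 250 (500 * 5000 / 10000)
        }
    }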

NoticedURL.java

@@ -51,15 +51,15 @@ public class NoticedURL {
         LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
     }

-    public static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
+    private static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
     public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain

     private Balancer coreStack; // links found by crawling to depth-1
     private Balancer limitStack; // links found by crawling at target depth
     private Balancer remoteStack; // links from remote crawl orders
     private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry

-    public NoticedURL(
+    protected NoticedURL(
             final File cachePath,
             final Set<String> myAgentIDs,
             final boolean useTailCache,
@@ -87,7 +87,7 @@ public class NoticedURL {
         this.noloadStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
     }

-    public void clear() {
+    protected void clear() {
         Log.logInfo("NoticedURL", "CLEARING ALL STACKS");
         this.coreStack.clear();
         this.limitStack.clear();
@@ -95,7 +95,7 @@ public class NoticedURL {
         this.noloadStack.clear();
     }

-    public synchronized void close() {
+    protected synchronized void close() {
         Log.logInfo("NoticedURL", "CLOSING ALL STACKS");
         if (this.coreStack != null) {
             this.coreStack.close();
@@ -158,7 +158,7 @@ public class NoticedURL {
         }
     }

-    public boolean existsInStack(final byte[] urlhashb) {
+    protected boolean existsInStack(final byte[] urlhashb) {
         return
             this.coreStack.has(urlhashb) ||
             this.limitStack.has(urlhashb) ||
@@ -193,7 +193,7 @@ public class NoticedURL {
         }
     }

-    public Request get(final byte[] urlhash) {
+    protected Request get(final byte[] urlhash) {
         Request entry = null;
         try {if ((entry = this.noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
         try {if ((entry = this.coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
@@ -235,7 +235,7 @@ public class NoticedURL {
     /**
      * get a list of domains that are currently maintained as domain stacks
-     * @return a map of clear text strings of host names to the size of the domain stacks
+     * @return a map of clear text strings of host names to two integers: the size of the domain stacks and the access delta time
      */
     public Map<String, Integer[]> getDomainStackHosts(final StackType stackType) {
         switch (stackType) {
@@ -247,20 +247,6 @@ public class NoticedURL {
         }
     }

-    /**
-     * get a list of domains that are currently maintained as domain stacks
-     * @return a collection of clear text strings of host names
-     */
-    public long getDomainSleepTime(final StackType stackType, final RobotsTxt robots, final CrawlSwitchboard cs, Request crawlEntry) {
-        switch (stackType) {
-            case LOCAL:  return this.coreStack.getDomainSleepTime(cs, robots, crawlEntry);
-            case GLOBAL: return this.limitStack.getDomainSleepTime(cs, robots, crawlEntry);
-            case REMOTE: return this.remoteStack.getDomainSleepTime(cs, robots, crawlEntry);
-            case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, robots, crawlEntry);
-            default: return 0;
-        }
-    }
-
     /**
      * get lists of crawl request entries for a specific host
      * @param host
@@ -287,7 +273,7 @@ public class NoticedURL {
         }
     }

-    public void shift(final StackType fromStack, final StackType toStack, final CrawlSwitchboard cs, final RobotsTxt robots) {
+    protected void shift(final StackType fromStack, final StackType toStack, final CrawlSwitchboard cs, final RobotsTxt robots) {
         try {
             final Request entry = pop(fromStack, false, cs, robots);
             if (entry != null) {

FTPLoader.java

@@ -158,7 +158,7 @@ public class FTPLoader {
             throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
         }

-        Latency.update(request.url(), System.currentTimeMillis() - start);
+        Latency.updateAfterLoad(request.url(), System.currentTimeMillis() - start);
         return response;
     }

HTTPLoader.java

@@ -73,7 +73,7 @@ public final class HTTPLoader {
     public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
         final long start = System.currentTimeMillis();
         final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType);
-        Latency.update(entry.url(), System.currentTimeMillis() - start);
+        Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }
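
Both loader hunks follow the same pattern: measure the wall-clock load time and report it once per completed fetch through the renamed updateAfterLoad entry point. A self-contained sketch with a stand-in recorder (the real sink is the Latency class above; names here are illustrative):

    import java.util.concurrent.ConcurrentHashMap;

    // Stand-in for the Latency bookkeeping, to show the call sites' contract:
    // one updateAfterLoad per completed fetch, keyed by host.
    final class LoadTimeRecorder {
        private static final ConcurrentHashMap<String, Long> totalMillis = new ConcurrentHashMap<>();
        private static final ConcurrentHashMap<String, Integer> counts = new ConcurrentHashMap<>();

        static void updateAfterLoad(String host, long millis) {
            totalMillis.merge(host, Math.min(30000, millis), Long::sum); // cap per access, like Host.update
            counts.merge(host, 1, Integer::sum);
        }

        static long average(String host) {
            Integer c = counts.get(host);
            return c == null ? 0 : totalMillis.get(host) / c;
        }

        public static void main(String[] args) {
            long start = System.currentTimeMillis();
            // ... perform the actual fetch here ...
            updateAfterLoad("example.org", System.currentTimeMillis() - start);
            System.out.println(average("example.org"));
        }
    }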
