update to Balancer algorithm:

- create a load list from the current list of known hosts
- do not create this list on each Balancer.pop access
- create the list from those hosts which have a zero waiting time
- select the 1/3 of that list with the most URLs waiting
- get hosts from the waiting list in random order (see the sketch below)
- fixes for some delta-time computations
- always load all URLs from hosts which have never been loaded before
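
The zero-waiting selection described above can be sketched as follows. This is a minimal illustration, not the patched Balancer code: host bookkeeping is reduced here to a map from host name to a {waiting time, queued-URL count} pair, and all names (ZeroWaitingSketch, refill, next) are hypothetical.

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Random;

class ZeroWaitingSketch {
    private final Random random = new Random();
    private final List<Map.Entry<String, Integer>> zeroWaitingCandidates = new ArrayList<Map.Entry<String, Integer>>();

    // rebuild the candidate list; hosts maps a host name to {waitingMillis, urlsQueued}
    void refill(final Map<String, long[]> hosts) {
        final List<Map.Entry<String, Integer>> zero = new ArrayList<Map.Entry<String, Integer>>();
        for (final Map.Entry<String, long[]> e : hosts.entrySet()) {
            // keep only hosts whose waiting time has elapsed (zero or negative)
            if (e.getValue()[0] <= 0) {
                zero.add(new AbstractMap.SimpleEntry<String, Integer>(e.getKey(), (int) e.getValue()[1]));
            }
        }
        // order by queued-URL count, descending, and keep the 1/3 with the most urls waiting
        zero.sort((a, b) -> b.getValue() - a.getValue());
        final int keep = Math.min(zero.size(), Math.max(1, zero.size() / 3));
        this.zeroWaitingCandidates.clear();
        this.zeroWaitingCandidates.addAll(zero.subList(0, keep));
    }

    // serve the selected hosts in random order; returns null when the list is exhausted
    String next() {
        if (this.zeroWaitingCandidates.isEmpty()) return null;
        final int idx = this.random.nextInt(this.zeroWaitingCandidates.size());
        return this.zeroWaitingCandidates.remove(idx).getKey();
    }
}

In the patch itself a host that was never accessed before reports Integer.MIN_VALUE as its waiting time, so such hosts always qualify as zero-waiting candidates and receive a high score to prefer them.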
orbiter 12 years ago
parent 354f0d9acd
commit 8952153ecf

@@ -130,7 +130,7 @@ public class IndexCreateQueues_p {
prop.put("crawler_host_" + hc + "_urlsPerHost", urlsPerHost);
prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1]);
prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1] == Integer.MIN_VALUE ? "not accessed" : Integer.toString(host.getValue()[1]));
List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
Seed initiator;

@@ -28,11 +28,12 @@ package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
@@ -44,6 +45,7 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.Cache;
@@ -78,6 +80,8 @@ public class Balancer {
private final HandleSet double_push_check; // for debugging
private long lastDomainStackFill;
private int domStackInitSize;
private final List<Map.Entry<String, byte[]>> zeroWaitingCandidates;
private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting
public Balancer(
final File cachePath,
@@ -94,7 +98,9 @@ public class Balancer {
this.myAgentIDs = myAgentIDs;
this.domStackInitSize = Integer.MAX_VALUE;
this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.zeroWaitingCandidates = new ArrayList<Map.Entry<String, byte[]>>();
this.random = new Random(System.currentTimeMillis());
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
this.cacheStacksPath.mkdirs();
@@ -205,6 +211,12 @@ public class Balancer {
for (final byte[] handle: urlHashes) stack.remove(handle);
if (stack.isEmpty()) q.remove();
}
// iterate through the zero-waiting candidate list
final Iterator<Map.Entry<String, byte[]>> i = this.zeroWaitingCandidates.iterator();
while (i.hasNext()) {
if (urlHashes.has(i.next().getValue())) i.remove();
}
return removedCounter;
}
@@ -274,32 +286,35 @@ public class Balancer {
public Map<String, Integer[]> getDomainStackHosts() {
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
map.put(entry.getKey(), new Integer[]{entry.getValue().size(), (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)});
int size = entry.getValue().size();
int delta = (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
map.put(entry.getKey(), new Integer[]{size, delta});
}
return map;
}
/**
* compute the current sleep time for a given crawl entry
* @param cs
* @param crawlEntry
* @return
Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
public long getDomainSleepTime(final CrawlSwitchboard cs, final RobotsTxt robots, Request crawlEntry) {
final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
return getDomainSleepTime(robots, profileEntry, crawlEntry.url());
}
private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
if (profileEntry == null) {
return 0;
}
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? 0 : Latency.waitingRemaining(crawlURL, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
private long getRobotsTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = Latency.waitingRobots(crawlURL, robots, this.myAgentIDs); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
/**
* get lists of crawl request entries for a specific host
@@ -377,6 +392,7 @@ public class Balancer {
long sleeptime = 0;
Request crawlEntry = null;
CrawlProfile profileEntry = null;
synchronized (this) {
byte[] failhash = null;
while (!this.urlFileIndex.isEmpty()) {
@@ -408,7 +424,7 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
return null;
@@ -425,6 +441,8 @@ public class Balancer {
}
if (crawlEntry == null) return null;
long robotsTime = getRobotsTime(robots, profileEntry, crawlEntry.url());
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force a busy waiting here
// in best case, this should never happen if the balancer works properly
@@ -442,96 +460,119 @@ public class Balancer {
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {this.wait(1000); } catch (final InterruptedException e) {}
}
Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
}
Latency.update(crawlEntry.url());
return crawlEntry;
}
private byte[] getbest(final RobotsTxt robots) {
// check if we need to get entries from the file index
try {
fillDomainStacks();
} catch (final IOException e) {
Log.logException(e);
}
// iterate over the domain stacks
final Iterator<Map.Entry<String, HandleSet>> i = this.domainStacks.entrySet().iterator();
Map.Entry<String, HandleSet> entry;
long smallestWaiting = Long.MAX_VALUE;
byte[] besturlhash = null;
String besthost = null;
Map<String, byte[]> zeroWaitingCandidates = new HashMap<String, byte[]>();
while (i.hasNext() && zeroWaitingCandidates.size() < 10) {
entry = i.next();
// clean up empty entries
if (entry.getValue().isEmpty()) {
i.remove();
continue;
synchronized (this.zeroWaitingCandidates) {
if (this.zeroWaitingCandidates.size() > 0) {
byte[] urlhash = pickFromZeroWaiting();
if (urlhash != null) return urlhash;
}
final byte[] n = entry.getValue().removeOne();
if (n == null) continue;
long w;
Row.Entry rowEntry;
try {
rowEntry = this.urlFileIndex.get(n, false);
if (rowEntry == null) {
continue;
this.zeroWaitingCandidates.clear();
// check if we need to get entries from the file index
try {
fillDomainStacks();
} catch (final IOException e) {
Log.logException(e);
}
// iterate over the domain stacks
final Iterator<Map.Entry<String, HandleSet>> i = this.domainStacks.entrySet().iterator();
Map.Entry<String, HandleSet> entry;
long smallestWaiting = Long.MAX_VALUE;
byte[] besturlhash = null;
String besthost = null;
OrderedScoreMap<Map.Entry<String, byte[]>> nextZeroCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
while (i.hasNext() && nextZeroCandidates.size() < 1000) {
entry = i.next();
// clean up empty entries
if (entry.getValue().isEmpty()) {
i.remove();
continue;
}
final byte[] urlhash = entry.getValue().getOne(0);
if (urlhash == null) continue;
long w;
Row.Entry rowEntry;
try {
rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) {
continue;
}
Request crawlEntry = new Request(rowEntry);
w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
//System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
//System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
} catch (IOException e1) {
w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
}
Request crawlEntry = new Request(rowEntry);
w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
//System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
//System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
} catch (IOException e1) {
w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
}
if (w < smallestWaiting) {
smallestWaiting = w;
besturlhash = n;
besthost = entry.getKey();
if (w <= 0) {
zeroWaitingCandidates.put(besthost, besturlhash);
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), w == Integer.MIN_VALUE ? 1000 /* give new domains a chance */ : entry.getValue().size());
}
}
try {
entry.getValue().put(n); // put entry back, we are checking only
} catch (SpaceExceededException e) {
e.printStackTrace();
}
}
if (w < smallestWaiting || (w == smallestWaiting && this.random.nextBoolean())) {
smallestWaiting = w;
besturlhash = urlhash;
besthost = entry.getKey();
}
}
Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
if (besturlhash == null) {
Log.logInfo("Balancer", "*** getbest: besturlhash == null");
return null; // this should never happen
}
// best case would be, if we have some zeroWaitingCandidates,
// then we select that one with the largest stack
if (nextZeroCandidates.isEmpty()) {
// bad luck: just take that one with least waiting
removeHashFromDomainStacks(besthost, besturlhash);
Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost);
return besturlhash;
}
// now take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates
int pick = nextZeroCandidates.size() <= 10 ? nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3);
Iterator<Map.Entry<String, byte[]>> k = nextZeroCandidates.keys(false);
while (k.hasNext() && pick-- > 0) {
this.zeroWaitingCandidates.add(k.next());
}
Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
return pickFromZeroWaiting();
}
}
if (besturlhash == null) return null; // worst case
// best case would be, if we have some zeroWaitingCandidates,
// then we select that one with the largest stack
if (!zeroWaitingCandidates.isEmpty()) {
int largestStack = -1;
String largestStackHost = null;
byte[] largestStackHash = null;
for (Map.Entry<String, byte[]> z: zeroWaitingCandidates.entrySet()) {
HandleSet hs = this.domainStacks.get(z.getKey());
if (hs == null || hs.size() <= largestStack) continue;
largestStack = hs.size();
largestStackHost = z.getKey();
largestStackHash = z.getValue();
}
if (largestStackHost != null && largestStackHash != null) {
removeHashFromDomainStacks(largestStackHost, largestStackHash);
//Log.logInfo("Balancer", "*** picked one from largest stack");
return largestStackHash;
}
}
private byte[] pickFromZeroWaiting() {
// choose at random either from the largest stack or from any of the other stacks
String host = null;
byte[] hash = null;
while (this.zeroWaitingCandidates.size() > 0) {
Map.Entry<String, byte[]> z = this.zeroWaitingCandidates.remove(this.random.nextInt(this.zeroWaitingCandidates.size()));
HandleSet hs = this.domainStacks.get(z.getKey());
if (hs == null) continue;
host = z.getKey(); if (host == null) continue;
hash = z.getValue(); if (hash == null) continue;
removeHashFromDomainStacks(host, hash);
Log.logInfo("Balancer", "*** getbest: picked a random from the zero-waiting stack: " + host + ", zeroWaitingCandidates.size = " + this.zeroWaitingCandidates.size());
return hash;
}
// default case: just take that one with least waiting
removeHashFromDomainStacks(besthost, besturlhash);
return besturlhash;
Log.logInfo("Balancer", "*** getbest: picking from zero-waiting stack failed!" + " zeroWaitingCandidates.size = " + this.zeroWaitingCandidates.size());
this.zeroWaitingCandidates.clear();
return null;
}
private void fillDomainStacks() throws IOException {
if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
this.domainStacks.clear();

@@ -43,7 +43,12 @@ public class Latency {
// the map is a mapping from host names to host configurations
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
public static void update(final MultiProtocolURI url, final long time) {
/**
* update the latency entry after a host was accessed to load a file
* @param url
* @param time the time to load the file in milliseconds
*/
public static void updateAfterLoad(final MultiProtocolURI url, final long time) {
final String host = url.getHost();
if (host == null) return;
Host h = map.get(host);
@@ -56,12 +61,17 @@ public class Latency {
}
}
public static void update(final MultiProtocolURI url) {
/**
* update the latency entry after a host was selected for queueing into the loader
* @param url
* @param robotsCrawlDelay the crawl-delay given by the robots; 0 if not exist
*/
public static void updateAfterSelection(final MultiProtocolURI url, final long robotsCrawlDelay) {
final String host = url.getHost();
if (host == null) return;
Host h = map.get(host);
if (h == null) {
h = new Host(host, DEFAULT_AVERAGE);
h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay);
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
map.put(host, h);
} else {
@@ -69,51 +79,38 @@ public class Latency {
}
}
public static void slowdown(final MultiProtocolURI url) {
final String host = url.getHost();
if (host == null) return;
Host h = map.get(host);
if (h == null) {
h = new Host(host, DEFAULT_AVERAGE);
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
map.put(host, h);
} else {
h.slowdown();
}
}
public static Host host(final MultiProtocolURI url) {
private static Host host(final MultiProtocolURI url) {
final String host = url.getHost();
if (host == null) return null;
return map.get(host);
}
public static int average(final MultiProtocolURI url) {
final String host = url.getHost();
if (host == null) return 0;
final Host h = map.get(host);
if (h == null) return 0;
return h.average();
}
public static Iterator<Map.Entry<String, Host>> iterator() {
return map.entrySet().iterator();
}
/**
* calculate the time since the last access of the domain as referenced by the url hash
* @param urlhash
* @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
* Return the waiting time demanded by the robots.txt file of the target host.
A special case: if the remote host assigns a crawl-delay of 0 specifically to this crawler, -1 is returned.
* @param url
* @param robots
* @param thisAgents
* @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights
*/
public static long lastAccessDelta(final MultiProtocolURI url) {
final Latency.Host host = Latency.host(url);
if (host == null) return Long.MAX_VALUE; // never accessed
return System.currentTimeMillis() - host.lastacc();
public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
long robotsDelay = 0;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = robots.getEntry(url, thisAgents);
} catch (final IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
return robotsDelay;
}
/**
* guess a minimum waiting time
the time may be too low, because the domain may not yet have been checked against the robots.txt delay value
@@ -125,11 +122,11 @@ public class Latency {
* which expresses how long the time is over the minimum waiting time.
*/
public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) {
if (hostname == null) return Long.MIN_VALUE;
if (hostname == null) return Integer.MIN_VALUE;
// first check if the domain was _ever_ accessed before
final Host host = map.get(hostname);
if (host == null) return Long.MIN_VALUE; // no delay if host is new
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global)
final boolean local = Domains.isLocal(hostname, null);
@@ -139,14 +136,15 @@ public class Latency {
// if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting);
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting, host.average() * 2);
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
return Math.max(0, Math.min(60000, waiting) - timeSinceLastAccess);
return Math.min(60000, waiting) - timeSinceLastAccess;
}
/**
@@ -159,13 +157,13 @@ public class Latency {
* - and a given minimum access time as given in robots.txt
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
@return the remaining waiting time in milliseconds; may be negative to reflect the time elapsed since the next possible loading time
*/
public static long waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);
if (host == null) return Long.MIN_VALUE; // no delay if host is new
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global)
final boolean local = url.isLocal();
@@ -185,22 +183,15 @@ public class Latency {
// consider so many external accesses
waiting = Math.max(waiting, host.average() * 2);
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
// find the delay as given by robots.txt on target site
long robotsDelay = 0;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = robots.getEntry(url, thisAgents);
} catch (final IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return 0; // no limits if granted exclusively for this peer
long robotsDelay = waitingRobots(url, robots, thisAgents);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
return Math.max(0, Math.min(60000, waiting) - timeSinceLastAccess);
return Math.min(60000, waiting) - timeSinceLastAccess;
}
@@ -235,15 +226,8 @@ public class Latency {
waiting = Math.max(waiting, host.average() * 2);
// find the delay as given by robots.txt on target site
long robotsDelay = 0;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = robots.getEntry(url, thisAgents);
} catch (final IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
long robotsDelay = waitingRobots(url, robots, thisAgents);
if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
s.append(", robots.delay = ").append(robotsDelay);
@@ -262,26 +246,24 @@ public class Latency {
private int count;
private final String host;
private long robotsMinDelay;
public Host(final String host, final long time) {
private Host(final String host, final long time) {
this(host, time, 0);
}
private Host(final String host, final long time, long robotsMinDelay) {
this.host = host;
this.timeacc = time;
this.count = 1;
this.lastacc = System.currentTimeMillis();
this.robotsMinDelay = 0;
this.robotsMinDelay = robotsMinDelay;
}
public void update(final long time) {
private void update(final long time) {
this.lastacc = System.currentTimeMillis();
this.timeacc += Math.min(30000, time);
this.count++;
}
public void update() {
private void update() {
this.lastacc = System.currentTimeMillis();
}
public void slowdown() {
this.lastacc = System.currentTimeMillis();
this.timeacc = Math.min(60000, average() * 2);
this.count = 1;
}
public int count() {
return this.count;
}
@@ -294,14 +276,11 @@ public class Latency {
public String host() {
return this.host;
}
public void robotsDelay(final long ur) {
this.robotsMinDelay = ur;
}
public long robotsDelay() {
return this.robotsMinDelay;
}
public long flux(final long range) {
return this.count >= 1000 ? range * Math.min(5000, this.count) / 1000 : range / (1000 - this.count);
return this.count >= 10000 ? range * Math.min(5000, this.count) / 10000 : range / (10000 - this.count);
}
}
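// Worked example for the revised flux formula above (illustrative; the
// range value of 500 ms is an assumption):
//   count = 100   -> 500 / (10000 - 100)  = 0 ms extra delay (integer division)
//   count = 10000 -> 500 * 5000 / 10000   = 250 ms extra delay
// With the previous divisor of 1000, count = 1000 already yielded
// 500 * 1000 / 1000 = 500 ms, so the slow-down now sets in ten times later.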

@@ -51,15 +51,15 @@ public class NoticedURL {
LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
}
public static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
private static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth
private Balancer remoteStack; // links from remote crawl orders
private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
public NoticedURL(
protected NoticedURL(
final File cachePath,
final Set<String> myAgentIDs,
final boolean useTailCache,
@@ -87,7 +87,7 @@ public class NoticedURL {
this.noloadStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
}
public void clear() {
protected void clear() {
Log.logInfo("NoticedURL", "CLEARING ALL STACKS");
this.coreStack.clear();
this.limitStack.clear();
@@ -95,7 +95,7 @@ public class NoticedURL {
this.noloadStack.clear();
}
public synchronized void close() {
protected synchronized void close() {
Log.logInfo("NoticedURL", "CLOSING ALL STACKS");
if (this.coreStack != null) {
this.coreStack.close();
@@ -158,7 +158,7 @@ public class NoticedURL {
}
}
public boolean existsInStack(final byte[] urlhashb) {
protected boolean existsInStack(final byte[] urlhashb) {
return
this.coreStack.has(urlhashb) ||
this.limitStack.has(urlhashb) ||
@@ -193,7 +193,7 @@ public class NoticedURL {
}
}
public Request get(final byte[] urlhash) {
protected Request get(final byte[] urlhash) {
Request entry = null;
try {if ((entry = this.noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
try {if ((entry = this.coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
@@ -235,7 +235,7 @@ public class NoticedURL {
/**
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to the size of the domain stacks
* @return a map of clear text strings of host names to two integers: the size of the domain stacks and the access delta time
*/
public Map<String, Integer[]> getDomainStackHosts(final StackType stackType) {
switch (stackType) {
@@ -247,20 +247,6 @@ public class NoticedURL {
}
}
/**
* get a list of domains that are currently maintained as domain stacks
* @return a collection of clear text strings of host names
*/
public long getDomainSleepTime(final StackType stackType, final RobotsTxt robots, final CrawlSwitchboard cs, Request crawlEntry) {
switch (stackType) {
case LOCAL: return this.coreStack.getDomainSleepTime(cs, robots, crawlEntry);
case GLOBAL: return this.limitStack.getDomainSleepTime(cs, robots, crawlEntry);
case REMOTE: return this.remoteStack.getDomainSleepTime(cs, robots, crawlEntry);
case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, robots, crawlEntry);
default: return 0;
}
}
/**
* get lists of crawl request entries for a specific host
* @param host
@@ -287,7 +273,7 @@ public class NoticedURL {
}
}
public void shift(final StackType fromStack, final StackType toStack, final CrawlSwitchboard cs, final RobotsTxt robots) {
protected void shift(final StackType fromStack, final StackType toStack, final CrawlSwitchboard cs, final RobotsTxt robots) {
try {
final Request entry = pop(fromStack, false, cs, robots);
if (entry != null) {

@@ -158,7 +158,7 @@ public class FTPLoader {
throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
}
Latency.update(request.url(), System.currentTimeMillis() - start);
Latency.updateAfterLoad(request.url(), System.currentTimeMillis() - start);
return response;
}

@@ -73,7 +73,7 @@ public final class HTTPLoader {
public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
final long start = System.currentTimeMillis();
final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType);
Latency.update(entry.url(), System.currentTimeMillis() - start);
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
return doc;
}
