enhanced data structures for balancer and latency computation, which

should produce a slightly better prognosis of forced waiting times.
pull/1/head
Michael Peter Christen 12 years ago
parent ac9540dfb6
commit 0fe8be7981

@ -121,7 +121,7 @@ public class IndexCreateQueues_p {
prop.put("crawler_embed_deletepattern", deletepattern);
prop.put("crawler_embed_queuename", stackType.name());
final Map<String, Integer[]> hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType);
final Map<String, Integer[]> hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType, sb.robots);
int hc = 0;
for (Map.Entry<String, Integer[]> host: hosts.entrySet()) {

@ -76,13 +76,22 @@ public class Balancer {
private BufferedObjectIndex urlFileIndex;
// class variables computed during operation
private final ConcurrentMap<String, HandleSet> domainStacks; // a map from host name to lists with url hashs
private final ConcurrentMap<String, HostHandles> domainStacks; // a map from host name to lists with url hashs
private final HandleSet double_push_check; // for debugging
private long lastDomainStackFill;
private int domStackInitSize;
private final List<Map.Entry<String, byte[]>> zeroWaitingCandidates;
private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting
/**
 * Value holder pairing a host's hash with the set of URL hashes currently
 * queued for that host in the domain stacks.
 * Both references are assigned exactly once in the constructor and only read
 * afterwards, so the fields are declared final (immutable holder).
 */
private static class HostHandles {
    public final String hosthash;     // hash of the host name; used as key for latency and robots.txt lookups
    public final HandleSet handleSet; // url hashes queued for this host
    public HostHandles(final String hosthash, final HandleSet handleSet) {
        this.hosthash = hosthash;
        this.handleSet = handleSet;
    }
}
public Balancer(
final File cachePath,
final String stackname,
@ -92,7 +101,7 @@ public class Balancer {
final boolean useTailCache,
final boolean exceed134217727) {
this.cacheStacksPath = cachePath;
this.domainStacks = new ConcurrentHashMap<String, HandleSet>();
this.domainStacks = new ConcurrentHashMap<String, HostHandles>();
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
this.myAgentIDs = myAgentIDs;
@ -204,10 +213,10 @@ public class Balancer {
assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s;
// iterate through the domain stacks
final Iterator<Map.Entry<String, HandleSet>> q = this.domainStacks.entrySet().iterator();
final Iterator<Map.Entry<String, HostHandles>> q = this.domainStacks.entrySet().iterator();
HandleSet stack;
while (q.hasNext()) {
stack = q.next().getValue();
stack = q.next().getValue().handleSet;
for (final byte[] handle: urlHashes) stack.remove(handle);
if (stack.isEmpty()) q.remove();
}
@ -242,8 +251,8 @@ public class Balancer {
private boolean domainStacksNotEmpty() {
if (this.domainStacks == null) return false;
synchronized (this.domainStacks) {
for (final HandleSet l: this.domainStacks.values()) {
if (!l.isEmpty()) return true;
for (final HostHandles l: this.domainStacks.values()) {
if (!l.handleSet.isEmpty()) return true;
}
}
return false;
@ -285,11 +294,11 @@ public class Balancer {
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
*/
public Map<String, Integer[]> getDomainStackHosts() {
public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
int size = entry.getValue().size();
int delta = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
for (Map.Entry<String, HostHandles> entry: this.domainStacks.entrySet()) {
int size = entry.getValue().handleSet.size();
int delta = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
map.put(entry.getKey(), new Integer[]{size, delta});
}
return map;
@ -333,8 +342,10 @@ public class Balancer {
* @return a list of crawl loader requests
*/
public List<Request> getDomainStackReferences(String host, int maxcount) {
HandleSet domainList = this.domainStacks.get(host);
if (domainList == null || domainList.isEmpty()) return new ArrayList<Request>(0);
HostHandles hh = this.domainStacks.get(host);
if (hh == null) return new ArrayList<Request>(0);
HandleSet domainList = hh.handleSet;
if (domainList.isEmpty()) return new ArrayList<Request>(0);
ArrayList<Request> cel = new ArrayList<Request>(maxcount);
for (int i = 0; i < maxcount; i++) {
if (domainList.size() <= i) break;
@ -358,16 +369,17 @@ public class Balancer {
return cel;
}
private void pushHashToDomainStacks(String host, final byte[] urlhash) throws SpaceExceededException {
private void pushHashToDomainStacks(String host, String hosthash, final byte[] urlhash) throws SpaceExceededException {
// extend domain stack
if (host == null) host = Domains.LOCALHOST;
HandleSet domainList = this.domainStacks.get(host);
if (domainList == null) {
HostHandles hh = this.domainStacks.get(host);
if (hh == null) {
// create new list
domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1);
HandleSet domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1);
domainList.put(urlhash);
this.domainStacks.put(host, domainList);
this.domainStacks.put(host, new HostHandles(hosthash, domainList));
} else {
HandleSet domainList = hh.handleSet;
// extend existent domain list
domainList.put(urlhash);
}
@ -376,11 +388,12 @@ public class Balancer {
private void removeHashFromDomainStacks(String host, final byte[] urlhash) {
// reduce domain stack
if (host == null) host = Domains.LOCALHOST;
final HandleSet domainList = this.domainStacks.get(host);
if (domainList == null) {
HostHandles hh = this.domainStacks.get(host);
if (hh == null) {
this.domainStacks.remove(host);
return;
}
HandleSet domainList = hh.handleSet;
domainList.remove(urlhash);
if (domainList.isEmpty()) this.domainStacks.remove(host);
}
@ -495,26 +508,24 @@ public class Balancer {
}
// iterate over the domain stacks
final Iterator<Map.Entry<String, HandleSet>> i = this.domainStacks.entrySet().iterator();
Map.Entry<String, HandleSet> entry;
long smallestWaiting = Long.MAX_VALUE;
byte[] besturlhash = null;
String besthost = null;
OrderedScoreMap<Map.Entry<String, byte[]>> nextZeroCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
int newCandidatesForward = 10;
final Iterator<Map.Entry<String, HostHandles>> i = this.domainStacks.entrySet().iterator();
Map.Entry<String, HostHandles> entry;
OrderedScoreMap<Map.Entry<String, byte[]>> nextZeroCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
OrderedScoreMap<Map.Entry<String, byte[]>> failoverCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
int newCandidatesForward = 1;
while (i.hasNext() && nextZeroCandidates.size() < 1000) {
entry = i.next();
// clean up empty entries
if (entry.getValue().isEmpty()) {
if (entry.getValue().handleSet.isEmpty()) {
i.remove();
continue;
}
final byte[] urlhash = entry.getValue().getOne(0);
final byte[] urlhash = entry.getValue().handleSet.getOne(0);
if (urlhash == null) continue;
long w;
int w;
Row.Entry rowEntry;
try {
rowEntry = this.urlFileIndex.get(urlhash, false);
@ -526,50 +537,55 @@ public class Balancer {
//System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
//System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
} catch (IOException e1) {
w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
w = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
}
if (w <= 0) {
if (w == Integer.MIN_VALUE && newCandidatesForward > 0) {
// give new domains a chance, but not too much; otherwise a massive downloading of robots.txt from too much domains (dns lock!) will more likely block crawling
newCandidatesForward--;
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 1000);
if (w == Integer.MIN_VALUE) {
if (newCandidatesForward-- > 0) {
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 10000);
} else {
failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 0);
}
} else {
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), entry.getValue().size());
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), entry.getValue().handleSet.size());
}
}
if (w < smallestWaiting || (w == smallestWaiting && this.random.nextBoolean())) {
smallestWaiting = w;
besturlhash = urlhash;
besthost = entry.getKey();
} else {
failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), w);
}
}
Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
if (besturlhash == null) {
Log.logInfo("Balancer", "*** getbest: besturlhash == null");
return null; // this should never happen
if (!nextZeroCandidates.isEmpty()) {
// take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates
int pick = nextZeroCandidates.size() <= 10 ? nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3);
Iterator<Map.Entry<String, byte[]>> k = nextZeroCandidates.keys(false);
while (k.hasNext() && pick-- > 0) {
this.zeroWaitingCandidates.add(k.next());
}
Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
return pickFromZeroWaiting();
}
// best case would be, if we have some zeroWaitingCandidates,
// then we select that one with the largest stack
if (nextZeroCandidates.isEmpty()) {
if (!failoverCandidates.isEmpty()) {
// bad luck: just take that one with least waiting
removeHashFromDomainStacks(besthost, besturlhash);
Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost);
return besturlhash;
}
Iterator<Map.Entry<String, byte[]>> k = failoverCandidates.keys(true);
String besthost;
byte[] besturlhash;
Map.Entry<String, byte[]> hosthash;
while (k.hasNext()) {
hosthash = k.next();
besthost = hosthash.getKey();
besturlhash = hosthash.getValue();
removeHashFromDomainStacks(besthost, besturlhash);
Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost);
return besturlhash;
}
}
// now take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates
int pick = nextZeroCandidates.size() <= 10 ? nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3);
Iterator<Map.Entry<String, byte[]>> k = nextZeroCandidates.keys(false);
while (k.hasNext() && pick-- > 0) {
this.zeroWaitingCandidates.add(k.next());
}
Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
return pickFromZeroWaiting();
Log.logInfo("Balancer", "*** getbest: besturlhash == null");
return null; // this should never happen
}
}
@ -579,8 +595,8 @@ public class Balancer {
byte[] hash = null;
while (this.zeroWaitingCandidates.size() > 0) {
Map.Entry<String, byte[]> z = this.zeroWaitingCandidates.remove(this.random.nextInt(this.zeroWaitingCandidates.size()));
HandleSet hs = this.domainStacks.get(z.getKey());
if (hs == null) continue;
HostHandles hh = this.domainStacks.get(z.getKey());
if (hh == null) continue;
host = z.getKey(); if (host == null) continue;
hash = z.getValue(); if (hash == null) continue;
removeHashFromDomainStacks(host, hash);
@ -604,6 +620,7 @@ public class Balancer {
String host;
Request request;
int count = 0;
long timeout = System.currentTimeMillis() + 5000;
while (i.hasNext()) {
handle = i.next();
final Row.Entry entry = this.urlFileIndex.get(handle, false);
@ -611,12 +628,12 @@ public class Balancer {
request = new Request(entry);
host = request.url().getHost();
try {
pushHashToDomainStacks(host, handle);
pushHashToDomainStacks(host, request.url().hosthash(), handle);
} catch (final SpaceExceededException e) {
break;
}
count++;
if (this.domainStacks.size() >= 100 || (!this.domainStacks.isEmpty() && count > 600 * this.domainStacks.size())) break;
if (this.domainStacks.size() >= 1000 || count >= 100000 || System.currentTimeMillis() > timeout) break;
}
Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
this.domStackInitSize = this.domainStacks.size();

@ -32,6 +32,7 @@ import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.MemoryControl;
@ -47,14 +48,15 @@ public class Latency {
* @param url
* @param time the time to load the file in milliseconds
*/
public static void updateAfterLoad(final MultiProtocolURI url, final long time) {
public static void updateAfterLoad(final DigestURI url, final long time) {
final String host = url.getHost();
if (host == null) return;
Host h = map.get(host);
String hosthash = url.hosthash();
Host h = map.get(hosthash);
if (h == null) {
h = new Host(host, time);
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
map.put(host, h);
map.put(hosthash, h);
} else {
h.update(time);
}
@ -65,23 +67,24 @@ public class Latency {
* @param url
* @param robotsCrawlDelay the crawl-delay given by the robots; 0 if not exist
*/
public static void updateAfterSelection(final MultiProtocolURI url, final long robotsCrawlDelay) {
public static void updateAfterSelection(final DigestURI url, final long robotsCrawlDelay) {
final String host = url.getHost();
if (host == null) return;
Host h = map.get(host);
String hosthash = url.hosthash();
Host h = map.get(hosthash);
if (h == null) {
h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay);
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
map.put(host, h);
map.put(hosthash, h);
} else {
h.update();
}
}
private static Host host(final MultiProtocolURI url) {
private static Host host(final DigestURI url) {
final String host = url.getHost();
if (host == null) return null;
return map.get(host);
return map.get(url.hosthash());
}
public static Iterator<Map.Entry<String, Host>> iterator() {
@ -104,41 +107,58 @@ public class Latency {
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
return robotsDelay;
}
/**
 * Compute the waiting time imposed by the target host's robots.txt.
 * @param hostport host name including the port, e.g. "example.org:80"
 * @param robots the robots.txt store/loader
 * @param thisAgents the set of agent names this peer identifies as
 * @param fetchOnlineIfNotAvailableOrNotFresh whether a missing or stale
 *        robots.txt may be fetched from the network during this call
 * @return the crawl-delay in milliseconds (0 if none is given), or -1 if the
 *         robots.txt grants access exclusively to this peer's agent (no limits)
 */
private static int waitingRobots(final String hostport, final RobotsTxt robots, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
    final RobotsTxtEntry robotsEntry = robots.getEntry(hostport, thisAgents, fetchOnlineIfNotAvailableOrNotFresh);
    // fixed dead store: declare the delay once with its computed value instead of initializing to 0 and reassigning
    final int robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
    // no limits if access is granted exclusively for this peer's agent
    if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1;
    return robotsDelay;
}
/**
* guess a minimum waiting time
* the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low
* also the 'isCGI' property is missing, because the full text of the domain is unknown here
* @param hostname
* @param hosthash
* @param robots
* @param thisAgents
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds. The return value may be negative
* which expresses how long the time is over the minimum waiting time.
*/
public static int waitingRemainingGuessed(final String hostname, final int minimumLocalDelta, final int minimumGlobalDelta) {
if (hostname == null) return Integer.MIN_VALUE;
public static int waitingRemainingGuessed(final String hostname, final String hosthash, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
final Host host = map.get(hostname);
final Host host = map.get(hosthash);
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global)
int waiting = (Domains.isLocal(hostname, null)) ? minimumLocalDelta : minimumGlobalDelta;
// if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting);
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting, host.average() * 3 / 2);
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// find the delay as given by robots.txt on target site
if (robots != null) {
int robotsDelay = waitingRobots(hostname + ":80", robots, thisAgents, false);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
}
return Math.min(60000, waiting) - timeSinceLastAccess;
}
/**
* calculates how long should be waited until the domain can be accessed again
* this follows from:
@ -151,7 +171,7 @@ public class Latency {
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible nex loading time
*/
public static int waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
public static int waitingRemaining(final DigestURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);
@ -183,9 +203,8 @@ public class Latency {
waiting = Math.max(waiting, robotsDelay);
return Math.min(60000, waiting) - timeSinceLastAccess;
}
public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
public static String waitingRemainingExplain(final DigestURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);

@ -232,12 +232,12 @@ public class NoticedURL {
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to two integers: the size of the domain stacks and the access delta time
*/
public Map<String, Integer[]> getDomainStackHosts(final StackType stackType) {
public Map<String, Integer[]> getDomainStackHosts(final StackType stackType, RobotsTxt robots) {
switch (stackType) {
case LOCAL: return this.coreStack.getDomainStackHosts();
case GLOBAL: return this.limitStack.getDomainStackHosts();
case REMOTE: return this.remoteStack.getDomainStackHosts();
case NOLOAD: return this.noloadStack.getDomainStackHosts();
case LOCAL: return this.coreStack.getDomainStackHosts(robots);
case GLOBAL: return this.limitStack.getDomainStackHosts(robots);
case REMOTE: return this.remoteStack.getDomainStackHosts(robots);
case NOLOAD: return this.noloadStack.getDomainStackHosts(robots);
default: return null;
}
}

@ -93,12 +93,11 @@ public class RobotsTxt {
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
return getEntry(theURL, thisAgents, true);
return getEntry(getHostPort(theURL), thisAgents, true);
}
private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
public RobotsTxtEntry getEntry(final String urlHostPort, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
// this method will always return a non-null value
final String urlHostPort = getHostPort(theURL);
RobotsTxtEntry robotsTxt4Host = null;
Map<String, byte[]> record;
BEncodedHeap robotsTable = null;

@ -159,7 +159,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
}
}
public static String crawlStart(
protected static String crawlStart(
final Switchboard sb,
final DigestURI startURL,
final String urlMustMatch,

@ -293,7 +293,7 @@ public class RemoteSearch extends Thread {
}
}
};
solr.start();
if (targetPeer == null) solr.run(); else solr.start();
return solr;
}

Loading…
Cancel
Save