enhancements to crawler

pull/1/head
Michael Peter Christen 11 years ago
parent 232100301c
commit 735a66eff3

@ -304,9 +304,11 @@ public class Balancer {
/**
 * Builds an overview of all hosts that currently have an entry in the domain stacks.
 *
 * For every host the result holds a two-element array:
 * index 0 is the number of handles in that host's handle set,
 * index 1 is the value returned by Latency.waitingRemainingGuessed for that host
 * (presumably the estimated remaining crawl delay; unit not visible here - TODO confirm).
 *
 * @param robots the robots.txt handler that is passed through to Latency.waitingRemainingGuessed
 * @return a map from host name to {handle count, guessed remaining waiting value}, sorted by host name
 */
public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
    Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
    for (Map.Entry<String, HostHandles> entry: this.domainStacks.entrySet()) {
        // read key and value once; both are used several times below
        final String hostname = entry.getKey();
        final HostHandles hosthandles = entry.getValue();
        int size = hosthandles.handleSet.size();
        int delta = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
        map.put(hostname, new Integer[]{size, delta});
    }
    return map;
}
@ -429,9 +431,9 @@ public class Balancer {
byte[] failhash = null;
while (!this.urlFileIndex.isEmpty()) {
byte[] nexthash = getbest(robots);
if (nexthash == null) return null;
synchronized (this) {
if (nexthash == null) return null;
Row.Entry rowEntry = (nexthash == null) ? null : this.urlFileIndex.remove(nexthash);
if (rowEntry == null) continue;
@ -515,43 +517,43 @@ public class Balancer {
int newCandidatesForward = 1;
while (i.hasNext() && nextZeroCandidates.size() < 1000) {
entry = i.next();
final String hostname = entry.getKey();
final HostHandles hosthandles = entry.getValue();
// clean up empty entries
if (entry.getValue().handleSet.isEmpty()) {
if (hosthandles.handleSet.isEmpty()) {
i.remove();
continue;
}
final byte[] urlhash = entry.getValue().handleSet.getOne(0);
final byte[] urlhash = hosthandles.handleSet.getOne(0);
if (urlhash == null) continue;
int w;
Row.Entry rowEntry;
try {
rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) {
continue;
}
if (rowEntry == null) continue; // may have been deleted there manwhile
Request crawlEntry = new Request(rowEntry);
w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
//System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
//System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(hostname, this.minimumLocalDelta, this.minimumGlobalDelta));
//System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
} catch (final IOException e1) {
w = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
w = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
}
if (w <= 0) {
if (w == Integer.MIN_VALUE) {
if (newCandidatesForward-- > 0) {
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 10000);
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), 10000);
} else {
failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 0);
failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), 0);
}
} else {
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), entry.getValue().handleSet.size());
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), hosthandles.handleSet.size());
}
} else {
failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), w);
failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), w);
}
}
//Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
@ -576,7 +578,7 @@ public class Balancer {
Map.Entry<String, byte[]> hosthash;
while (k.hasNext()) {
hosthash = k.next();
if (failoverCandidates.get(hosthash) > 1000) break; // thats too long; we want a second chance for this!
if (failoverCandidates.get(hosthash) > 2000) break; // thats too long; we want a second chance for this!
besthost = hosthash.getKey();
besturlhash = hosthash.getValue();
removeHashFromDomainStacks(besthost, besturlhash);

@ -232,12 +232,18 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
/**
 * Returns randomly selected entries taken from both the buffer and the backend.
 *
 * Up to half of the requested entries (but at least one) are requested from the buffer,
 * the remainder from the backend. The two partial results are then interleaved
 * (buffer entry, backend entry, buffer entry, ...) so that neither source is
 * clustered at the head of the returned list.
 *
 * @param count the maximum number of entries to return; a value of zero or less yields an empty list
 * @return a new list with at most count entries; it may be shorter if the sources return fewer entries
 * @throws IOException if reading from the buffer or the backend fails
 */
@Override
public List<Row.Entry> random(final int count) throws IOException {
    // nothing requested: do not touch the indexes and never ask the backend for a negative amount
    if (count <= 0) return new ArrayList<Row.Entry>(0);
    List<Row.Entry> list0, list1;
    // both draws happen while holding the backend monitor
    synchronized (this.backend) {
        list0 = this.buffer.random(Math.max(1, count / 2));
        list1 = this.backend.random(count - list0.size());
    }
    // multiplex the lists
    final List<Row.Entry> list = new ArrayList<Row.Entry>(list0.size() + list1.size());
    Iterator<Row.Entry> i0 = list0.iterator();
    Iterator<Row.Entry> i1 = list1.iterator();
    while (i0.hasNext() || i1.hasNext()) {
        if (i0.hasNext()) list.add(i0.next());
        if (i1.hasNext()) list.add(i1.next());
    }
    return list;
}

Loading…
Cancel
Save