enhancements to crawler

pull/1/head
Michael Peter Christen 11 years ago
parent 232100301c
commit 735a66eff3

Balancer.java
@@ -304,9 +304,11 @@ public class Balancer {
     public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
         Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
         for (Map.Entry<String, HostHandles> entry: this.domainStacks.entrySet()) {
-            int size = entry.getValue().handleSet.size();
-            int delta = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
-            map.put(entry.getKey(), new Integer[]{size, delta});
+            final String hostname = entry.getKey();
+            final HostHandles hosthandles = entry.getValue();
+            int size = hosthandles.handleSet.size();
+            int delta = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
+            map.put(hostname, new Integer[]{size, delta});
         }
         return map;
     }
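For context, getDomainStackHosts() reports each host together with a two-element array of {queued URLs, guessed remaining wait in ms}. A minimal, hypothetical caller sketch that consumes such a map (names and example values invented, not part of the commit):

    import java.util.Map;
    import java.util.TreeMap;

    // Hypothetical consumer of the host -> {stack size, guessed wait} map
    // returned by Balancer.getDomainStackHosts(robots).
    public class DomainStackReport {
        public static void main(String[] args) {
            // stand-in data; in YaCy this would come from the balancer itself
            Map<String, Integer[]> stacks = new TreeMap<String, Integer[]>();
            stacks.put("example.org", new Integer[]{12, 0});    // 12 queued URLs, ready now
            stacks.put("example.net", new Integer[]{3, 1500});  // 3 queued URLs, 1.5 s to wait

            for (Map.Entry<String, Integer[]> entry : stacks.entrySet()) {
                int size = entry.getValue()[0];
                int delta = entry.getValue()[1];
                System.out.println(entry.getKey() + ": " + size + " URLs, "
                        + (delta <= 0 ? "ready" : "wait " + delta + " ms"));
            }
        }
    }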
@@ -429,9 +431,9 @@ public class Balancer {
         byte[] failhash = null;
         while (!this.urlFileIndex.isEmpty()) {
             byte[] nexthash = getbest(robots);
-            if (nexthash == null) return null;
             synchronized (this) {
+                if (nexthash == null) return null;
                 Row.Entry rowEntry = (nexthash == null) ? null : this.urlFileIndex.remove(nexthash);
                 if (rowEntry == null) continue;
@@ -515,43 +517,43 @@ public class Balancer {
             int newCandidatesForward = 1;
             while (i.hasNext() && nextZeroCandidates.size() < 1000) {
                 entry = i.next();
+                final String hostname = entry.getKey();
+                final HostHandles hosthandles = entry.getValue();
 
                 // clean up empty entries
-                if (entry.getValue().handleSet.isEmpty()) {
+                if (hosthandles.handleSet.isEmpty()) {
                     i.remove();
                     continue;
                 }
 
-                final byte[] urlhash = entry.getValue().handleSet.getOne(0);
+                final byte[] urlhash = hosthandles.handleSet.getOne(0);
                 if (urlhash == null) continue;
 
                 int w;
                 Row.Entry rowEntry;
                 try {
                     rowEntry = this.urlFileIndex.get(urlhash, false);
-                    if (rowEntry == null) {
-                        continue;
-                    }
+                    if (rowEntry == null) continue; // may have been deleted there manwhile
                     Request crawlEntry = new Request(rowEntry);
                     w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
-                    //System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
+                    //System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(hostname, this.minimumLocalDelta, this.minimumGlobalDelta));
                     //System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
                 } catch (final IOException e1) {
-                    w = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
+                    w = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
                 }
                 if (w <= 0) {
                     if (w == Integer.MIN_VALUE) {
                         if (newCandidatesForward-- > 0) {
-                            nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 10000);
+                            nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), 10000);
                         } else {
-                            failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 0);
+                            failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), 0);
                         }
                     } else {
-                        nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), entry.getValue().handleSet.size());
+                        nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), hosthandles.handleSet.size());
                     }
                 } else {
-                    failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), w);
+                    failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), w);
                 }
             }
             //Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
@@ -576,7 +578,7 @@ public class Balancer {
             Map.Entry<String, byte[]> hosthash;
             while (k.hasNext()) {
                 hosthash = k.next();
-                if (failoverCandidates.get(hosthash) > 1000) break; // thats too long; we want a second chance for this!
+                if (failoverCandidates.get(hosthash) > 2000) break; // thats too long; we want a second chance for this!
                 besthost = hosthash.getKey();
                 besturlhash = hosthash.getValue();
                 removeHashFromDomainStacks(besthost, besturlhash);
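The getbest() hunks above route each host by its remaining crawl delay w: hosts that may be fetched immediately (w <= 0) become zero-wait candidates, all others become failover candidates scored by their wait, and failover entries above the raised 2000 ms cutoff are passed over. A simplified, hypothetical sketch of that routing (plain lists stand in for YaCy's score containers; class and method names are invented for illustration only):

    import java.util.AbstractMap;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;

    // Simplified stand-in for the candidate routing in Balancer.getbest().
    public class CandidateRoutingSketch {

        static class Candidate {
            final Map.Entry<String, byte[]> hostAndHash;
            final int score; // stack size for ready hosts, remaining wait (ms) otherwise
            Candidate(String host, byte[] urlhash, int score) {
                this.hostAndHash = new AbstractMap.SimpleEntry<String, byte[]>(host, urlhash);
                this.score = score;
            }
        }

        final List<Candidate> nextZeroCandidates = new ArrayList<Candidate>();
        final List<Candidate> failoverCandidates = new ArrayList<Candidate>();

        // w is the remaining crawl delay as computed by the Latency.waitingRemaining*() methods
        void route(String host, byte[] urlhash, int w, int stackSize) {
            if (w <= 0) {
                nextZeroCandidates.add(new Candidate(host, urlhash, stackSize)); // ready now
            } else {
                failoverCandidates.add(new Candidate(host, urlhash, w)); // not ready yet
            }
        }

        // mirrors the failover loop: only accept hosts whose wait is below the 2000 ms cutoff
        Candidate pickFailover() {
            Candidate best = null;
            for (Candidate c : failoverCandidates) {
                if (c.score > 2000) continue; // too long; the host gets a second chance later
                if (best == null || c.score < best.score) best = c;
            }
            return best;
        }
    }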

BufferedObjectIndex.java
@@ -232,12 +232,18 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
     @Override
     public List<Row.Entry> random(final int count) throws IOException {
-        final List<Row.Entry> list = new ArrayList<Row.Entry>();
+        List<Row.Entry> list0, list1;
         synchronized (this.backend) {
-            List<Row.Entry> list0 = this.buffer.random(count);
-            list.addAll(list0);
-            list0 = this.backend.random(count - list.size());
-            list.addAll(list0);
+            list0 = this.buffer.random(Math.max(1, count / 2));
+            list1 = this.backend.random(count - list0.size());
+        }
+        // multiplex the lists
+        final List<Row.Entry> list = new ArrayList<Row.Entry>();
+        Iterator<Row.Entry> i0 = list0.iterator();
+        Iterator<Row.Entry> i1 = list1.iterator();
+        while (i0.hasNext() || i1.hasNext()) {
+            if (i0.hasNext()) list.add(i0.next());
+            if (i1.hasNext()) list.add(i1.next());
         }
         return list;
     }
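The rewritten random() draws roughly half of the sample from the in-memory buffer and the rest from the backend, then interleaves the two lists instead of concatenating them, so buffered and persisted entries are mixed evenly. A minimal, self-contained sketch of that multiplexing step (generic lists here rather than YaCy's Row.Entry):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;

    // Round-robin merge of two lists, the pattern used to mix buffer and backend samples.
    public class InterleaveSketch {
        static <T> List<T> interleave(List<T> a, List<T> b) {
            List<T> out = new ArrayList<T>(a.size() + b.size());
            Iterator<T> ia = a.iterator();
            Iterator<T> ib = b.iterator();
            while (ia.hasNext() || ib.hasNext()) {
                if (ia.hasNext()) out.add(ia.next());
                if (ib.hasNext()) out.add(ib.next());
            }
            return out;
        }

        public static void main(String[] args) {
            List<String> buffer = Arrays.asList("b1", "b2");
            List<String> backend = Arrays.asList("k1", "k2", "k3");
            // prints [b1, k1, b2, k2, k3]
            System.out.println(interleave(buffer, backend));
        }
    }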
