enhanced strategy in host browser

limit number of fresh hosts in round robin hashes
pull/402/head
Michael Peter Christen 4 years ago
parent 9be36800a4
commit 63f58e4785

@ -114,7 +114,7 @@ public class HostBalancer implements Balancer {
* return immediately (as large unfinished crawls may take longer to load) * return immediately (as large unfinished crawls may take longer to load)
*/ */
private void init(final boolean async) { private void init(final boolean async) {
if(async) { if(async) {
Thread t = new Thread("HostBalancer.init") { Thread t = new Thread("HostBalancer.init") {
@Override @Override
public void run() { public void run() {
@ -123,9 +123,9 @@ public class HostBalancer implements Balancer {
}; };
t.start(); t.start();
} else { } else {
runInit(); runInit();
} }
} }
/** /**
@ -230,11 +230,11 @@ public class HostBalancer implements Balancer {
return c; return c;
} }
/** /**
* @return true when the URL is queued in this or any other HostBalancer * @return true when the URL is queued in this or any other HostBalancer
* instance (as {@link #depthCache} is shared between all HostBalancer * instance (as {@link #depthCache} is shared between all HostBalancer
* instances) * instances)
*/ */
@Override @Override
public boolean has(final byte[] urlhashb) { public boolean has(final byte[] urlhashb) {
if (depthCache.has(urlhashb)) return true; if (depthCache.has(urlhashb)) return true;
@ -331,14 +331,21 @@ public class HostBalancer implements Balancer {
if (size <= 10) {smallStacksExist = true; break smallsearch;} if (size <= 10) {smallStacksExist = true; break smallsearch;}
} }
} }
if (singletonStacksExist || smallStacksExist) { Set<String> freshhosts = new HashSet<>();
Iterator<String> i = this.roundRobinHostHashes.iterator(); Iterator<String> i = this.roundRobinHostHashes.iterator();
smallstacks: while (i.hasNext()) { smallstacks: while (i.hasNext()) {
if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left
String s = i.next(); String hosthash = i.next();
HostQueue hq = this.queues.get(s); HostQueue hq = this.queues.get(hosthash);
if (hq == null) {i.remove(); continue smallstacks;} if (hq == null) {i.remove(); continue smallstacks;}
int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), s, robots, ClientIdentification.yacyInternetCrawlerAgent); int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent);
if (delta == Integer.MIN_VALUE) {
// never-crawled hosts; we do not want to have too many of them in here. Loading new hosts means: waiting for robots.txt to load
freshhosts.add(hosthash);
i.remove();
continue smallstacks;
}
if (singletonStacksExist || smallStacksExist) {
if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things
// to protect all small stacks which have a fast throughput, remove all with long waiting time // to protect all small stacks which have a fast throughput, remove all with long waiting time
if (delta >= 1000) {i.remove(); continue smallstacks;} if (delta >= 1000) {i.remove(); continue smallstacks;}
@ -350,6 +357,10 @@ public class HostBalancer implements Balancer {
} }
} }
} }
// put at least one of the fresh hosts back
if (freshhosts.size() > 0) this.roundRobinHostHashes.add(freshhosts.iterator().next());
// result
if (this.roundRobinHostHashes.size() == 1) { if (this.roundRobinHostHashes.size() == 1) {
if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host"); if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host");
} else { } else {
@ -545,7 +556,7 @@ public class HostBalancer implements Balancer {
@Override @Override
public List<Request> getDomainStackReferences(String host, int maxcount, long maxtime) { public List<Request> getDomainStackReferences(String host, int maxcount, long maxtime) {
if (host == null) { if (host == null) {
return Collections.emptyList(); return Collections.emptyList();
} }
try { try {
HostQueue hq = this.queues.get(DigestURL.hosthash(host, host.startsWith("ftp.") ? 21 : 80)); HostQueue hq = this.queues.get(DigestURL.hosthash(host, host.startsWith("ftp.") ? 21 : 80));

Loading…
Cancel
Save