enhanced crawler speed

pull/1/head
Michael Peter Christen 11 years ago
parent 79809342fa
commit c0da966dfa

@ -26,7 +26,6 @@ import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.Latency.Host;
@ -35,7 +34,7 @@ import net.yacy.server.serverSwitch;
public class latency_p {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
final serverObjects prop = new serverObjects();
//final plasmaSwitchboard sb = (plasmaSwitchboard) env;
@ -43,7 +42,7 @@ public class latency_p {
Map.Entry<String, Host> e;
int c = 0;
Latency.Host host;
ClientIdentification.Agent agent = post == null ? ClientIdentification.yacyInternetCrawlerAgent : ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
//ClientIdentification.Agent agent = post == null ? ClientIdentification.yacyInternetCrawlerAgent : ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
while (i.hasNext()) {
e = i.next();
host = e.getValue();
@ -53,7 +52,7 @@ public class latency_p {
prop.put("domains_" + c + "_count", host.count());
prop.put("domains_" + c + "_average", host.average());
prop.put("domains_" + c + "_robots", host.robotsDelay());
prop.put("domains_" + c + "_flux", host.flux(agent.minimumDelta));
prop.put("domains_" + c + "_flux", 0);
c++;
}
prop.put("domains", c);

@ -156,14 +156,13 @@ public class Latency {
// find the minimum waiting time based on the network domain (local or global)
int waiting = agent.minimumDelta;
// if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting);
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting, host.average() * 3 / 2);
if (agent.minimumDelta > ClientIdentification.minimumLocalDeltaInit) {
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting * 3 / 2, host.average() / 2);
}
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
@ -198,16 +197,17 @@ public class Latency {
boolean local = url.isLocal();
int waiting = agent.minimumDelta;
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if (MultiProtocolURL.isCGI(url.getFileName())) waiting = waiting * 2;
// if we have accessed the domain many times, get slower (the flux factor)
if (!local) waiting += host.flux(waiting);
// use the access latency as rule how fast we can access the server
if (!local) waiting = Math.max(waiting, host.average() * 3 / 2);
if (!local && agent.minimumDelta > ClientIdentification.minimumLocalDeltaInit) {
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if (MultiProtocolURL.isCGI(url.getFileName())) {
waiting = waiting * 3 / 2;
} else {
// use the access latency as rule how fast we can access the server
waiting = Math.max(waiting, host.average() / 2);
}
}
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
@ -237,11 +237,6 @@ public class Latency {
// which creates a lot of unwanted IO on target site
if (MultiProtocolURL.isCGI(url.getFileName())) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
// if we have accessed the domain many times, get slower (the flux factor)
int flux = host.flux(waiting);
waiting += flux;
s.append(", flux = ").append(flux);
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
@ -337,9 +332,6 @@ public class Latency {
/**
 * Accessor for the robots delay stored on this host entry.
 *
 * @return the current value of the {@code robotsMinDelay} field
 *         (presumably the minimum delay requested via robots.txt, in
 *         milliseconds -- TODO confirm against the field declaration)
 */
public long robotsDelay() {
    final long minDelay = this.robotsMinDelay;
    return minDelay;
}
/**
 * Computes the "flux" value for this host: an amount derived from the
 * given range and the number of accesses counted for this host. Callers
 * add the result to a waiting time so that frequently accessed hosts are
 * crawled more slowly.
 *
 * <ul>
 *   <li>counter below 10000: {@code range / (10000 - counter)}</li>
 *   <li>counter at or above 10000:
 *       {@code range * Math.min(5000, counter) / 10000}</li>
 * </ul>
 *
 * All arithmetic is integer arithmetic.
 *
 * @param range the base value (callers pass the current waiting time)
 * @return the flux value computed from {@code range} and the access counter
 */
public int flux(final int range) {
    // Read the counter exactly once. The previous form called count.get()
    // up to three times; if the counter moved from 9999 to 10000 between
    // the comparison and the divisor computation, the divisor became 0 and
    // the division threw an ArithmeticException.
    final int accesses = this.count.get();
    if (accesses >= 10000) {
        // NOTE(review): with accesses >= 10000 the Math.min below always
        // yields 5000; kept as-is to preserve the original result.
        return range * Math.min(5000, accesses) / 10000;
    }
    // accesses < 10000 here, so the divisor is always at least 1
    return range / (10000 - accesses);
}
}
}

Loading…
Cancel
Save