Reverted crawling strategy from latest commit.

pull/1/head
Michael Peter Christen 11 years ago
parent c0da966dfa
commit 77531850b5

@ -741,6 +741,16 @@ crawler.file.maxFileSize=100000000
# maximum number of crawler threads
crawler.MaxActiveThreads = 200
# maximum number of same hosts in crawler threads
crawler.MaxSameHostInQueue = 20
# the default latency is the initial value for the average remote server response time
crawler.defaultAverageLatency = 500
# the latency factor is applied to the average remote server latency;
# the result is the minimum remote server access delay time
crawler.latencyFactor = 0.5
# maximum size of indexing queue
indexer.slots = 100

@ -306,7 +306,7 @@ public class PerformanceQueues_p {
// table thread pool settings
prop.put("pool_0_name","Crawler Pool");
prop.put("pool_0_maxActive", sb.getConfigLong("crawler.MaxActiveThreads", 0));
prop.put("pool_0_maxActive", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 0));
prop.put("pool_0_numActive",sb.crawlQueues.workerSize());
final YaCyHttpServer httpd = sb.getHttpServer();

@ -26,6 +26,7 @@ import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.Latency.Host;
@ -34,7 +35,7 @@ import net.yacy.server.serverSwitch;
public class latency_p {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
final serverObjects prop = new serverObjects();
//final plasmaSwitchboard sb = (plasmaSwitchboard) env;
@ -42,7 +43,7 @@ public class latency_p {
Map.Entry<String, Host> e;
int c = 0;
Latency.Host host;
//ClientIdentification.Agent agent = post == null ? ClientIdentification.yacyInternetCrawlerAgent : ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
ClientIdentification.Agent agent = post == null ? ClientIdentification.yacyInternetCrawlerAgent : ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
while (i.hasNext()) {
e = i.next();
host = e.getValue();
@ -52,7 +53,7 @@ public class latency_p {
prop.put("domains_" + c + "_count", host.count());
prop.put("domains_" + c + "_average", host.average());
prop.put("domains_" + c + "_robots", host.robotsDelay());
prop.put("domains_" + c + "_flux", 0);
prop.put("domains_" + c + "_flux", host.flux(agent.minimumDelta));
c++;
}
prop.put("domains", c);

@ -144,6 +144,27 @@ public class CrawlQueues {
}
return null;
}
/**
 * count the number of crawler worker threads that are currently loading
 * from the given host name; as a side effect, worker threads that have
 * been alive longer than the configured client timeout are interrupted
 * @param host the host name to count (case-sensitive match); may be null or empty
 * @return the number of live, non-stale workers whose request URL has exactly this host,
 *         or 0 if host is null or empty
 */
public int hostcount(final String host) {
if (host == null || host.length() == 0) return 0;
int c = 0;
final int timeout = (int) this.sb.getConfigLong("crawler.clientTimeout", 10000);
for (final Loader worker: this.workers.values()) {
if (worker.isAlive()) {
if (worker.age() > timeout) {
// stale worker: interrupt it instead of counting it; ignore any
// failure of the interrupt (best-effort cleanup, not critical)
try {worker.interrupt();} catch (Throwable e) {}
} else if (host.equals(worker.request.url().getHost())) {
c++;
}
}
}
return c;
}
public void removeURL(final byte[] hash) {
assert hash != null && hash.length == 12;
@ -180,8 +201,8 @@ public class CrawlQueues {
// wait for all workers to finish
final int timeout = (int) this.sb.getConfigLong("crawler.clientTimeout", 10000);
for (final Loader w: this.workers.values()) {
if (w.age() > timeout) {
w.interrupt();
if (w.isAlive() && w.age() > timeout) {
try {w.interrupt();} catch (Throwable e) {}
}
}
}

@ -36,15 +36,22 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.Switchboard;
public class Latency {
private final static int DEFAULT_AVERAGE = 300;
private final static int DEFAULT_AVERAGE_LATENCY = 500;
private final static int DEFAULT_MAX_SAME_HOST_IN_QUEUE = 20;
private final static float DEFAULT_LATENCY_FACTOR = 0.5f;
// the map is a mapping from host names to host configurations
private static final int mapMaxSize = 1000;
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
public static int defaultAverageLatency = DEFAULT_AVERAGE_LATENCY;
public static int MaxSameHostInQueue = DEFAULT_MAX_SAME_HOST_IN_QUEUE;
public static float latencyFactor = DEFAULT_LATENCY_FACTOR;
/**
* update the latency entry after a host was selected for queueing into the loader
@ -57,7 +64,7 @@ public class Latency {
String hosthash = url.hosthash();
Host h = map.get(hosthash);
if (h == null) {
h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay);
h = new Host(host, defaultAverageLatency, robotsCrawlDelay);
if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
map.put(hosthash, h);
}
@ -139,7 +146,6 @@ public class Latency {
/**
* guess a minimum waiting time
* the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low
* also the 'isCGI' property is missing, because the full text of the domain is unknown here
* @param hostname
* @param hosthash
* @param robots
@ -156,12 +162,16 @@ public class Latency {
// find the minimum waiting time based on the network domain (local or global)
int waiting = agent.minimumDelta;
if (agent.minimumDelta > ClientIdentification.minimumLocalDeltaInit) {
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting * 3 / 2, host.average() / 2);
}
// if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting);
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting, (int) (host.average() * latencyFactor));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > MaxSameHostInQueue) waiting += 5000;
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
@ -197,17 +207,14 @@ public class Latency {
boolean local = url.isLocal();
int waiting = agent.minimumDelta;
if (!local && agent.minimumDelta > ClientIdentification.minimumLocalDeltaInit) {
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if (MultiProtocolURL.isCGI(url.getFileName())) {
waiting = waiting * 3 / 2;
} else {
// use the access latency as rule how fast we can access the server
waiting = Math.max(waiting, host.average() / 2);
}
}
// if we have accessed the domain many times, get slower (the flux factor)
if (!local) waiting += host.flux(waiting);
// use the access latency as rule how fast we can access the server
waiting = Math.max(waiting, (int) (host.average() * latencyFactor));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > MaxSameHostInQueue) waiting += 5000;
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
@ -226,22 +233,33 @@ public class Latency {
final Host host = host(url);
if (host == null) return "host " + host + " never accessed before -> Integer.MIN_VALUE"; // no delay if host is new
// find the minimum waiting time based on the network domain (local or global)
boolean local = url.isLocal();
final StringBuilder s = new StringBuilder(50);
// find the minimum waiting time based on the network domain (local or global)
int waiting = agent.minimumDelta;
s.append("minimumDelta = ").append(waiting);
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if (MultiProtocolURL.isCGI(url.getFileName())) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
// if we have accessed the domain many times, get slower (the flux factor)
if (!local) {
int flux = host.flux(waiting);
waiting += flux;
s.append(", flux = ").append(flux);
}
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
s.append(", host.average = ").append(host.average());
waiting = Math.max(waiting, host.average() * 3 / 2);
waiting = Math.max(waiting, (int) (host.average() * latencyFactor));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost());
if (hostcount > MaxSameHostInQueue) {
s.append(", hostcount = ").append(hostcount);
waiting += 5000;
}
// find the delay as given by robots.txt on target site
int robotsDelay = waitingRobots(url, robots, agent);
@ -332,6 +350,9 @@ public class Latency {
public long robotsDelay() {
return this.robotsMinDelay;
}
/**
 * Compute the flux delay for this host: the more often the host has been
 * accessed (see the access counter), the larger the share of the given
 * range that is returned as additional waiting time.
 * @param range the base waiting time the flux is derived from
 * @return the additional waiting time in milliseconds
 */
public int flux(final int range) {
    final int accessCount = this.count.get();
    // heavily accessed hosts (>= 10000 hits) get a capped proportional share;
    // lightly accessed hosts get a small fraction that grows with the counter
    return accessCount >= 10000
            ? range * Math.min(5000, accessCount) / 10000
            : range / (10000 - accessCount);
}
}
}

@ -126,6 +126,7 @@ import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultImages;
import net.yacy.crawler.data.ResultURLs;
@ -824,6 +825,9 @@ public final class Switchboard extends serverSwitch {
getDataPath());
OAIListFriendsLoader.init(this.loader, oaiFriends, ClientIdentification.yacyInternetCrawlerAgent);
this.crawlQueues = new CrawlQueues(this, this.queuesRoot);
Latency.defaultAverageLatency = this.getConfigInt("crawler.defaultAverageLatency", 500);
Latency.latencyFactor = this.getConfigFloat("crawler.latencyFactor", 0.5f);
Latency.MaxSameHostInQueue = this.getConfigInt("crawler.MaxSameHostInQueue", 20);
// on startup, resume all crawls
setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused", "false");

@ -332,6 +332,7 @@ public final class SwitchboardConstants {
* <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
*/
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
public static final String CRAWLER_SAME_HOST_IN_QUEUE = "crawler.MaxSameHostInQueue";
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store

Loading…
Cancel
Save