made number of open files in crawler configurable and increased default

maximum number of open files from 100 to 1000. This number can be
changed with the attribute crawler.onDemandLimit
pull/1/head
orbiter 11 years ago
parent 20cffa34bf
commit d7d38f9135

@ -772,6 +772,14 @@ crawler.defaultAverageLatency = 500
# The result is the minimum remote server access delay time
crawler.latencyFactor = 0.5
# The onDemandLimit is the maximum number of crawl queues that are concurrently opened
# at the same time. If the number of hosts exceeds this number, onDemand queues are used
# instead; these are re-opened each time a queue is accessed, which creates high IO load. On the other
# hand, having too many entries in onDemandLimit may exceed the maximum number of file
# pointers. You can increase this number in /proc/sys/fs/file-max and adapt it to the number
# defined here
crawler.onDemandLimit = 1000
# maximum size of indexing queue
indexer.slots = 100

@ -68,11 +68,14 @@ public class HostBalancer implements Balancer {
private final boolean exceed134217727;
private final Map<String, HostQueue> queues;
private final Set<String> roundRobinHostHashes;
private final int onDemandLimit;
public HostBalancer(
final File hostsPath,
final int onDemandLimit,
final boolean exceed134217727) {
this.hostsPath = hostsPath;
this.onDemandLimit = onDemandLimit;
this.exceed134217727 = exceed134217727;
// create a stack for newly entered entries
@ -81,7 +84,7 @@ public class HostBalancer implements Balancer {
String[] list = this.hostsPath.list();
for (String address: list) try {
File queuePath = new File(this.hostsPath, address);
HostQueue queue = new HostQueue(queuePath, this.queues.size() > 100, this.exceed134217727);
HostQueue queue = new HostQueue(queuePath, this.queues.size() > this.onDemandLimit, this.exceed134217727);
if (queue.size() == 0) {
queue.close();
queuePath.delete();
@ -210,7 +213,7 @@ public class HostBalancer implements Balancer {
synchronized (this) {
HostQueue queue = this.queues.get(hosthash);
if (queue == null) {
queue = new HostQueue(this.hostsPath, entry.url().getHost(), entry.url().getPort(), this.queues.size() > 100, this.exceed134217727);
queue = new HostQueue(this.hostsPath, entry.url().getHost(), entry.url().getPort(), this.queues.size() > this.onDemandLimit, this.exceed134217727);
this.queues.put(hosthash, queue);
// profile might be null when continue crawls after YaCy restart
robots.ensureExist(entry.url(), profile == null ? ClientIdentification.yacyInternetCrawlerAgent : profile.getAgent(), true); // concurrently load all robots.txt

@ -87,7 +87,7 @@ public class CrawlQueues {
// start crawling management
log.config("Starting Crawling Management");
this.noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727);
this.noticeURL = new NoticedURL(queuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), sb.exceed134217727);
this.errorURL = new ErrorCache(sb.index.fulltext());
this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();
}
@ -104,7 +104,7 @@ public class CrawlQueues {
this.errorURL.clearCache();
this.remoteCrawlProviderHashes.clear();
this.noticeURL.close();
this.noticeURL = new NoticedURL(newQueuePath, this.sb.useTailCache, this.sb.exceed134217727);
this.noticeURL = new NoticedURL(newQueuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), this.sb.exceed134217727);
this.delegatedURL.clear();
}

@ -60,13 +60,13 @@ public class NoticedURL {
protected NoticedURL(
final File cachePath,
@SuppressWarnings("unused") final boolean useTailCache,
final int onDemandLimit,
final boolean exceed134217727) {
ConcurrentLog.info("NoticedURL", "CREATING STACKS at " + cachePath.toString());
this.coreStack = new HostBalancer(new File(cachePath, "CrawlerCoreStacks"), exceed134217727);
this.limitStack = new HostBalancer(new File(cachePath, "CrawlerLimitStacks"), exceed134217727);
this.remoteStack = new HostBalancer(new File(cachePath, "CrawlerRemoteStacks"), exceed134217727);
this.noloadStack = new HostBalancer(new File(cachePath, "CrawlerNoLoadStacks"), exceed134217727);
this.coreStack = new HostBalancer(new File(cachePath, "CrawlerCoreStacks"), onDemandLimit, exceed134217727);
this.limitStack = new HostBalancer(new File(cachePath, "CrawlerLimitStacks"), onDemandLimit, exceed134217727);
this.remoteStack = new HostBalancer(new File(cachePath, "CrawlerRemoteStacks"), onDemandLimit, exceed134217727);
this.noloadStack = new HostBalancer(new File(cachePath, "CrawlerNoLoadStacks"), onDemandLimit, exceed134217727);
}
public void clear() {

Loading…
Cancel
Save