Enhanced concurrent loading by using a fixed set of concurrent loader
worker threads instead of throwaway threads. The control mechanism now
reports a 'queue full' message to the busy loop less often, so the loop
no longer falls into long busy-waiting; instead all requests are queued
and new loader threads are started as needed, up to the given limit
(as set before).
pull/1/head
Michael Peter Christen 11 years ago
parent d8f7f47eff
commit bcd9dd9e1d
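For orientation, the pattern introduced below in CrawlQueues can be read as: a bounded blocking queue (workerQueue) feeds a fixed array of worker threads that are created lazily by ensureLoaderRunning() and shut down with one POISON_REQUEST per worker slot. The following is a minimal, self-contained sketch of that pattern, not the committed code itself: the class and member names (LoaderPoolSketch, submit, drainQueue, POISON) are illustrative only, the queue capacity and worker limit are hard-coded instead of being read from CRAWLER_THREADS_ACTIVE_MAX, and unlike the real ensureLoaderRunning() the sketch does not check for an already idle worker before filling the next free slot.

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

// Sketch of a fixed worker pool fed by a bounded queue, shut down with poison entries.
public class LoaderPoolSketch {

    // Marker object that tells a worker thread to terminate its loop.
    private static final Runnable POISON = () -> {};

    private final Thread[] worker;                 // fixed set of worker slots
    private final BlockingQueue<Runnable> queue;   // bounded request queue

    public LoaderPoolSketch(final int maxWorkers, final int queueCapacity) {
        this.worker = new Thread[maxWorkers];
        this.queue = new ArrayBlockingQueue<Runnable>(queueCapacity);
    }

    // Enqueue a task; blocks when the queue is full instead of spawning another thread.
    public void submit(final Runnable task) throws InterruptedException {
        ensureWorkerRunning();
        this.queue.put(task);
    }

    // Start a worker in the first free (or dead) slot, up to the fixed limit.
    private synchronized void ensureWorkerRunning() {
        for (int i = 0; i < this.worker.length; i++) {
            if (this.worker[i] == null || !this.worker[i].isAlive()) {
                this.worker[i] = new Thread(this::drainQueue, "loader-" + i);
                this.worker[i].start();
                return;
            }
        }
    }

    // Worker loop: take tasks until the poison marker arrives.
    private void drainQueue() {
        try {
            Runnable task;
            while ((task = this.queue.take()) != POISON) {
                task.run();
            }
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    // Offer one poison entry per slot, then wait briefly for each worker to finish.
    public void close() throws InterruptedException {
        for (int i = 0; i < this.worker.length; i++) {
            this.queue.put(POISON);
        }
        for (final Thread w : this.worker) {
            if (w != null && w.isAlive()) w.join(1000);
        }
    }

    public static void main(final String[] args) throws InterruptedException {
        final LoaderPoolSketch pool = new LoaderPoolSketch(4, 200);
        for (int i = 0; i < 10; i++) {
            final int n = i;
            pool.submit(() -> System.out.println("loaded request " + n));
        }
        pool.close();
    }
}

The effect mirrors the commit: a full queue is reported (or blocks) instead of spawning an unbounded number of short-lived threads, and the poison entries let close() end each worker after it finishes its current task rather than interrupting it mid-load.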

@@ -27,7 +27,10 @@
// javac -classpath .:../classes IndexCreate_p.java // javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT // if the shell's current path is HTROOT
import java.util.Map;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;
@@ -42,16 +45,15 @@ public class IndexCreateLoaderQueue_p {
final Switchboard sb = (Switchboard) env; final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
Map<DigestURL, Request> map = sb.crawlQueues.activeWorkerEntries();
if (sb.crawlQueues.workerSize() == 0) { if (map.size() == 0) {
prop.put("loader-set", "0"); prop.put("loader-set", "0");
} else { } else {
prop.put("loader-set", "1"); prop.put("loader-set", "1");
boolean dark = true; boolean dark = true;
final Request[] w = sb.crawlQueues.activeWorkerEntries();
Seed initiator; Seed initiator;
int count = 0; int count = 0;
for (Request element : w) { for (Request element : map.values()) {
if (element == null) continue; if (element == null) continue;
initiator = sb.peers.getConnected((element.initiator() == null) ? "" : ASCII.String(element.initiator())); initiator = sb.peers.getConnected((element.initiator() == null) ? "" : ASCII.String(element.initiator()));

@@ -305,7 +305,7 @@ public class PerformanceQueues_p {
// table thread pool settings // table thread pool settings
prop.put("pool_0_name","Crawler Pool"); prop.put("pool_0_name","Crawler Pool");
prop.put("pool_0_maxActive", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 0)); prop.put("pool_0_maxActive", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 0));
prop.put("pool_0_numActive",sb.crawlQueues.workerSize()); prop.put("pool_0_numActive", sb.crawlQueues.activeWorkerEntries().size());
final YaCyHttpServer httpd = sb.getHttpServer(); final YaCyHttpServer httpd = sb.getHttpServer();
prop.put("pool_1_name", "httpd Session Pool"); prop.put("pool_1_name", "httpd Session Pool");

@@ -29,7 +29,6 @@
import java.net.InetAddress; import java.net.InetAddress;
import java.util.Date; import java.util.Date;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.Memory; import net.yacy.cora.util.Memory;
@@ -337,7 +336,7 @@ public class Status
prop.putNum("connectionsMax", httpd.getMaxSessionCount()); prop.putNum("connectionsMax", httpd.getMaxSessionCount());
// Queue information // Queue information
final int loaderJobCount = sb.crawlQueues.workerSize(); final int loaderJobCount = sb.crawlQueues.activeWorkerEntries().size();
final int loaderMaxCount = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10); final int loaderMaxCount = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
final int loaderPercent = (loaderMaxCount == 0) ? 0 : loaderJobCount * 100 / loaderMaxCount; final int loaderPercent = (loaderMaxCount == 0) ? 0 : loaderJobCount * 100 / loaderMaxCount;
prop.putNum("loaderQueueSize", loaderJobCount); prop.putNum("loaderQueueSize", loaderJobCount);

@@ -93,7 +93,7 @@ public class status_p {
prop.putNum("rwipublictextSegmentCount", segment.RWISegmentCount()); prop.putNum("rwipublictextSegmentCount", segment.RWISegmentCount());
// loader queue // loader queue
prop.putNum("loaderSize", sb.crawlQueues.workerSize()); prop.putNum("loaderSize", sb.crawlQueues.activeWorkerEntries().size());
prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)); prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10));
//local crawl queue //local crawl queue

@@ -38,6 +38,7 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.order.NaturalOrder;
@@ -596,8 +597,8 @@ public final class CrawlSwitchboard {
if (deletionCandidate.size() == 0) return new HashSet<String>(0); if (deletionCandidate.size() == 0) return new HashSet<String>(0);
} }
// look into the CrawlQueues.worker as well // look into the CrawlQueues.worker as well
Request[] requests = switchboard.crawlQueues.activeWorkerEntries(); Map<DigestURL, Request> map = switchboard.crawlQueues.activeWorkerEntries();
for (Request request: requests) { for (Request request: map.values()) {
deletionCandidate.remove(request.profileHandle()); deletionCandidate.remove(request.profileHandle());
} }
} catch (final Throwable e) { } catch (final Throwable e) {
@@ -610,7 +611,7 @@ public final class CrawlSwitchboard {
public boolean allCrawlsFinished(CrawlQueues crawlQueues) { public boolean allCrawlsFinished(CrawlQueues crawlQueues) {
if (!crawlQueues.noticeURL.isEmpty()) return false; if (!crawlQueues.noticeURL.isEmpty()) return false;
// look into the CrawlQueues.worker as well // look into the CrawlQueues.worker as well
if (switchboard.crawlQueues.activeWorkerEntries().length > 0) return false; if (switchboard.crawlQueues.activeWorkerEntries().size() > 0) return false;
return true; return true;
} }

@@ -31,8 +31,10 @@ import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
@@ -62,10 +64,12 @@ import net.yacy.search.index.ErrorCache;
public class CrawlQueues { public class CrawlQueues {
private final static Request POISON_REQUEST = new Request();
private final static ConcurrentLog log = new ConcurrentLog("CRAWLER"); private final static ConcurrentLog log = new ConcurrentLog("CRAWLER");
private Switchboard sb; private final Switchboard sb;
private Map<Integer, Loader> workers; // mapping from url hash to Worker thread object private final Loader[] worker;
private final ArrayBlockingQueue<Request> workerQueue;
private final ArrayList<String> remoteCrawlProviderHashes; private final ArrayList<String> remoteCrawlProviderHashes;
public NoticedURL noticeURL; public NoticedURL noticeURL;
@@ -74,7 +78,9 @@ public class CrawlQueues {
public CrawlQueues(final Switchboard sb, final File queuePath) { public CrawlQueues(final Switchboard sb, final File queuePath) {
this.sb = sb; this.sb = sb;
this.workers = new ConcurrentHashMap<Integer, Loader>(); final int maxWorkers = (int) sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
this.worker = new Loader[maxWorkers];
this.workerQueue = new ArrayBlockingQueue<Request>(200);
this.remoteCrawlProviderHashes = new ArrayList<String>(); this.remoteCrawlProviderHashes = new ArrayList<String>();
// start crawling management // start crawling management
@@ -82,12 +88,12 @@ public class CrawlQueues {
this.noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727); this.noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727);
this.errorURL = new ErrorCache(sb.index.fulltext()); this.errorURL = new ErrorCache(sb.index.fulltext());
this.delegatedURL = new ConcurrentHashMap<String, DigestURL>(); this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();
} }
public void relocate(final File newQueuePath) { public void relocate(final File newQueuePath) {
close(); close();
this.workers = new ConcurrentHashMap<Integer, Loader>();
this.remoteCrawlProviderHashes.clear(); this.remoteCrawlProviderHashes.clear();
this.noticeURL = new NoticedURL(newQueuePath, this.sb.useTailCache, this.sb.exceed134217727); this.noticeURL = new NoticedURL(newQueuePath, this.sb.useTailCache, this.sb.exceed134217727);
@@ -97,25 +103,28 @@ public class CrawlQueues {
public synchronized void close() { public synchronized void close() {
// wait for all workers to finish // wait for all workers to finish
for (final Loader w: this.workers.values()) { for (int i = 0; i < this.worker.length; i++) {
w.interrupt(); try {this.workerQueue.put(POISON_REQUEST);} catch (InterruptedException e) {}
} }
for (final Loader w: this.workers.values()) { for (final Loader w: this.worker) {
if (w != null && w.isAlive()) {
try { try {
w.join(); w.join(1000);
if (w.isAlive()) w.interrupt();
} catch (final InterruptedException e) { } catch (final InterruptedException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
}
this.noticeURL.close(); this.noticeURL.close();
this.delegatedURL.clear(); this.delegatedURL.clear();
} }
public void clear() { public void clear() {
// wait for all workers to finish // wait for all workers to finish
for (final Loader w: this.workers.values()) w.interrupt(); this.workerQueue.clear();
for (final Loader w: this.workers.values()) try {w.join(10);} catch (final InterruptedException e1) {} for (final Loader w: this.worker) w.interrupt();
this.workers.clear(); for (final Loader w: this.worker) try {w.join(10);} catch (final InterruptedException e1) {}
this.remoteCrawlProviderHashes.clear(); this.remoteCrawlProviderHashes.clear();
this.noticeURL.clear(); this.noticeURL.clear();
this.delegatedURL.clear(); this.delegatedURL.clear();
@@ -135,9 +144,9 @@ public class CrawlQueues {
} }
//if (this.noticeURL.existsInStack(hash)) { //if (this.noticeURL.existsInStack(hash)) {
// return HarvestProcess.CRAWLER; // return HarvestProcess.CRAWLER;
//} // this is disabled because it prevents propert crawling of smb shares. The cause is unknown //} // this is disabled because it prevents proper crawling of smb shares. The cause is unknown
for (final Loader worker: this.workers.values()) { for (final Request request: activeWorkerEntries().values()) {
if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) { if (Base64Order.enhancedCoder.equal(request.url().hash(), hash)) {
return HarvestProcess.WORKER; return HarvestProcess.WORKER;
} }
} }
@@ -152,16 +161,11 @@ public class CrawlQueues {
public int hostcount(final String host) { public int hostcount(final String host) {
if (host == null || host.length() == 0) return 0; if (host == null || host.length() == 0) return 0;
int c = 0; int c = 0;
final int timeout = (int) this.sb.getConfigLong("crawler.clientTimeout", 10000); for (final DigestURL url: activeWorkerEntries().keySet()) {
for (final Loader worker: this.workers.values()) { if (host.equals(url.getHost())) {
if (worker.isAlive()) {
if (worker.age() > timeout) {
try {worker.interrupt();} catch (Throwable e) {}
} else if (host.equals(worker.request.url().getHost())) {
c++; c++;
} }
} }
}
return c; return c;
} }
@@ -180,9 +184,9 @@ public class CrawlQueues {
if (u != null) { if (u != null) {
return u; return u;
} }
for (final Loader w: this.workers.values()) { for (final DigestURL url: activeWorkerEntries().keySet()) {
if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) { if (Base64Order.enhancedCoder.equal(url.hash(), urlhash)) {
return w.request.url(); return url;
} }
} }
final Request ne = this.noticeURL.get(urlhash); final Request ne = this.noticeURL.get(urlhash);
@@ -192,16 +196,6 @@ public class CrawlQueues {
return null; return null;
} }
public void cleanup() {
// wait for all workers to finish
final int timeout = (int) this.sb.getConfigLong("crawler.clientTimeout", 10000);
for (final Loader w: this.workers.values()) {
if (w.isAlive() && w.age() > timeout) {
try {w.interrupt();} catch (Throwable e) {}
}
}
}
public void freemem() { public void freemem() {
if ((this.errorURL.stackSize() > 1)) { if ((this.errorURL.stackSize() > 1)) {
log.warn("freemem: Cleaning Error-URLs report stack, " log.warn("freemem: Cleaning Error-URLs report stack, "
@@ -211,17 +205,16 @@ public class CrawlQueues {
} }
} }
public Request[] activeWorkerEntries() { public Map<DigestURL, Request> activeWorkerEntries() {
synchronized (this.workers) { synchronized (this.worker) {
final Request[] e = new Request[this.workers.size()]; Map<DigestURL, Request> map = new HashMap<DigestURL, Request>();
int i = 0; for (final Loader w: this.worker) {
for (final Loader w: this.workers.values()) { if (w != null) {
if (i >= e.length) { Request r = w.loading();
break; if (r != null) map.put(r.url(), r);
} }
e[i++] = w.request;
} }
return e; return map;
} }
} }
@@ -343,18 +336,15 @@ public class CrawlQueues {
if (urlEntry == null || urlEntry.url() == null) { if (urlEntry == null || urlEntry.url() == null) {
CrawlQueues.log.info(stats + ": urlEntry = null"); CrawlQueues.log.info(stats + ": urlEntry = null");
} else { } else {
if (!this.workers.containsKey(Integer.valueOf(urlEntry.hashCode()))) { if (!activeWorkerEntries().containsKey(urlEntry.url())) {
Loader loader = new Loader(urlEntry);
this.workers.put(loader.code, loader);
try { try {
loader.start(); ensureLoaderRunning();
} catch (final OutOfMemoryError e) { this.workerQueue.put(urlEntry);
ConcurrentLog.warn("CrawlQueues", "crawlWorker sequential fail-over: " + e.getMessage()); } catch (InterruptedException e) {
loader.run(); ConcurrentLog.logException(e);
} }
} }
} }
} else { } else {
CrawlQueues.log.severe("Unsupported protocol in URL '" + url.toString()); CrawlQueues.log.severe("Unsupported protocol in URL '" + url.toString());
} }
@@ -395,15 +385,9 @@ public class CrawlQueues {
return "stack is empty"; return "stack is empty";
} }
// check the worker threads
final int maxWorkers = (int) this.sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
if (this.workers.size() >= maxWorkers) {
// too many worker threads, try a cleanup
cleanup();
}
// check again // check again
if (this.workers.size() >= maxWorkers) { if (this.workerQueue.remainingCapacity() == 0) {
return "too many workers active: " + this.workers.size(); return "too many workers active: " + this.workerQueue.size();
} }
final String cautionCause = this.sb.onlineCaution(); final String cautionCause = this.sb.onlineCaution();
@@ -426,14 +410,10 @@ public class CrawlQueues {
return false; return false;
} }
if (this.workers.size() >= this.sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 20)) {
// try a cleanup
cleanup();
}
// check again // check again
if (this.workers.size() >= this.sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 20)) { if (this.workerQueue.remainingCapacity() == 0) {
if (CrawlQueues.log.isFine()) { if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.workers.size() + "), httpClients = " + ConnectionInfo.getCount()); CrawlQueues.log.fine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (" + "workerQueue=" + this.workerQueue.size() + "), httpClients = " + ConnectionInfo.getCount());
} }
return false; return false;
} }
@@ -609,86 +589,93 @@ public class CrawlQueues {
} }
} }
public int workerSize() { private void ensureLoaderRunning() {
return this.workers.size(); // check if there is at least one loader available
for (int i = 0; i < this.worker.length; i++) {
if (this.worker[i] == null || !this.worker[i].isAlive()) {
this.worker[i] = new Loader();
this.worker[i].start();
return;
}
if (this.worker[i].loading() == null) return;
}
} }
private final class Loader extends Thread { private final class Loader extends Thread {
private Request request; private Request request = null;
private final Integer code; private Loader() {
private final long start;
private final CrawlProfile profile;
private Loader(final Request entry) {
this.start = System.currentTimeMillis();
this.request = entry;
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
this.setName("CrawlQueues.Loader(" + entry.url() + ")");
} }
private long age() { public Request loading() {
return System.currentTimeMillis() - this.start; return request;
} }
@Override @Override
public void run() { public void run() {
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
try {
while ((request = CrawlQueues.this.workerQueue.take()) != POISON_REQUEST) {
request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.setName("CrawlQueues.Loader(" + request.url() + ")");
CrawlProfile profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
try { try {
// checking robots.txt for http(s) resources // checking robots.txt for http(s) resources
this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED); request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
RobotsTxtEntry robotsEntry; RobotsTxtEntry robotsEntry;
if ((this.request.url().getProtocol().equals("http") || this.request.url().getProtocol().equals("https")) && if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) &&
(robotsEntry = CrawlQueues.this.sb.robots.getEntry(this.request.url(), this.profile.getAgent())) != null && (robotsEntry = CrawlQueues.this.sb.robots.getEntry(request.url(), profile.getAgent())) != null &&
robotsEntry.isDisallowed(this.request.url())) { robotsEntry.isDisallowed(request.url())) {
//if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt."); //if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.FINAL_ROBOTS_RULE, "denied by robots.txt", -1); CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.FINAL_ROBOTS_RULE, "denied by robots.txt", -1);
this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED); request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
} else { } else {
// starting a load from the internet // starting a load from the internet
this.request.setStatus("worker-loading", WorkflowJob.STATUS_RUNNING); request.setStatus("worker-loading", WorkflowJob.STATUS_RUNNING);
String result = null; String result = null;
// load a resource and push queue entry to switchboard queue // load a resource and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred // returns null if everything went fine, a fail reason string if a problem occurred
try { try {
this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING); request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
final Response response = CrawlQueues.this.sb.loader.load(this.request, profile == null ? CacheStrategy.IFEXIST : profile.cacheStrategy(), BlacklistType.CRAWLER, this.profile.getAgent()); final Response response = CrawlQueues.this.sb.loader.load(request, profile == null ? CacheStrategy.IFEXIST : profile.cacheStrategy(), BlacklistType.CRAWLER, profile.getAgent());
if (response == null) { if (response == null) {
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED); request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.log.isFine()) { if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("problem loading " + this.request.url().toString() + ": no content (possibly caused by cache policy)"); CrawlQueues.log.fine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
} }
result = "no content (possibly caused by cache policy)"; result = "no content (possibly caused by cache policy)";
} else { } else {
this.request.setStatus("loaded", WorkflowJob.STATUS_RUNNING); request.setStatus("loaded", WorkflowJob.STATUS_RUNNING);
final String storedFailMessage = CrawlQueues.this.sb.toIndexer(response); final String storedFailMessage = CrawlQueues.this.sb.toIndexer(response);
this.request.setStatus("enqueued-" + ((storedFailMessage == null) ? "ok" : "fail"), WorkflowJob.STATUS_FINISHED); request.setStatus("enqueued-" + ((storedFailMessage == null) ? "ok" : "fail"), WorkflowJob.STATUS_FINISHED);
result = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage; result = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
} }
} catch (final IOException e) { } catch (final IOException e) {
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED); request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.log.isFine()) { if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("problem loading " + this.request.url().toString() + ": " + e.getMessage()); CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
} }
result = "load error - " + e.getMessage(); result = "load error - " + e.getMessage();
} }
if (result != null) { if (result != null) {
CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1); CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED); request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
} else { } else {
this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED); request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
} }
} }
} catch (final Exception e) { } catch (final Exception e) {
CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, e.getMessage() + " - in worker", -1); CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, e.getMessage() + " - in worker", -1);
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED); request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);
} finally { } finally {
CrawlQueues.this.workers.remove(this.code); request = null;
}
}
} catch (InterruptedException e2) {
ConcurrentLog.logException(e2);
} }
} }
} }

@@ -96,6 +96,24 @@ public class Request extends WorkflowJob
private String statusMessage; private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
public Request() {
// used only to create poison entries
this.initiator = null;
this.url = null;
this.refhash = null;
this.name = null;
this.appdate = 0;
this.profileHandle = null;
this.depth = 0;
this.anchors = 0;
this.forkfactor = 0;
this.flags = null;
this.statusMessage = null;
this.initialHash = 0;
this.status = 0;
this.size = 0;
}
/** /**
* convenience method for 'full' request object * convenience method for 'full' request object
* *

@@ -1698,7 +1698,7 @@ public final class Switchboard extends serverSwitch {
*/ */
public boolean cleanProfiles() throws InterruptedException { public boolean cleanProfiles() throws InterruptedException {
if (getIndexingProcessorsQueueSize() > 0 || if (getIndexingProcessorsQueueSize() > 0 ||
this.crawlQueues.workerSize() > 0 || this.crawlQueues.activeWorkerEntries().size() > 0 ||
this.crawlQueues.coreCrawlJobSize() > 0 || this.crawlQueues.coreCrawlJobSize() > 0 ||
this.crawlQueues.limitCrawlJobSize() > 0 || this.crawlQueues.limitCrawlJobSize() > 0 ||
this.crawlQueues.remoteTriggeredCrawlJobSize() > 0 || this.crawlQueues.remoteTriggeredCrawlJobSize() > 0 ||
@@ -2073,9 +2073,6 @@ public final class Switchboard extends serverSwitch {
} }
} }
// cleanup crawl loader jobs
this.crawlQueues.cleanup();
// refresh recrawl dates // refresh recrawl dates
try { try {
CrawlProfile selentry; CrawlProfile selentry;
