From 9bfb2641dbaf35eea53a50c90eb9d495e922face Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 1 Apr 2009 20:13:57 +0000 Subject: [PATCH] - removed deprecated threads - added automatic http client reset. this was necessary because excessive intranet crawling caused deadlocks. this hack solved the problem. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5768 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/CrawlQueues.java | 8 ++- source/de/anomic/crawler/CrawlStacker.java | 8 +-- source/de/anomic/crawler/Latency.java | 2 +- source/de/anomic/http/httpClient.java | 51 +++++++++++-------- source/de/anomic/kelondro/text/IndexCell.java | 1 + .../de/anomic/plasma/plasmaSwitchboard.java | 12 ----- .../plasma/plasmaSwitchboardConstants.java | 24 --------- 7 files changed, 42 insertions(+), 64 deletions(-) diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index eb4f84819..310a6c726 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -36,6 +36,7 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import de.anomic.http.httpClient; import de.anomic.kelondro.table.FlexWidthArray; import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.DateFormatter; @@ -571,7 +572,8 @@ public class CrawlQueues { 1, "denied by robots.txt"); eentry.store(); - errorURL.push(eentry); + errorURL.push(eentry); + this.entry.setStatus("worker-disallowed", serverProcessorJob.STATUS_FINISHED); } else { // starting a load from the internet this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING); @@ -585,6 +587,7 @@ public class CrawlQueues { "cannot load: " + result); eentry.store(); errorURL.push(eentry); + this.entry.setStatus("worker-error", serverProcessorJob.STATUS_FINISHED); } else { this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED); } @@ -599,9 +602,10 @@ public class CrawlQueues { eentry.store(); errorURL.push(eentry); e.printStackTrace(); + httpClient.initConnectionManager(); + this.entry.setStatus("worker-exception", serverProcessorJob.STATUS_FINISHED); } finally { workers.remove(code); - this.entry.setStatus("worker-finalized", serverProcessorJob.STATUS_FINISHED); } } diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index b64375dda..5a807d1ca 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -289,25 +289,25 @@ public final class CrawlStacker { //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry); //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); - this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT)); + //this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT)); } else if (local) { if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); + //this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); } else if (proxy) { if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); + //this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); } else if (remote) { //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry); //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); - this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE)); + //this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE)); } return null; diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java index 2e2bc57fc..b24314cf0 100644 --- a/source/de/anomic/crawler/Latency.java +++ b/source/de/anomic/crawler/Latency.java @@ -185,7 +185,7 @@ public class Latency { } public void slowdown() { this.lastacc = System.currentTimeMillis(); - this.timeacc = Math.min(60000, average() * 5); + this.timeacc = Math.min(60000, average() * 2); this.count = 1; } public int count() { diff --git a/source/de/anomic/http/httpClient.java b/source/de/anomic/http/httpClient.java index 91ab16a4c..a1de3bfe7 100644 --- a/source/de/anomic/http/httpClient.java +++ b/source/de/anomic/http/httpClient.java @@ -76,8 +76,8 @@ public class httpClient { * "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency." * (Concurrent execution of HTTP methods, http://hc.apache.org/httpclient-3.x/performance.html) */ - private final static MultiThreadedHttpConnectionManager conManager = new MultiThreadedHttpConnectionManager(); - private final static HttpClient apacheHttpClient = new HttpClient(conManager); + private static MultiThreadedHttpConnectionManager conManager = null; + private static HttpClient apacheHttpClient = null; // last ; must be before location (this is parsed) private final static String jakartaUserAgent = " " + @@ -87,25 +87,8 @@ public class httpClient { /** * set options for client */ - // simple user agent - setUserAgent("yacy (www.yacy.net; " + getSystemOST() + ")"); - // only one retry - apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, - new DefaultHttpMethodRetryHandler(1, false)); - /** - * set options for connection manager - */ - // conManager.getParams().setDefaultMaxConnectionsPerHost(4); // default 2 - HostConfiguration localHostConfiguration = new HostConfiguration(); - conManager.getParams().setMaxTotalConnections(200); // Proxy may need many connections - conManager.getParams().setConnectionTimeout(60000); // set a default timeout - conManager.getParams().setDefaultMaxConnectionsPerHost(3); // prevent DoS by mistake - localHostConfiguration.setHost("localhost"); - conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); - localHostConfiguration.setHost("127.0.0.1"); - conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); - // TODO should this be configurable? - + initConnectionManager(); + // accept self-signed or untrusted certificates Protocol.registerProtocol("https", new Protocol("https", (ProtocolSocketFactory) new AcceptEverythingSSLProtcolSocketFactory(), 443)); @@ -125,6 +108,32 @@ public class httpClient { System.setProperty("sun.net.client.defaultReadTimeout", "60000"); } + public static void initConnectionManager() { + MultiThreadedHttpConnectionManager.shutdownAll(); + conManager = new MultiThreadedHttpConnectionManager(); + apacheHttpClient = new HttpClient(conManager); + + /** + * set options for connection manager + */ + // conManager.getParams().setDefaultMaxConnectionsPerHost(4); // default 2 + HostConfiguration localHostConfiguration = new HostConfiguration(); + conManager.getParams().setMaxTotalConnections(200); // Proxy may need many connections + conManager.getParams().setConnectionTimeout(60000); // set a default timeout + conManager.getParams().setDefaultMaxConnectionsPerHost(10); + localHostConfiguration.setHost("localhost"); + conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); + localHostConfiguration.setHost("127.0.0.1"); + conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); + + // only one retry + apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, + new DefaultHttpMethodRetryHandler(1, false)); + // simple user agent + setUserAgent("yacy (www.yacy.net; " + getSystemOST() + ")"); + + } + /** * every x milliseconds do a cleanup (close old connections) * diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index 640a1d4c8..9f6aac231 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -284,6 +284,7 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn // clean-up the cache if (this.lastCleanup + cleanupCycle > System.currentTimeMillis()) return; + //System.out.println("----cleanup check"); this.array.shrink(this.targetFileSize, this.maxFileSize); this.lastCleanup = System.currentTimeMillis(); } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index fadbd6ad5..948e399a9 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -605,8 +605,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String INDEXER = "80_indexing"

- *

Name of the indexer thread, performing the actual indexing of a website

- */ - public static final String PARSER = "74_indexing"; - public static final String PARSER_MEMPREREQ = "74_indexing_memprereq"; - public static final String PARSER_IDLESLEEP = "74_indexing_idlesleep"; - public static final String PARSER_BUSYSLEEP = "74_indexing_busysleep"; - public static final String PARSER_METHOD_START = "deQueueProcess"; - public static final String PARSER_METHOD_JOBCOUNT = "queueSize"; - public static final String PARSER_METHOD_FREEMEM = "deQueueFreeMem"; // 80_indexing /** *

public static final String INDEXER = "80_indexing"

@@ -135,18 +123,6 @@ public final class plasmaSwitchboardConstants { public static final String INDEXER_METHOD_JOBCOUNT = "queueSize"; public static final String INDEXER_METHOD_FREEMEM = "deQueueFreeMem"; public static final String INDEXER_SLOTS = "indexer.slots"; - // 85_cacheflush - /** - * the cache flush thread starts a flush of the RAM cache. - * This periodic flushing replaces the permanent flushing - */ - public static final String CACHEFLUSH = "85_cacheflush"; - public static final String CACHEFLUSH_MEMPREREQ = "85_cacheflush_memprereq"; - public static final String CACHEFLUSH_IDLESLEEP = "85_cacheflush_idlesleep"; - public static final String CACHEFLUSH_BUSYSLEEP = "85_cacheflush_busysleep"; - public static final String CACHEFLUSH_METHOD_START = "rwiCacheFlush"; - public static final String CACHEFLUSH_METHOD_JOBCOUNT = "rwiCacheSize"; - public static final String CACHEFLUSH_METHOD_FREEMEM = "deQueueFreeMem"; // 90_cleanup /** *

public static final String CLEANUP = "90_cleanup"