From 9bfb2641dbaf35eea53a50c90eb9d495e922face Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 1 Apr 2009 20:13:57 +0000
Subject: [PATCH] - Removed deprecated threads. - Added automatic HTTP client
reset. This was necessary because excessive intranet crawling caused
deadlocks; this hack solved the problem.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5768 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
source/de/anomic/crawler/CrawlQueues.java | 8 ++-
source/de/anomic/crawler/CrawlStacker.java | 8 +--
source/de/anomic/crawler/Latency.java | 2 +-
source/de/anomic/http/httpClient.java | 51 +++++++++++--------
source/de/anomic/kelondro/text/IndexCell.java | 1 +
.../de/anomic/plasma/plasmaSwitchboard.java | 12 -----
.../plasma/plasmaSwitchboardConstants.java | 24 ---------
7 files changed, 42 insertions(+), 64 deletions(-)
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index eb4f84819..310a6c726 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -36,6 +36,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
+import de.anomic.http.httpClient;
import de.anomic.kelondro.table.FlexWidthArray;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.DateFormatter;
@@ -571,7 +572,8 @@ public class CrawlQueues {
1,
"denied by robots.txt");
eentry.store();
- errorURL.push(eentry);
+ errorURL.push(eentry);
+ this.entry.setStatus("worker-disallowed", serverProcessorJob.STATUS_FINISHED);
} else {
// starting a load from the internet
this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
@@ -585,6 +587,7 @@ public class CrawlQueues {
"cannot load: " + result);
eentry.store();
errorURL.push(eentry);
+ this.entry.setStatus("worker-error", serverProcessorJob.STATUS_FINISHED);
} else {
this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
}
@@ -599,9 +602,10 @@ public class CrawlQueues {
eentry.store();
errorURL.push(eentry);
e.printStackTrace();
+ httpClient.initConnectionManager();
+ this.entry.setStatus("worker-exception", serverProcessorJob.STATUS_FINISHED);
} finally {
workers.remove(code);
- this.entry.setStatus("worker-finalized", serverProcessorJob.STATUS_FINISHED);
}
}
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index b64375dda..5a807d1ca 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -289,25 +289,25 @@ public final class CrawlStacker {
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
- this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT));
+ //this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT));
} else if (local) {
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
- this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
+ //this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
} else if (proxy) {
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
- this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
+ //this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
} else if (remote) {
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
- this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE));
+ //this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE));
}
return null;
diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java
index 2e2bc57fc..b24314cf0 100644
--- a/source/de/anomic/crawler/Latency.java
+++ b/source/de/anomic/crawler/Latency.java
@@ -185,7 +185,7 @@ public class Latency {
}
public void slowdown() {
this.lastacc = System.currentTimeMillis();
- this.timeacc = Math.min(60000, average() * 5);
+ this.timeacc = Math.min(60000, average() * 2);
this.count = 1;
}
public int count() {
diff --git a/source/de/anomic/http/httpClient.java b/source/de/anomic/http/httpClient.java
index 91ab16a4c..a1de3bfe7 100644
--- a/source/de/anomic/http/httpClient.java
+++ b/source/de/anomic/http/httpClient.java
@@ -76,8 +76,8 @@ public class httpClient {
* "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency."
* (Concurrent execution of HTTP methods, http://hc.apache.org/httpclient-3.x/performance.html)
*/
- private final static MultiThreadedHttpConnectionManager conManager = new MultiThreadedHttpConnectionManager();
- private final static HttpClient apacheHttpClient = new HttpClient(conManager);
+ private static MultiThreadedHttpConnectionManager conManager = null;
+ private static HttpClient apacheHttpClient = null;
// last ; must be before location (this is parsed)
private final static String jakartaUserAgent = " " +
@@ -87,25 +87,8 @@ public class httpClient {
/**
* set options for client
*/
- // simple user agent
- setUserAgent("yacy (www.yacy.net; " + getSystemOST() + ")");
- // only one retry
- apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
- new DefaultHttpMethodRetryHandler(1, false));
- /**
- * set options for connection manager
- */
- // conManager.getParams().setDefaultMaxConnectionsPerHost(4); // default 2
- HostConfiguration localHostConfiguration = new HostConfiguration();
- conManager.getParams().setMaxTotalConnections(200); // Proxy may need many connections
- conManager.getParams().setConnectionTimeout(60000); // set a default timeout
- conManager.getParams().setDefaultMaxConnectionsPerHost(3); // prevent DoS by mistake
- localHostConfiguration.setHost("localhost");
- conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100);
- localHostConfiguration.setHost("127.0.0.1");
- conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100);
- // TODO should this be configurable?
-
+ initConnectionManager();
+
// accept self-signed or untrusted certificates
Protocol.registerProtocol("https", new Protocol("https",
(ProtocolSocketFactory) new AcceptEverythingSSLProtcolSocketFactory(), 443));
@@ -125,6 +108,32 @@ public class httpClient {
System.setProperty("sun.net.client.defaultReadTimeout", "60000");
}
+ public static void initConnectionManager() {
+ MultiThreadedHttpConnectionManager.shutdownAll();
+ conManager = new MultiThreadedHttpConnectionManager();
+ apacheHttpClient = new HttpClient(conManager);
+
+ /**
+ * set options for connection manager
+ */
+ // conManager.getParams().setDefaultMaxConnectionsPerHost(4); // default 2
+ HostConfiguration localHostConfiguration = new HostConfiguration();
+ conManager.getParams().setMaxTotalConnections(200); // Proxy may need many connections
+ conManager.getParams().setConnectionTimeout(60000); // set a default timeout
+ conManager.getParams().setDefaultMaxConnectionsPerHost(10);
+ localHostConfiguration.setHost("localhost");
+ conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100);
+ localHostConfiguration.setHost("127.0.0.1");
+ conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100);
+
+ // only one retry
+ apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
+ new DefaultHttpMethodRetryHandler(1, false));
+ // simple user agent
+ setUserAgent("yacy (www.yacy.net; " + getSystemOST() + ")");
+
+ }
+
/**
* every x milliseconds do a cleanup (close old connections)
*
diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java
index 640a1d4c8..9f6aac231 100644
--- a/source/de/anomic/kelondro/text/IndexCell.java
+++ b/source/de/anomic/kelondro/text/IndexCell.java
@@ -284,6 +284,7 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
// clean-up the cache
if (this.lastCleanup + cleanupCycle > System.currentTimeMillis()) return;
+ //System.out.println("----cleanup check");
this.array.shrink(this.targetFileSize, this.maxFileSize);
this.lastCleanup = System.currentTimeMillis();
}
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index fadbd6ad5..948e399a9 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -605,8 +605,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String INDEXER = "80_indexing"
- * Name of the indexer thread, performing the actual indexing of a website
- */
- public static final String PARSER = "74_indexing";
- public static final String PARSER_MEMPREREQ = "74_indexing_memprereq";
- public static final String PARSER_IDLESLEEP = "74_indexing_idlesleep";
- public static final String PARSER_BUSYSLEEP = "74_indexing_busysleep";
- public static final String PARSER_METHOD_START = "deQueueProcess";
- public static final String PARSER_METHOD_JOBCOUNT = "queueSize";
- public static final String PARSER_METHOD_FREEMEM = "deQueueFreeMem";
// 80_indexing
/**
* public static final String INDEXER = "80_indexing"
@@ -135,18 +123,6 @@ public final class plasmaSwitchboardConstants {
public static final String INDEXER_METHOD_JOBCOUNT = "queueSize";
public static final String INDEXER_METHOD_FREEMEM = "deQueueFreeMem";
public static final String INDEXER_SLOTS = "indexer.slots";
- // 85_cacheflush
- /**
- * the cache flush thread starts a flush of the RAM cache.
- * This periodic flushing replaces the permanent flushing
- */
- public static final String CACHEFLUSH = "85_cacheflush";
- public static final String CACHEFLUSH_MEMPREREQ = "85_cacheflush_memprereq";
- public static final String CACHEFLUSH_IDLESLEEP = "85_cacheflush_idlesleep";
- public static final String CACHEFLUSH_BUSYSLEEP = "85_cacheflush_busysleep";
- public static final String CACHEFLUSH_METHOD_START = "rwiCacheFlush";
- public static final String CACHEFLUSH_METHOD_JOBCOUNT = "rwiCacheSize";
- public static final String CACHEFLUSH_METHOD_FREEMEM = "deQueueFreeMem";
// 90_cleanup
/**
* public static final String CLEANUP = "90_cleanup"