diff --git a/htroot/Performance_p.java b/htroot/Performance_p.java index 9739afacc..075863d01 100644 --- a/htroot/Performance_p.java +++ b/htroot/Performance_p.java @@ -197,9 +197,9 @@ public class Performance_p { switchboard.cacheLoader.setPoolConfig(crawlerPoolConfig); // storing the new values into configfile - switchboard.setConfig("crawlerMaxActiveThreads",maxActive); - switchboard.setConfig("crawlerMaxIdleThreads",maxIdle); - switchboard.setConfig("crawlerMinIdleThreads",minIdle); + switchboard.setConfig("crawler.MaxActiveThreads",maxActive); + switchboard.setConfig("crawler.MaxIdleThreads",maxIdle); + switchboard.setConfig("crawler.MinIdleThreads",minIdle); /* * configuring the http pool diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index dc6618bb4..254a99124 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -46,10 +46,8 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; -import de.anomic.server.serverCore; import de.anomic.server.serverSemaphore; import de.anomic.server.logging.serverLog; -import de.anomic.server.serverCore.Session; import org.apache.commons.pool.impl.GenericObjectPool; @@ -76,7 +74,7 @@ public final class plasmaCrawlLoader extends Thread { this.cacheManager = cacheManager; this.log = log; - this.socketTimeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000")); + this.socketTimeout = Integer.parseInt(switchboard.getConfig("crawler.clientTimeout", "10000")); // configuring the crawler messagequeue this.theQueue = new CrawlerMessageQueue(); @@ -87,12 +85,12 @@ public final class plasmaCrawlLoader extends Thread { // The maximum number of active connections that can be allocated from pool at the same time, // 0 for no limit - this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawlerMaxActiveThreads","10")); + this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawler.MaxActiveThreads","10")); // The maximum number of idle connections connections in the pool // 0 = no limit. - this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawlerMaxIdleThreads","7")); - this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawlerMinIdleThreads","5")); + this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawler.MaxIdleThreads","7")); + this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawler.MinIdleThreads","5")); // block undefinitely this.cralwerPoolConfig.maxWait = -1; diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index 473e6f37c..b025e8cd4 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -301,11 +301,15 @@ public final class plasmaCrawlWorker extends Thread { // take a file from the net httpc remote = null; try { + plasmaSwitchboard sb = plasmaCrawlLoader.switchboard; + // create a request header httpHeader requestHeader = new httpHeader(); - requestHeader.put("User-Agent", httpdProxyHandler.userAgent); - requestHeader.put("Referer", referer); - if (useContentEncodingGzip) requestHeader.put("Accept-Encoding", "gzip,deflate"); + requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.userAgent); + requestHeader.put(httpHeader.REFERER, referer); + requestHeader.put(httpHeader.ACCEPT_LANGUAGE, sb.getConfig("crawler.acceptLanguage","en-us,en;q=0.5")); + requestHeader.put(httpHeader.ACCEPT_CHARSET, sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7")); + if (useContentEncodingGzip) requestHeader.put(httpHeader.ACCEPT_ENCODING, "gzip,deflate"); //System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG @@ -433,6 +437,7 @@ public final class plasmaCrawlWorker extends Thread { log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + url.toString() + "'. Retrying request without using gzip content encoding."); retryCrawling = true; +// java.net.SocketTimeoutException: connect timed out } else if (errorMsg.indexOf("Socket time-out: Read timed out") >= 0) { log.logWarning("CRAWLER Read timeout while receiving content from '" + url.toString() + "'. Retrying request."); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index de77596d8..92382bd52 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -297,7 +297,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser try { remoteport = Integer.parseInt(getConfig("remoteProxyPort","3128")); } catch (NumberFormatException e) { remoteport = 3128; } - crawlSlots = Integer.parseInt(getConfig("crawlerMaxActiveThreads", "10")); + crawlSlots = Integer.parseInt(getConfig("crawler.MaxActiveThreads", "10")); plasmaCrawlLoader.switchboard = this; this.cacheLoader = new plasmaCrawlLoader( this.cacheManager, @@ -953,7 +953,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); } } - log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url().toString() + + log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.normalizedURLString() + ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); } @@ -1035,13 +1035,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } - // explicit delete/free resources - if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url()); - document = null; entry = null; - + document = null; } catch (IOException e) { log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString()); + } finally { + // explicit delete/free resources + if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url()); + entry = null; } } @@ -1142,7 +1143,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return; } cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile); - log.logInfo(stats + ": enqueued for load " + urlEntry.url()); + log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]"); return; } diff --git a/yacy.init b/yacy.init index 42835cd1e..a791f00eb 100644 --- a/yacy.init +++ b/yacy.init @@ -25,11 +25,6 @@ httpdMaxActiveSessions = 150 httpdMaxIdleSessions = 75 httpdMinIdleSessions = 5 -# maximum number of crawler threads -crawlerMaxActiveThreads = 10 -crawlerMaxIdleThreads = 7 -crawlerMinIdleThreads = 5 - # default root path for the file server # may be overridden by the htdocs parameter # users shall be encouraged to use the htdocs path for individual content, @@ -510,3 +505,13 @@ msgForwardingTo=root@localhost #onlineCautionDelay: delay time after proxy usage before crawling is resumed onlineCautionDelay=30000 + +# Some configuration values for the crawler +crawler.acceptLanguage=en-us,en;q=0.5 +crawler.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7 +crawler.clientTimeout=9000 + +# maximum number of crawler threads +crawler.MaxActiveThreads = 10 +crawler.MaxIdleThreads = 7 +crawler.MinIdleThreads = 5 \ No newline at end of file