*) Bugfix for "Crawler data will not be removed from htcache if content parsing failed"

See: http://www.yacy-forum.de/viewtopic.php?t=965&highlight=ramdisk
*) Making ACCEPT_LANGUAGE configurable for crawler
   See: http://www.yacy-forum.de/viewtopic.php?p=8327

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@583 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent cf03c139e7
commit 17be77a468

@ -197,9 +197,9 @@ public class Performance_p {
switchboard.cacheLoader.setPoolConfig(crawlerPoolConfig); switchboard.cacheLoader.setPoolConfig(crawlerPoolConfig);
// storing the new values into configfile // storing the new values into configfile
switchboard.setConfig("crawlerMaxActiveThreads",maxActive); switchboard.setConfig("crawler.MaxActiveThreads",maxActive);
switchboard.setConfig("crawlerMaxIdleThreads",maxIdle); switchboard.setConfig("crawler.MaxIdleThreads",maxIdle);
switchboard.setConfig("crawlerMinIdleThreads",minIdle); switchboard.setConfig("crawler.MinIdleThreads",minIdle);
/* /*
* configuring the http pool * configuring the http pool

@ -46,10 +46,8 @@ import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import de.anomic.server.serverCore;
import de.anomic.server.serverSemaphore; import de.anomic.server.serverSemaphore;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.server.serverCore.Session;
import org.apache.commons.pool.impl.GenericObjectPool; import org.apache.commons.pool.impl.GenericObjectPool;
@ -76,7 +74,7 @@ public final class plasmaCrawlLoader extends Thread {
this.cacheManager = cacheManager; this.cacheManager = cacheManager;
this.log = log; this.log = log;
this.socketTimeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000")); this.socketTimeout = Integer.parseInt(switchboard.getConfig("crawler.clientTimeout", "10000"));
// configuring the crawler messagequeue // configuring the crawler messagequeue
this.theQueue = new CrawlerMessageQueue(); this.theQueue = new CrawlerMessageQueue();
@ -87,12 +85,12 @@ public final class plasmaCrawlLoader extends Thread {
// The maximum number of active connections that can be allocated from pool at the same time, // The maximum number of active connections that can be allocated from pool at the same time,
// 0 for no limit // 0 for no limit
this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawlerMaxActiveThreads","10")); this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawler.MaxActiveThreads","10"));
// The maximum number of idle connections in the pool // The maximum number of idle connections in the pool
// 0 = no limit. // 0 = no limit.
this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawlerMaxIdleThreads","7")); this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawler.MaxIdleThreads","7"));
this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawlerMinIdleThreads","5")); this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawler.MinIdleThreads","5"));
// block indefinitely // block indefinitely
this.cralwerPoolConfig.maxWait = -1; this.cralwerPoolConfig.maxWait = -1;

@ -301,11 +301,15 @@ public final class plasmaCrawlWorker extends Thread {
// take a file from the net // take a file from the net
httpc remote = null; httpc remote = null;
try { try {
plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
// create a request header // create a request header
httpHeader requestHeader = new httpHeader(); httpHeader requestHeader = new httpHeader();
requestHeader.put("User-Agent", httpdProxyHandler.userAgent); requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.userAgent);
requestHeader.put("Referer", referer); requestHeader.put(httpHeader.REFERER, referer);
if (useContentEncodingGzip) requestHeader.put("Accept-Encoding", "gzip,deflate"); requestHeader.put(httpHeader.ACCEPT_LANGUAGE, sb.getConfig("crawler.acceptLanguage","en-us,en;q=0.5"));
requestHeader.put(httpHeader.ACCEPT_CHARSET, sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
if (useContentEncodingGzip) requestHeader.put(httpHeader.ACCEPT_ENCODING, "gzip,deflate");
//System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG //System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG
@ -433,6 +437,7 @@ public final class plasmaCrawlWorker extends Thread {
log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + url.toString() + log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + url.toString() +
"'. Retrying request without using gzip content encoding."); "'. Retrying request without using gzip content encoding.");
retryCrawling = true; retryCrawling = true;
// java.net.SocketTimeoutException: connect timed out
} else if (errorMsg.indexOf("Socket time-out: Read timed out") >= 0) { } else if (errorMsg.indexOf("Socket time-out: Read timed out") >= 0) {
log.logWarning("CRAWLER Read timeout while receiving content from '" + url.toString() + log.logWarning("CRAWLER Read timeout while receiving content from '" + url.toString() +
"'. Retrying request."); "'. Retrying request.");

@ -297,7 +297,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
try { remoteport = Integer.parseInt(getConfig("remoteProxyPort","3128")); } try { remoteport = Integer.parseInt(getConfig("remoteProxyPort","3128")); }
catch (NumberFormatException e) { remoteport = 3128; } catch (NumberFormatException e) { remoteport = 3128; }
crawlSlots = Integer.parseInt(getConfig("crawlerMaxActiveThreads", "10")); crawlSlots = Integer.parseInt(getConfig("crawler.MaxActiveThreads", "10"));
plasmaCrawlLoader.switchboard = this; plasmaCrawlLoader.switchboard = this;
this.cacheLoader = new plasmaCrawlLoader( this.cacheLoader = new plasmaCrawlLoader(
this.cacheManager, this.cacheManager,
@ -953,7 +953,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
} }
} }
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url().toString() + log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.normalizedURLString() +
", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
} }
@ -1035,13 +1035,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} }
// explicit delete/free resources document = null;
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url());
document = null; entry = null;
} catch (IOException e) { } catch (IOException e) {
log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString()); log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString());
} finally {
// explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url());
entry = null;
} }
} }
@ -1142,7 +1143,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return; return;
} }
cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile); cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo(stats + ": enqueued for load " + urlEntry.url()); log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]");
return; return;
} }

@ -25,11 +25,6 @@ httpdMaxActiveSessions = 150
httpdMaxIdleSessions = 75 httpdMaxIdleSessions = 75
httpdMinIdleSessions = 5 httpdMinIdleSessions = 5
# maximum number of crawler threads
crawlerMaxActiveThreads = 10
crawlerMaxIdleThreads = 7
crawlerMinIdleThreads = 5
# default root path for the file server # default root path for the file server
# may be overridden by the htdocs parameter # may be overridden by the htdocs parameter
# users shall be encouraged to use the htdocs path for individual content, # users shall be encouraged to use the htdocs path for individual content,
@ -510,3 +505,13 @@ msgForwardingTo=root@localhost
#onlineCautionDelay: delay time after proxy usage before crawling is resumed #onlineCautionDelay: delay time after proxy usage before crawling is resumed
onlineCautionDelay=30000 onlineCautionDelay=30000
# Some configuration values for the crawler
crawler.acceptLanguage=en-us,en;q=0.5
crawler.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.clientTimeout=9000
# maximum number of crawler threads
crawler.MaxActiveThreads = 10
crawler.MaxIdleThreads = 7
crawler.MinIdleThreads = 5
Loading…
Cancel
Save