*) Bugfix for "Crawler data will not be removed from htcache if content parsing failed"

See: http://www.yacy-forum.de/viewtopic.php?t=965&highlight=ramdisk
*) Making ACCEPT_LANGUAGE configurable for crawler
   See: http://www.yacy-forum.de/viewtopic.php?p=8327

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@583 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent cf03c139e7
commit 17be77a468

@ -197,9 +197,9 @@ public class Performance_p {
switchboard.cacheLoader.setPoolConfig(crawlerPoolConfig);
// storing the new values into configfile
switchboard.setConfig("crawlerMaxActiveThreads",maxActive);
switchboard.setConfig("crawlerMaxIdleThreads",maxIdle);
switchboard.setConfig("crawlerMinIdleThreads",minIdle);
switchboard.setConfig("crawler.MaxActiveThreads",maxActive);
switchboard.setConfig("crawler.MaxIdleThreads",maxIdle);
switchboard.setConfig("crawler.MinIdleThreads",minIdle);
/*
* configuring the http pool

@ -46,10 +46,8 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import de.anomic.server.serverCore;
import de.anomic.server.serverSemaphore;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverCore.Session;
import org.apache.commons.pool.impl.GenericObjectPool;
@ -76,7 +74,7 @@ public final class plasmaCrawlLoader extends Thread {
this.cacheManager = cacheManager;
this.log = log;
this.socketTimeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000"));
this.socketTimeout = Integer.parseInt(switchboard.getConfig("crawler.clientTimeout", "10000"));
// configuring the crawler messagequeue
this.theQueue = new CrawlerMessageQueue();
@ -87,12 +85,12 @@ public final class plasmaCrawlLoader extends Thread {
// The maximum number of active connections that can be allocated from pool at the same time,
// 0 for no limit
this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawlerMaxActiveThreads","10"));
this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawler.MaxActiveThreads","10"));
// The maximum number of idle connections connections in the pool
// 0 = no limit.
this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawlerMaxIdleThreads","7"));
this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawlerMinIdleThreads","5"));
this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawler.MaxIdleThreads","7"));
this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawler.MinIdleThreads","5"));
// block undefinitely
this.cralwerPoolConfig.maxWait = -1;

@ -301,11 +301,15 @@ public final class plasmaCrawlWorker extends Thread {
// take a file from the net
httpc remote = null;
try {
plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
// create a request header
httpHeader requestHeader = new httpHeader();
requestHeader.put("User-Agent", httpdProxyHandler.userAgent);
requestHeader.put("Referer", referer);
if (useContentEncodingGzip) requestHeader.put("Accept-Encoding", "gzip,deflate");
requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.userAgent);
requestHeader.put(httpHeader.REFERER, referer);
requestHeader.put(httpHeader.ACCEPT_LANGUAGE, sb.getConfig("crawler.acceptLanguage","en-us,en;q=0.5"));
requestHeader.put(httpHeader.ACCEPT_CHARSET, sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
if (useContentEncodingGzip) requestHeader.put(httpHeader.ACCEPT_ENCODING, "gzip,deflate");
//System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG
@ -433,6 +437,7 @@ public final class plasmaCrawlWorker extends Thread {
log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + url.toString() +
"'. Retrying request without using gzip content encoding.");
retryCrawling = true;
// java.net.SocketTimeoutException: connect timed out
} else if (errorMsg.indexOf("Socket time-out: Read timed out") >= 0) {
log.logWarning("CRAWLER Read timeout while receiving content from '" + url.toString() +
"'. Retrying request.");

@ -297,7 +297,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
try { remoteport = Integer.parseInt(getConfig("remoteProxyPort","3128")); }
catch (NumberFormatException e) { remoteport = 3128; }
crawlSlots = Integer.parseInt(getConfig("crawlerMaxActiveThreads", "10"));
crawlSlots = Integer.parseInt(getConfig("crawler.MaxActiveThreads", "10"));
plasmaCrawlLoader.switchboard = this;
this.cacheLoader = new plasmaCrawlLoader(
this.cacheManager,
@ -953,7 +953,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url().toString() +
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.normalizedURLString() +
", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
}
@ -1035,13 +1035,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
// explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url());
document = null; entry = null;
document = null;
} catch (IOException e) {
log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString());
} finally {
// explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url());
entry = null;
}
}
@ -1142,7 +1143,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return;
}
cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo(stats + ": enqueued for load " + urlEntry.url());
log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]");
return;
}

@ -25,11 +25,6 @@ httpdMaxActiveSessions = 150
httpdMaxIdleSessions = 75
httpdMinIdleSessions = 5
# maximum number of crawler threads
crawlerMaxActiveThreads = 10
crawlerMaxIdleThreads = 7
crawlerMinIdleThreads = 5
# default root path for the file server
# may be overridden by the htdocs parameter
# users shall be encouraged to use the htdocs path for individual content,
@ -510,3 +505,13 @@ msgForwardingTo=root@localhost
#onlineCautionDelay: delay time after proxy usage before crawling is resumed
onlineCautionDelay=30000
# Some configuration values for the crawler
crawler.acceptLanguage=en-us,en;q=0.5
crawler.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.clientTimeout=9000
# maximum number of crawler threads
crawler.MaxActiveThreads = 10
crawler.MaxIdleThreads = 7
crawler.MinIdleThreads = 5
Loading…
Cancel
Save