*) The crawler start URL is now normalized before crawling is started

*) CrawlWorker now also normalizes the URL before following a redirection URL
*) CrawlWorker now correctly removes the redirection URL from the noticeURL stack

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@571 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent ab894d26bc
commit 330eae7cf3

@ -62,6 +62,7 @@ import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaCrawlProfile;
@ -123,8 +124,15 @@ public class IndexCreate_p {
String crawlingMode = post.get("crawlingMode","url");
if (crawlingMode.equals("url")) {
String crawlingStart = (String) post.get("crawlingURL");
// getting the crawljob start url
String crawlingStart = post.get("crawlingURL","");
crawlingStart = crawlingStart.trim();
// adding the prefix http:// if necessary
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
// normalizing URL
crawlingStart = plasmaParser.urlNormalform(crawlingStart);
// check if url is proper
URL crawlingStartURL = null;
@ -216,6 +224,13 @@ public class IndexCreate_p {
Map.Entry e = (Map.Entry) interator.next();
String nexturlstring = (String) e.getKey();
if (nexturlstring == null) continue;
nexturlstring = nexturlstring.trim();
// normalizing URL
nexturlstring = plasmaParser.urlNormalform(nexturlstring);
// generating an url object
URL nexturlURL = null;
try {

@ -55,6 +55,8 @@ import org.apache.commons.pool.impl.GenericObjectPool;
public final class plasmaCrawlLoader extends Thread {
static plasmaSwitchboard switchboard;
private final plasmaHTCache cacheManager;
private final int socketTimeout;
private final serverLog log;
@ -66,7 +68,6 @@ public final class plasmaCrawlLoader extends Thread {
private boolean stopped = false;
public plasmaCrawlLoader(
plasmaSwitchboard sb,
plasmaHTCache cacheManager,
serverLog log) {
@ -75,7 +76,7 @@ public final class plasmaCrawlLoader extends Thread {
this.cacheManager = cacheManager;
this.log = log;
this.socketTimeout = Integer.parseInt(sb.getConfig("clientTimeout", "10000"));
this.socketTimeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000"));
// configuring the crawler messagequeue
this.theQueue = new CrawlerMessageQueue();
@ -86,12 +87,12 @@ public final class plasmaCrawlLoader extends Thread {
// The maximum number of active connections that can be allocated from pool at the same time,
// 0 for no limit
this.cralwerPoolConfig.maxActive = Integer.parseInt(sb.getConfig("crawlerMaxActiveThreads","10"));
this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawlerMaxActiveThreads","10"));
// The maximum number of idle connections in the pool
// 0 = no limit.
this.cralwerPoolConfig.maxIdle = Integer.parseInt(sb.getConfig("crawlerMaxIdleThreads","7"));
this.cralwerPoolConfig.minIdle = Integer.parseInt(sb.getConfig("crawlerMinIdleThreads","5"));
this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawlerMaxIdleThreads","7"));
this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawlerMinIdleThreads","5"));
// block indefinitely
this.cralwerPoolConfig.maxWait = -1;
@ -106,9 +107,9 @@ public final class plasmaCrawlLoader extends Thread {
this.theThreadGroup,
cacheManager,
socketTimeout,
sb.getConfig("remoteProxyUse","false").equals("true"),
sb.getConfig("remoteProxyHost",""),
Integer.parseInt(sb.getConfig("remoteProxyPort","3128")),
switchboard.getConfig("remoteProxyUse","false").equals("true"),
switchboard.getConfig("remoteProxyHost",""),
Integer.parseInt(switchboard.getConfig("remoteProxyPort","3128")),
log);
this.crawlwerPool = new CrawlerPool(theFactory,this.cralwerPoolConfig,this.theThreadGroup);

@ -365,8 +365,15 @@ public final class plasmaCrawlWorker extends Thread {
} else if (res.status.startsWith("30")) {
if (crawlingRetryCount < 0) {
if (res.responseHeader.containsKey(httpHeader.LOCATION)) {
// generating the new url
URL redirectionUrl = new URL(url, (String) res.responseHeader.get(httpHeader.LOCATION));
// getting redirection URL
String redirectionUrlString = (String) res.responseHeader.get(httpHeader.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
// normalizing URL
redirectionUrlString = plasmaParser.urlNormalform(redirectionUrlString);
// generating the new URL object
URL redirectionUrl = new URL(url, redirectionUrlString);
// returning the used httpc
httpc.returnInstance(remote);
@ -382,6 +389,12 @@ public final class plasmaCrawlWorker extends Thread {
return;
}
// generating url hash
String urlhash = plasmaURL.urlHash(redirectionUrl);
// removing url from loader queue
plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash);
// retry crawling with new url
load(redirectionUrl,
name,

@ -297,8 +297,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
catch (NumberFormatException e) { remoteport = 3128; }
crawlSlots = Integer.parseInt(getConfig("crawlerMaxActiveThreads", "10"));
plasmaCrawlLoader.switchboard = this;
this.cacheLoader = new plasmaCrawlLoader(
this,
this.cacheManager,
this.log);

Loading…
Cancel
Save