From 330eae7cf3ae3847ad28c1a7ffa494b2ae8b53af Mon Sep 17 00:00:00 2001 From: theli Date: Sun, 21 Aug 2005 22:52:46 +0000 Subject: [PATCH] *) Normalizing CrawlerStartURL now before crawling is started *) CrawlWorker also does a URL normalization now before following the redirection URL *) CrawlWorker removes redirection URL correctly from noticeURL stack now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@571 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreate_p.java | 17 ++++++++++++++++- source/de/anomic/plasma/plasmaCrawlLoader.java | 17 +++++++++-------- source/de/anomic/plasma/plasmaCrawlWorker.java | 17 +++++++++++++++-- source/de/anomic/plasma/plasmaSwitchboard.java | 2 +- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index 615a4b476..4a6d5e1c2 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -62,6 +62,7 @@ import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; +import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaCrawlProfile; @@ -123,8 +124,15 @@ public class IndexCreate_p { String crawlingMode = post.get("crawlingMode","url"); if (crawlingMode.equals("url")) { - String crawlingStart = (String) post.get("crawlingURL"); + // getting the crawljob start url + String crawlingStart = post.get("crawlingURL",""); + crawlingStart = crawlingStart.trim(); + + // adding the prefix http:// if necessary if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart; + + // normalizing URL + crawlingStart = plasmaParser.urlNormalform(crawlingStart); // check if url is proper URL crawlingStartURL = null; @@ -216,6 +224,13 @@ public class IndexCreate_p { Map.Entry e = (Map.Entry) interator.next(); String nexturlstring = (String) e.getKey(); + if (nexturlstring == null) continue; + + nexturlstring = nexturlstring.trim(); + + // normalizing URL + nexturlstring = plasmaParser.urlNormalform(nexturlstring); + // generating an url object URL nexturlURL = null; try { diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index 9d93656cf..dc6618bb4 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -55,6 +55,8 @@ import org.apache.commons.pool.impl.GenericObjectPool; public final class plasmaCrawlLoader extends Thread { + static plasmaSwitchboard switchboard; + private final plasmaHTCache cacheManager; private final int socketTimeout; private final serverLog log; @@ -66,7 +68,6 @@ public final class plasmaCrawlLoader extends Thread { private boolean stopped = false; public plasmaCrawlLoader( - plasmaSwitchboard sb, plasmaHTCache cacheManager, serverLog log) { @@ -75,7 +76,7 @@ public final class plasmaCrawlLoader extends Thread { this.cacheManager = cacheManager; this.log = log; - this.socketTimeout = Integer.parseInt(sb.getConfig("clientTimeout", "10000")); + this.socketTimeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000")); // configuring the crawler messagequeue this.theQueue = new CrawlerMessageQueue(); @@ -86,12 +87,12 @@ public final class plasmaCrawlLoader extends Thread { // The maximum number of active connections that can be allocated from pool at the same time, // 0 for no limit - this.cralwerPoolConfig.maxActive = Integer.parseInt(sb.getConfig("crawlerMaxActiveThreads","10")); + this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawlerMaxActiveThreads","10")); // The maximum number of idle connections connections in the pool // 0 = no limit. - this.cralwerPoolConfig.maxIdle = Integer.parseInt(sb.getConfig("crawlerMaxIdleThreads","7")); - this.cralwerPoolConfig.minIdle = Integer.parseInt(sb.getConfig("crawlerMinIdleThreads","5")); + this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawlerMaxIdleThreads","7")); + this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawlerMinIdleThreads","5")); // block undefinitely this.cralwerPoolConfig.maxWait = -1; @@ -106,9 +107,9 @@ public final class plasmaCrawlLoader extends Thread { this.theThreadGroup, cacheManager, socketTimeout, - sb.getConfig("remoteProxyUse","false").equals("true"), - sb.getConfig("remoteProxyHost",""), - Integer.parseInt(sb.getConfig("remoteProxyPort","3128")), + switchboard.getConfig("remoteProxyUse","false").equals("true"), + switchboard.getConfig("remoteProxyHost",""), + Integer.parseInt(switchboard.getConfig("remoteProxyPort","3128")), log); this.crawlwerPool = new CrawlerPool(theFactory,this.cralwerPoolConfig,this.theThreadGroup); diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index a9bdec5f2..473e6f37c 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -365,8 +365,15 @@ public final class plasmaCrawlWorker extends Thread { } else if (res.status.startsWith("30")) { if (crawlingRetryCount < 0) { if (res.responseHeader.containsKey(httpHeader.LOCATION)) { - // generating the new url - URL redirectionUrl = new URL(url, (String) res.responseHeader.get(httpHeader.LOCATION)); + // getting redirection URL + String redirectionUrlString = (String) res.responseHeader.get(httpHeader.LOCATION); + redirectionUrlString = redirectionUrlString.trim(); + + // normalizing URL + redirectionUrlString = plasmaParser.urlNormalform(redirectionUrlString); + + // generating the new URL object + URL redirectionUrl = new URL(url, redirectionUrlString); // returning the used httpc httpc.returnInstance(remote); @@ -382,6 +389,12 @@ public final class plasmaCrawlWorker extends Thread { return; } + // generating url hash + String urlhash = plasmaURL.urlHash(redirectionUrl); + + // removing url from loader queue + plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash); + // retry crawling with new url load(redirectionUrl, name, diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index eede8a499..0bade23d8 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -297,8 +297,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser catch (NumberFormatException e) { remoteport = 3128; } crawlSlots = Integer.parseInt(getConfig("crawlerMaxActiveThreads", "10")); + plasmaCrawlLoader.switchboard = this; this.cacheLoader = new plasmaCrawlLoader( - this, this.cacheManager, this.log);