*) The crawler start URL is now normalized before crawling is started

*) CrawlWorker now also normalizes the redirection URL before following it

*) CrawlWorker now correctly removes the redirection URL from the noticeURL stack

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@571 6c8d7289-2bf4-0310-a012-ef5d649a1542
20 years ago · 330eae7cf3
parent ab894d26bc
commit 330eae7cf3
4 changed files with 41 additions and 12 deletions
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@ -62,6 +62,7 @@ import de.anomic.htmlFilter.htmlFilterOutputStream;
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaCrawlProfile;
+import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.plasmaCrawlProfile;
@ -123,8 +124,15 @@ public class IndexCreate_p {
                    
                    String crawlingMode = post.get("crawlingMode","url");
                    if (crawlingMode.equals("url")) {
-                        String crawlingStart = (String) post.get("crawlingURL");
+                        // getting the crawljob start url
+                        String crawlingStart = post.get("crawlingURL","");
+                        crawlingStart = crawlingStart.trim();
+                        
+                        // adding the prefix http:// if necessary
                        if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
+
+                        // normalizing URL
+                        crawlingStart = plasmaParser.urlNormalform(crawlingStart);
                        
                        // check if url is proper
                        URL crawlingStartURL = null;
@ -216,6 +224,13 @@ public class IndexCreate_p {
                                    Map.Entry e = (Map.Entry) interator.next();
                                    String nexturlstring = (String) e.getKey();
                                    
+                                    if (nexturlstring == null) continue;
+                                    
+                                    nexturlstring = nexturlstring.trim();
+                                    
+                                    // normalizing URL
+                                    nexturlstring = plasmaParser.urlNormalform(nexturlstring);                                    
+                                    
                                    // generating an url object
                                    URL nexturlURL = null;
                                    try {
--- a/source/de/anomic/plasma/plasmaCrawlLoader.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoader.java
@ -55,6 +55,8 @@ import org.apache.commons.pool.impl.GenericObjectPool;

 public final class plasmaCrawlLoader extends Thread {

+    static plasmaSwitchboard switchboard;
+    
    private final plasmaHTCache   cacheManager;
    private final int             socketTimeout;
    private final serverLog       log;   
@ -66,7 +68,6 @@ public final class plasmaCrawlLoader extends Thread {
    private boolean stopped = false;
    
    public plasmaCrawlLoader(
-            plasmaSwitchboard sb,
            plasmaHTCache cacheManager, 
            serverLog log) {
        
@ -75,7 +76,7 @@ public final class plasmaCrawlLoader extends Thread {
    	this.cacheManager    = cacheManager;
    	this.log             = log;
        
-    	this.socketTimeout   = Integer.parseInt(sb.getConfig("clientTimeout", "10000"));
+    	this.socketTimeout   = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000"));
        
        // configuring the crawler messagequeue
        this.theQueue = new CrawlerMessageQueue();
@ -86,12 +87,12 @@ public final class plasmaCrawlLoader extends Thread {
        
        // The maximum number of active connections that can be allocated from pool at the same time,
        // 0 for no limit
-        this.cralwerPoolConfig.maxActive = Integer.parseInt(sb.getConfig("crawlerMaxActiveThreads","10"));
+        this.cralwerPoolConfig.maxActive = Integer.parseInt(switchboard.getConfig("crawlerMaxActiveThreads","10"));
        
        // The maximum number of idle connections connections in the pool
        // 0 = no limit.        
-        this.cralwerPoolConfig.maxIdle = Integer.parseInt(sb.getConfig("crawlerMaxIdleThreads","7"));
-        this.cralwerPoolConfig.minIdle = Integer.parseInt(sb.getConfig("crawlerMinIdleThreads","5"));    
+        this.cralwerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawlerMaxIdleThreads","7"));
+        this.cralwerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawlerMinIdleThreads","5"));    
        
        // block undefinitely 
        this.cralwerPoolConfig.maxWait = -1; 
@ -106,9 +107,9 @@ public final class plasmaCrawlLoader extends Thread {
                this.theThreadGroup,
                cacheManager,
                socketTimeout,
-                sb.getConfig("remoteProxyUse","false").equals("true"),
-                sb.getConfig("remoteProxyHost",""),
-                Integer.parseInt(sb.getConfig("remoteProxyPort","3128")),
+                switchboard.getConfig("remoteProxyUse","false").equals("true"),
+                switchboard.getConfig("remoteProxyHost",""),
+                Integer.parseInt(switchboard.getConfig("remoteProxyPort","3128")),
                log);
        
        this.crawlwerPool = new CrawlerPool(theFactory,this.cralwerPoolConfig,this.theThreadGroup);        
--- a/source/de/anomic/plasma/plasmaCrawlWorker.java
+++ b/source/de/anomic/plasma/plasmaCrawlWorker.java
@ -365,8 +365,15 @@ public final class plasmaCrawlWorker extends Thread {
            } else if (res.status.startsWith("30")) {
                if (crawlingRetryCount < 0) {                    
                    if (res.responseHeader.containsKey(httpHeader.LOCATION)) {
-                        // generating the new url
-                        URL redirectionUrl = new URL(url, (String) res.responseHeader.get(httpHeader.LOCATION));
+                        // getting redirection URL
+                        String redirectionUrlString = (String) res.responseHeader.get(httpHeader.LOCATION);
+                        redirectionUrlString = redirectionUrlString.trim();
+                        
+                        // normalizing URL
+                        redirectionUrlString = plasmaParser.urlNormalform(redirectionUrlString);
+                        
+                        // generating the new URL object
+                        URL redirectionUrl = new URL(url, redirectionUrlString);
                        
                        // returning the used httpc
                        httpc.returnInstance(remote); 
@ -382,6 +389,12 @@ public final class plasmaCrawlWorker extends Thread {
                            return;
                        }                        
                        
+                        // generating url hash 
+                        String urlhash = plasmaURL.urlHash(redirectionUrl);
+
+                        // removing url from loader queue
+                        plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash);                        
+                        
                        // retry crawling with new url
                        load(redirectionUrl,
                             name,
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -297,8 +297,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        catch (NumberFormatException e) { remoteport = 3128; }
        
        crawlSlots = Integer.parseInt(getConfig("crawlerMaxActiveThreads", "10"));
+        plasmaCrawlLoader.switchboard = this;
        this.cacheLoader = new plasmaCrawlLoader(
-                this,
                this.cacheManager, 
                this.log);