bugfix for crawler double-check: if a URL is redirected, the
redirect target was not double-checked. This is now done by placing
the redirect URL on the crawl queue again (where it is double-checked)
Michael Peter Christen 10 years ago
parent 9ac0c93f17
commit eca9380e3d
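
In essence, the fix stops trusting redirect targets implicitly: the loader repoints the request at the target and pushes it back through the crawl stacker, so the target passes the same checks as any freshly discovered URL. A minimal sketch of this pattern, with simplified stand-in types (only redirectURL and stackCrawl mirror identifiers from the diff below; everything else is hypothetical):

    import java.io.IOException;

    // Hypothetical stand-ins for YaCy's CrawlStacker and Request types.
    interface Stacker { String stackCrawl(CrawlRequest request); }

    final class CrawlRequest {
        private String url;
        CrawlRequest(String url) { this.url = url; }
        void redirectURL(String target) { this.url = target; } // repoint at the redirect target
        String url() { return this.url; }
    }

    final class RedirectHandler {
        private final Stacker stacker;
        RedirectHandler(Stacker stacker) { this.stacker = stacker; }

        // Called when the HTTP client sees a 3xx response while crawling.
        void onRedirect(CrawlRequest request, String redirectTarget) throws IOException {
            // repoint the request at the redirect target ...
            request.redirectURL(redirectTarget);
            // ... and re-stack it, so the target passes the same double-check
            // (blacklist, domain counter, seen-before test) as any new URL
            stacker.stackCrawl(request);
            // abort the current load; not an error, the target is fetched in its own turn
            throw new IOException("redirect placed on crawler queue for double-check");
        }
    }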

@@ -415,7 +415,7 @@ public final class CrawlStacker {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toNormalform(false) + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
return "crawl stack domain counter exceeded (test by profile)";
}
/*
if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
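
For context, the check above caps how many pages a single domain may contribute to a crawl. A self-contained sketch of such a per-domain counter, using hypothetical names rather than YaCy's actual classes:

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.atomic.AtomicInteger;

    // Hypothetical per-domain page counter, assuming one shared counter per crawl.
    final class DomainCounter {
        private final ConcurrentHashMap<String, AtomicInteger> counts = new ConcurrentHashMap<>();
        private final int maxAllowedPagesPerDomain;

        DomainCounter(int maxAllowedPagesPerDomain) {
            this.maxAllowedPagesPerDomain = maxAllowedPagesPerDomain;
        }

        // Returns a rejection reason, or null if the host is still under the cap.
        String check(String host) {
            final AtomicInteger dp = counts.computeIfAbsent(host, h -> new AtomicInteger());
            if (dp.incrementAndGet() > maxAllowedPagesPerDomain) {
                return "crawl stack domain counter exceeded (test by profile)";
            }
            return null;
        }
    }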

@@ -34,6 +34,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.kelondro.io.ByteCount;
@@ -158,10 +159,19 @@ public final class HTTPLoader {
}
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// we have two use cases here: loading from a crawl or just loading the url. Check this:
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
// put redirect url on the crawler queue to repeat a double-check
request.redirectURL(redirectionUrl);
this.sb.crawlStacker.stackCrawl(request);
// in the end we must throw an exception (even if this is not an error), just to abort the current process
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
}
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
}
// retry crawling with new url
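
Taken together, the new code branches three ways: re-stack and abort for real crawls, fail hard during shutdown, and otherwise retry with the new URL. A condensed sketch of that control flow, with simplified hypothetical names (the booleans stand in for the config and profile tests in the diff):

    import java.io.IOException;

    // Condensed, hypothetical sketch of the redirect control flow added above.
    final class RedirectFlow {
        private final boolean followRedirects; // cf. SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS
        private final boolean isRealCrawl;     // cf. profile not in CrawlSwitchboard.DEFAULT_PROFILES

        RedirectFlow(boolean followRedirects, boolean isRealCrawl) {
            this.followRedirects = followRedirects;
            this.isRealCrawl = isRealCrawl;
        }

        // Returns the URL to retry with, or throws to abort the current load.
        String handle(String requestUrl, String redirectUrl) throws IOException {
            if (!this.followRedirects) throw new IOException("redirects disabled");
            if (this.isRealCrawl) {
                // crawl case: the target was re-stacked for a double-check; abort here
                throw new IOException("redirect of " + requestUrl + " to " + redirectUrl
                        + " placed on crawler queue for double-check");
            }
            if (Thread.currentThread().isInterrupted()) {
                // shutdown case: do not retry, record a final load error instead
                throw new IOException("redirect of " + requestUrl + " aborted because of server shutdown");
            }
            // plain load case (e.g. loading a single URL outside a crawl): retry with the new URL
            return redirectUrl;
        }
    }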
