bugfix for crawler double-check: if a URL is redirected, the

redirect target was not double-checked. This is now done by placing the redirect URL on the crawl queue again (where it is double-checked).
11 years ago · eca9380e3d
parent 9ac0c93f17
commit eca9380e3d
2 changed files with 12 additions and 2 deletions
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -34,6 +34,7 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.CrawlSwitchboard;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
 import net.yacy.kelondro.io.ByteCount;
@@ -158,10 +159,19 @@ public final class HTTPLoader {
            }
    	    if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
    	        // we have two use cases here: loading from a crawl or just loading the url. Check this:
    	        if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                    // put redirect url on the crawler queue to repeat a double-check
                    request.redirectURL(redirectionUrl);
    	            this.sb.crawlStacker.stackCrawl(request);
    	            // in the end we must throw an exception (even if this is not an error), just to abort the current process
                    throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
    	        }
                // if we are already doing a shutdown we don't need to retry crawling
                if (Thread.currentThread().isInterrupted()) {
                    this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
-                    throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
+                    throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
                }
                // retry crawling with new url