From 6f49ece22f910c94066ff5736d3dd75577c0cc37 Mon Sep 17 00:00:00 2001 From: luccioman Date: Thu, 20 Oct 2016 12:12:26 +0200 Subject: [PATCH] Fixed redirected URLs processing as crawl start point. See mantis 699 (http://mantis.tokeek.de/view.php?id=699) for details. --- .../yacy/crawler/retrieval/HTTPLoader.java | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 09a6c9c3d..3022bcad0 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -162,8 +162,20 @@ public final class HTTPLoader { if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { // put redirect url on the crawler queue to repeat a // double-check - request.redirectURL(redirectionUrl); - this.sb.crawlStacker.stackCrawl(request); + /* We have to clone the request instance rather than modify its URL directly, + * otherwise the stackCrawl() function would reject it, detecting it as already present in the activeWorkerEntries */ + Request redirectedRequest = new Request(request.initiator(), + redirectionUrl, + request.referrerhash(), + request.name(), + request.appdate(), + request.profileHandle(), + request.depth(), + request.timezoneOffset()); + String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); + if(rejectReason != null) { + throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason); + } // in the end we must throw an exception (even if this is // not an error, just to abort the current process throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " @@ -349,10 +361,24 @@ public final class HTTPLoader { // we have two use cases here: loading from a crawl or just loading the url. 
Check this: if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { // put redirect url on the crawler queue to repeat a double-check - request.redirectURL(redirectionUrl); - this.sb.crawlStacker.stackCrawl(request); + /* We have to clone the request instance rather than modify its URL directly, + * otherwise the stackCrawl() function would reject it, detecting it as already present in the activeWorkerEntries */ + Request redirectedRequest = new Request(request.initiator(), + redirectionUrl, + request.referrerhash(), + request.name(), + request.appdate(), + request.profileHandle(), + request.depth(), + request.timezoneOffset()); + String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); // in the end we must throw an exception (even if this is not an error, just to abort the current process + if(rejectReason != null) { + throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason); + } throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); + + } // if we are already doing a shutdown we don't need to retry crawling