From 6f49ece22f910c94066ff5736d3dd75577c0cc37 Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Thu, 20 Oct 2016 12:12:26 +0200
Subject: [PATCH] Fixed redirected URLs processing as crawl start point.

See mantis 699 (http://mantis.tokeek.de/view.php?id=699) for details.
---
 .../yacy/crawler/retrieval/HTTPLoader.java    | 34 ++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index 09a6c9c3d..3022bcad0 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -162,8 +162,20 @@ public final class HTTPLoader {
 				if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
 					// put redirect url on the crawler queue to repeat a
 					// double-check
-					request.redirectURL(redirectionUrl);
-					this.sb.crawlStacker.stackCrawl(request);
+    	        	/* We have to clone the request instance rather than modify its URL directly,
+    	        	 * otherwise stackCrawl() would reject it, because it would be detected as already present in the activeWorkerEntries */
+                    Request redirectedRequest = new Request(request.initiator(),
+                    		redirectionUrl,
+                    		request.referrerhash(),
+                    		request.name(),
+                    		request.appdate(),
+                    		request.profileHandle(),
+                    		request.depth(),
+                    		request.timezoneOffset());
+    	            String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
+    	            if(rejectReason != null) {
+                        throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
+    	            }
 					// in the end we must throw an exception (even if this is
 					// not an error, just to abort the current process
 					throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
@@ -349,10 +361,24 @@ public final class HTTPLoader {
     	        // we have two use cases here: loading from a crawl or just loading the url. Check this:
     	        if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                     // put redirect url on the crawler queue to repeat a double-check
-                    request.redirectURL(redirectionUrl);
-    	            this.sb.crawlStacker.stackCrawl(request);
+    	        	/* We have to clone the request instance rather than modify its URL directly,
+    	        	 * otherwise stackCrawl() would reject it, because it would be detected as already present in the activeWorkerEntries */
+                    Request redirectedRequest = new Request(request.initiator(),
+                    		redirectionUrl,
+                    		request.referrerhash(),
+                    		request.name(),
+                    		request.appdate(),
+                    		request.profileHandle(),
+                    		request.depth(),
+                    		request.timezoneOffset());
+    	            String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
     	            // in the end we must throw an exception (even if this is not an error, just to abort the current process
+    	            if(rejectReason != null) {
+                        throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
+    	            }
                     throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
+    	            
+
     	        }
     	        
                 // if we are already doing a shutdown we don't need to retry crawling