From 9be36800a4dc0d2c816c47084e34146a6cef0177 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 20 Dec 2020 19:44:16 +0100 Subject: [PATCH] increased redirect depth by one this makes sense if one redirect replaces http with https and another replaces www subdomain by without (and vice versa) --- .../yacy/crawler/retrieval/HTTPLoader.java | 521 +++++++++--------- .../net/yacy/repository/LoaderDispatcher.java | 2 +- 2 files changed, 262 insertions(+), 261 deletions(-) diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 1ba2d793e..0703bc85e 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -85,8 +85,8 @@ public final class HTTPLoader { Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start); return doc; } - - /** + + /** * Open an input stream on a requested HTTP resource. When the resource content size is small * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}, fully load it and use a ByteArrayInputStream instance. * @param request @@ -98,228 +98,231 @@ public final class HTTPLoader { * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream. * @throws IOException when an error occurred */ - public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount, - final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) - throws IOException { - if (retryCount < 0) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); - throw new IOException( - "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$"); - } - DigestURL url = request.url(); - - final String host = url.getHost(); - if (host == null || host.length() < 2) { - throw new IOException("host is not well-formed: '" + host + "'"); - } - final String path = url.getFile(); - int port = url.getPort(); - final boolean ssl = url.getProtocol().equals("https"); - if (port < 0) - port = (ssl) ? 443 : 80; - - // check if url is in blacklist - final String hostlow = host.toLowerCase(Locale.ROOT); - if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, - "url in blacklist", -1); - throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. 
URL is in blacklist.$"); - } - - // resolve yacy and yacyh domains - final AlternativeDomainNames yacyResolver = this.sb.peers; - if (yacyResolver != null) { - final String yAddress = yacyResolver.resolve(host); - if (yAddress != null) { - url = new DigestURL(url.getProtocol() + "://" + yAddress + path); - } - } - - // create a request header - final RequestHeader requestHeader = createRequestheader(request, agent); - - // HTTP-Client - final HTTPClient client = new HTTPClient(agent); - client.setRedirecting(false); // we want to handle redirection - // ourselves, so we don't index pages - // twice - client.setTimout(this.socketTimeout); - client.setHeader(requestHeader.entrySet()); - - // send request - client.GET(url, false); - final StatusLine statusline = client.getHttpResponse().getStatusLine(); - final int statusCode = statusline.getStatusCode(); - final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); - String requestURLString = request.url().toNormalform(true); - - // check redirection - if (statusCode > 299 && statusCode < 310) { - client.finish(); - - final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, - responseHeader, requestURLString); - - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - // we have two use cases here: loading from a crawl or just - // loading the url. Check this: - if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { - // put redirect url on the crawler queue to repeat a - // double-check - /* We have to clone the request instance and not to modify directly its URL, - * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ + public StreamResponse openInputStream( + final Request request, CrawlProfile profile, final int retryCount, + final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent + ) throws IOException { + if (retryCount < 0) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); + throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$"); + } + DigestURL url = request.url(); + + final String host = url.getHost(); + if (host == null || host.length() < 2) { + throw new IOException("host is not well-formed: '" + host + "'"); + } + final String path = url.getFile(); + int port = url.getPort(); + final boolean ssl = url.getProtocol().equals("https"); + if (port < 0) + port = (ssl) ? 443 : 80; + + // check if url is in blacklist + final String hostlow = host.toLowerCase(Locale.ROOT); + if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, + "url in blacklist", -1); + throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. 
URL is in blacklist.$"); + } + + // resolve yacy and yacyh domains + final AlternativeDomainNames yacyResolver = this.sb.peers; + if (yacyResolver != null) { + final String yAddress = yacyResolver.resolve(host); + if (yAddress != null) { + url = new DigestURL(url.getProtocol() + "://" + yAddress + path); + } + } + + // create a request header + final RequestHeader requestHeader = createRequestheader(request, agent); + + // HTTP-Client + final HTTPClient client = new HTTPClient(agent); + client.setRedirecting(false); // we want to handle redirection + // ourselves, so we don't index pages + // twice + client.setTimout(this.socketTimeout); + client.setHeader(requestHeader.entrySet()); + + // send request + client.GET(url, false); + final StatusLine statusline = client.getHttpResponse().getStatusLine(); + final int statusCode = statusline.getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); + String requestURLString = request.url().toNormalform(true); + + // check redirection + if (statusCode > 299 && statusCode < 310) { + client.finish(); + + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString); + + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { + // we have two use cases here: loading from a crawl or just + // loading the url. Check this: + if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { + // put redirect url on the crawler queue to repeat a + // double-check + /* We have to clone the request instance and not to modify directly its URL, + * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ Request redirectedRequest = new Request(request.initiator(), - redirectionUrl, - request.referrerhash(), - request.name(), - request.appdate(), - request.profileHandle(), - request.depth(), - request.timezoneOffset()); - String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); - if(rejectReason != null) { + redirectionUrl, + request.referrerhash(), + request.name(), + request.appdate(), + request.profileHandle(), + request.depth(), + request.timezoneOffset()); + String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); + if(rejectReason != null) { throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. 
Reason : " + rejectReason); - } - // in the end we must throw an exception (even if this is - // not an error, just to abort the current process - throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " - + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); - } - - // if we are already doing a shutdown we don't need to retry - // crawling - if (Thread.currentThread().isInterrupted()) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); - throw new IOException( - "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); - } - - // retry crawling with new url - request.redirectURL(redirectionUrl); - return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); - } - // we don't want to follow redirects - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); - throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline - + "' for URL '" + requestURLString + "'$"); - } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) { - // the transfer is ok - - /* - * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local - */ - long contentLength = client.getHttpResponse().getEntity().getContentLength(); - InputStream contentStream; - if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) { - byte[] content = null; - try { - content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); - Cache.store(url, responseHeader, content); - } catch (final IOException e) { - this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); - } finally { - client.finish(); - } - - contentStream = new ByteArrayInputStream(content); - } else { - /* - * Content length may already be known now : check it before opening a stream - */ - if (maxFileSize >= 0 && contentLength > maxFileSize) { - throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes"); - } - /* - * Create a HTTPInputStream delegating to - * client.getContentstream(). Close method will ensure client is - * properly closed. - */ - contentStream = new HTTPInputStream(client); - /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */ - if(maxFileSize >= 0) { - contentStream = new StrictLimitInputStream(contentStream, maxFileSize, - "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize)); - } - } - - return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream); - } else { - client.finish(); - // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); - throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline - + "' for URL '" + requestURLString + "'$"); - } - } - - /** - * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null. 
- * @return redirect URL - * @throws IOException when an error occured - */ - private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, - final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString) - throws IOException { - // read redirection URL - String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); - redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim(); - - if (redirectionUrlString.isEmpty()) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, - "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode()); - throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline - + "' for URL '" + requestURLString + "'$"); - } - - // normalize URL - final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); - - // restart crawling with new url - this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL " - + requestURLString); - this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); - - this.sb.webStructure.generateCitationReference(url, redirectionUrl); - - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode()); - } - return redirectionUrl; - } - - /** - * Create request header for loading content. - * @param request search request - * @param agent agent identification information - * @return a request header - * @throws IOException when an error occured - */ - private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent) - throws IOException { - final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); - if (request.referrerhash() != null) { + } + // in the end we must throw an exception (even if this is + // not an error, just to abort the current process + throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); + } + + // if we are already doing a shutdown we don't need to retry + // crawling + if (Thread.currentThread().isInterrupted()) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); + throw new IOException( + "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); + } + + // check if the redirected URL is the same as the requested URL + // this shortcuts a time-out using retryCount + if (redirectionUrl.equals(url)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1); + throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". 
Processing aborted.$"); + } + + // retry crawling with new url + request.redirectURL(redirectionUrl); + return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); + } + // we don't want to follow redirects + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); + throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$"); + } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) { + // the transfer is ok + + /* + * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local + */ + long contentLength = client.getHttpResponse().getEntity().getContentLength(); + InputStream contentStream; + if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) { + byte[] content = null; + try { + content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); + Cache.store(url, responseHeader, content); + } catch (final IOException e) { + this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); + } finally { + client.finish(); + } + + contentStream = new ByteArrayInputStream(content); + } else { + /* + * Content length may already be known now : check it before opening a stream + */ + if (maxFileSize >= 0 && contentLength > maxFileSize) { + throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes"); + } + /* + * Create a HTTPInputStream delegating to + * client.getContentstream(). Close method will ensure client is + * properly closed. + */ + contentStream = new HTTPInputStream(client); + /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */ + if(maxFileSize >= 0) { + contentStream = new StrictLimitInputStream(contentStream, maxFileSize, + "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize)); + } + } + + return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream); + } else { + client.finish(); + // if the response has not the right response type then reject file + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline + + "' for URL '" + requestURLString + "'$"); + } + } + + /** + * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null. + * @return redirect URL + * @throws IOException when an error occured + */ + private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, + final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString) + throws IOException { + // read redirection URL + String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); + redirectionUrlString = redirectionUrlString == null ? 
"" : redirectionUrlString.trim(); + + if (redirectionUrlString.isEmpty()) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, + "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode()); + throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline + + "' for URL '" + requestURLString + "'$"); + } + + // normalize URL + final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); + + // restart crawling with new url + this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL " + + requestURLString); + this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); + + this.sb.webStructure.generateCitationReference(url, redirectionUrl); + + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode()); + } + return redirectionUrl; + } + + /** + * Create request header for loading content. + * @param request search request + * @param agent agent identification information + * @return a request header + * @throws IOException when an error occured + */ + private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent) + throws IOException { + final RequestHeader requestHeader = new RequestHeader(); + requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); + if (request.referrerhash() != null) { DigestURL refererURL = this.sb.getURL(request.referrerhash()); if (refererURL != null) { requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); } - } - - requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); - requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, - this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); - requestHeader.put(HeaderFramework.ACCEPT_CHARSET, - this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); - requestHeader.put(HeaderFramework.ACCEPT_ENCODING, - this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); - return requestHeader; - } + } + + requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); + requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, + this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); + requestHeader.put(HeaderFramework.ACCEPT_CHARSET, + this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); + requestHeader.put(HeaderFramework.ACCEPT_ENCODING, + this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); + return requestHeader; + } private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { @@ -347,10 +350,10 @@ public final class HTTPLoader { // resolve yacy and yacyh domains final AlternativeDomainNames yacyResolver = this.sb.peers; if(yacyResolver != null) { - final String yAddress = yacyResolver.resolve(host); - if(yAddress != null) { - url = new DigestURL(url.getProtocol() + "://" + yAddress + path); - } + final String yAddress = yacyResolver.resolve(host); + if(yAddress != null) { + url = new DigestURL(url.getProtocol() + "://" + yAddress + path); + } } // take a file from the net @@ 
-366,41 +369,39 @@ public final class HTTPLoader { client.setHeader(requestHeader.entrySet()); // send request - final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false); + final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false); final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); String requestURLString = request.url().toNormalform(true); // check redirection - if (statusCode > 299 && statusCode < 310) { + if (statusCode > 299 && statusCode < 310) { - final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(), - responseHeader, requestURLString); + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(), + responseHeader, requestURLString); - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - // we have two use cases here: loading from a crawl or just loading the url. Check this: - if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { + // we have two use cases here: loading from a crawl or just loading the url. Check this: + if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { // put redirect url on the crawler queue to repeat a double-check - /* We have to clone the request instance and not to modify directly its URL, - * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ + /* We have to clone the request instance and not to modify directly its URL, + * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ Request redirectedRequest = new Request(request.initiator(), - redirectionUrl, - request.referrerhash(), - request.name(), - request.appdate(), - request.profileHandle(), - request.depth(), - request.timezoneOffset()); - String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); - // in the end we must throw an exception (even if this is not an error, just to abort the current process - if(rejectReason != null) { + redirectionUrl, + request.referrerhash(), + request.name(), + request.appdate(), + request.profileHandle(), + request.depth(), + request.timezoneOffset()); + String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); + // in the end we must throw an exception (even if this is not an error, just to abort the current process + if(rejectReason != null) { throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. 
Reason : " + rejectReason); - } + } throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); - + } - } - // if we are already doing a shutdown we don't need to retry crawling if (Thread.currentThread().isInterrupted()) { this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); @@ -410,15 +411,15 @@ public final class HTTPLoader { // retry crawling with new url request.redirectURL(redirectionUrl); return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); - } + } // we don't want to follow redirects this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); } else if (responseBody == null) { - // no response, reject file + // no response, reject file this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode); throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); - } else if (statusCode == 200 || statusCode == 203) { + } else if (statusCode == 200 || statusCode == 203) { // the transfer is ok // we write the new cache entry to file system directly @@ -427,8 +428,8 @@ public final class HTTPLoader { // check length again in case it was not possible to get the length before loading if (maxFileSize >= 0 && contentLength > maxFileSize) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); - throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$"); + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); + throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. 
(GET)$"); } // create a new cache entry @@ -442,9 +443,9 @@ public final class HTTPLoader { ); return response; - } else { + } else { // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); } } @@ -485,17 +486,17 @@ public final class HTTPLoader { final HTTPClient client = new HTTPClient(agent); client.setTimout(20000); client.setHeader(requestHeader.entrySet()); - final byte[] responseBody = client.GETbytes(request.url(), null, null, false); + final byte[] responseBody = client.GETbytes(request.url(), null, null, false); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); + final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true - if (responseBody != null && (code == 200 || code == 203)) { + if (responseBody != null && (code == 200 || code == 203)) { // the transfer is ok - //statistics: - ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length); + //statistics: + ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length); // we write the new cache entry to file system directly @@ -513,7 +514,7 @@ public final class HTTPLoader { } else if (code > 299 && code < 310) { if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL - String redirectionUrlString = header.get(HeaderFramework.LOCATION); + String redirectionUrlString = header.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); if (redirectionUrlString.isEmpty()) { @@ -535,7 +536,7 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file - throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } return response; } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 03feb047d..cccec50c3 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -399,7 +399,7 @@ public final class LoaderDispatcher { // load resource from the internet StreamResponse response; if (protocol.equals("http") || protocol.equals("https")) { - response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent); + response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent); } else if (protocol.equals("ftp")) { response = this.ftpLoader.openInputStream(request, true); } else if (protocol.equals("smb")) {