increased redirect depth by one

This makes sense if one redirect replaces http with https and another replaces the www subdomain with the bare host name (or vice versa): following such a chain takes two redirects, so a depth of one is not enough.
pull/402/head
Michael Peter Christen 4 years ago
parent d0abb0cedb
commit 9be36800a4
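
As an illustration of why a redirect depth of two matters (a hypothetical sketch, not part of the commit: the host names and the redirectTargetOf() helper are invented), the recursion below mirrors the retryCount bookkeeping of HTTPLoader.openInputStream(), which follows one redirect per recursive call and aborts once the counter drops below zero:

    // Hypothetical demo of the redirect chain named in the commit message:
    //   http://www.example.com/  -> 301 -> https://www.example.com/  (http -> https)
    //   https://www.example.com/ -> 301 -> https://example.com/      (drop www)
    public final class RedirectDepthDemo {

        // Invented stand-in for reading the Location header of a 3xx response;
        // returns null when the URL answers 200 OK without a redirect.
        static String redirectTargetOf(String url) {
            if (url.equals("http://www.example.com/")) return "https://www.example.com/";
            if (url.equals("https://www.example.com/")) return "https://example.com/";
            return null;
        }

        // Mirrors the retryCount handling of openInputStream(): one redirect
        // per recursion, abort once the counter drops below zero.
        static String follow(String url, int retryCount) throws Exception {
            if (retryCount < 0) throw new Exception("retry counter exceeded for URL " + url);
            String target = redirectTargetOf(url);
            if (target == null) return url;
            return follow(target, retryCount - 1);
        }

        public static void main(String[] args) throws Exception {
            System.out.println(follow("http://www.example.com/", 2)); // https://example.com/
            // follow("http://www.example.com/", 1) would throw: the chain needs two hops.
        }
    }

With a start value of 1 the second hop drives the counter below zero, so only one of the two rewrites could be followed; the start value of 2 introduced below lets both complete.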

@@ -86,7 +86,7 @@ public final class HTTPLoader {
        return doc;
    }

    /**
     * Open an input stream on a requested HTTP resource. When the resource content size is small
     * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}), fully load it and use a ByteArrayInputStream instance.
     * @param request
@@ -98,228 +98,231 @@ public final class HTTPLoader {
     * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
     * @throws IOException when an error occurred
     */
-    public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
-            final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
-            throws IOException {
+    public StreamResponse openInputStream(
+            final Request request, CrawlProfile profile, final int retryCount,
+            final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent
+    ) throws IOException {
        if (retryCount < 0) {
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
            throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
        }
        DigestURL url = request.url();
        final String host = url.getHost();
        if (host == null || host.length() < 2) {
            throw new IOException("host is not well-formed: '" + host + "'");
        }
        final String path = url.getFile();
        int port = url.getPort();
        final boolean ssl = url.getProtocol().equals("https");
        if (port < 0)
            port = (ssl) ? 443 : 80;

        // check if url is in blacklist
        final String hostlow = host.toLowerCase(Locale.ROOT);
        if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
        }

        // resolve yacy and yacyh domains
        final AlternativeDomainNames yacyResolver = this.sb.peers;
        if (yacyResolver != null) {
            final String yAddress = yacyResolver.resolve(host);
            if (yAddress != null) {
                url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
            }
        }

        // create a request header
        final RequestHeader requestHeader = createRequestheader(request, agent);

        // HTTP-Client
        final HTTPClient client = new HTTPClient(agent);
        client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
        client.setTimout(this.socketTimeout);
        client.setHeader(requestHeader.entrySet());

        // send request
        client.GET(url, false);
        final StatusLine statusline = client.getHttpResponse().getStatusLine();
        final int statusCode = statusline.getStatusCode();
        final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
        String requestURLString = request.url().toNormalform(true);

        // check redirection
        if (statusCode > 299 && statusCode < 310) {
            client.finish();

            final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString);

            if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                // we have two use cases here: loading from a crawl or just loading the url. Check this:
                if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                    // put redirect url on the crawler queue to repeat a double-check
                    /* We have to clone the request instance instead of modifying its URL directly,
                     * otherwise the stackCrawl() function would reject it, detecting it as already present in the activeWorkerEntries */
                    Request redirectedRequest = new Request(request.initiator(),
                            redirectionUrl,
                            request.referrerhash(),
                            request.name(),
                            request.appdate(),
                            request.profileHandle(),
                            request.depth(),
                            request.timezoneOffset());
                    String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
                    if (rejectReason != null) {
                        throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
                    }
                    // in the end we must throw an exception (even if this is not an error), just to abort the current process
                    throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
                            + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
                }

                // if we are already doing a shutdown we don't need to retry crawling
                if (Thread.currentThread().isInterrupted()) {
                    this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                            FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
                    throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
                }

+                // check if the redirected URL is the same as the requested URL
+                // this shortcuts a time-out using retryCount
+                if (redirectionUrl.equals(url)) {
+                    this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1);
+                    throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
+                }
+
                // retry crawling with new url
                request.redirectURL(redirectionUrl);
                return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
            }
            // we don't want to follow redirects
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
            throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
        } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
            // the transfer is ok

            /*
             * When the content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), it is better to cache it if the cache is enabled and the url is not local
             */
            long contentLength = client.getHttpResponse().getEntity().getContentLength();
            InputStream contentStream;
            if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
                byte[] content = null;
                try {
                    content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize);
                    Cache.store(url, responseHeader, content);
                } catch (final IOException e) {
                    this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e);
                } finally {
                    client.finish();
                }

                contentStream = new ByteArrayInputStream(content);
            } else {
                /*
                 * The content length may already be known now: check it before opening a stream
                 */
                if (maxFileSize >= 0 && contentLength > maxFileSize) {
                    throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes");
                }
                /*
                 * Create an HTTPInputStream delegating to client.getContentstream(). Its close method will ensure the client is properly closed.
                 */
                contentStream = new HTTPInputStream(client);
                /* The anticipated content length may not be known yet, or may be incorrect: apply now the same content size restriction as when loading into a byte array */
                if (maxFileSize >= 0) {
                    contentStream = new StrictLimitInputStream(contentStream, maxFileSize,
                            "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize));
                }
            }

            return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
        } else {
            client.finish();
            // if the response does not have the right response type then reject the file
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                    FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
            throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline + "' for URL '" + requestURLString + "'$");
        }
    }

    /**
     * Extract the redirect URL from a response header. The status code is supposed to be between 299 and 310. Parameters must not be null.
     * @return redirect URL
     * @throws IOException when an error occurred
     */
    private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url,
            final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString)
            throws IOException {
        // read redirection URL
        String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
        redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();

        if (redirectionUrlString.isEmpty()) {
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                    FailCategory.TEMPORARY_NETWORK_FAILURE,
                    "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode());
            throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline
                    + "' for URL '" + requestURLString + "'$");
        }

        // normalize URL
        final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);

        // restart crawling with new url
        this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL " + requestURLString);
        this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));

        this.sb.webStructure.generateCitationReference(url, redirectionUrl);

        if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                    FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode());
        }
        return redirectionUrl;
    }

    /**
     * Create a request header for loading content.
     * @param request search request
     * @param agent agent identification information
     * @return a request header
     * @throws IOException when an error occurred
     */
    private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent)
            throws IOException {
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
        if (request.referrerhash() != null) {
            DigestURL refererURL = this.sb.getURL(request.referrerhash());
            if (refererURL != null) {
                requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
            }
        }
        requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
        requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE,
                this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
        requestHeader.put(HeaderFramework.ACCEPT_CHARSET,
                this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
        requestHeader.put(HeaderFramework.ACCEPT_ENCODING,
                this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
        return requestHeader;
    }

    private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@@ -347,10 +350,10 @@ public final class HTTPLoader {
        // resolve yacy and yacyh domains
        final AlternativeDomainNames yacyResolver = this.sb.peers;
        if (yacyResolver != null) {
            final String yAddress = yacyResolver.resolve(host);
            if (yAddress != null) {
                url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
            }
        }

        // take a file from the net
@@ -366,40 +369,38 @@ public final class HTTPLoader {
        client.setHeader(requestHeader.entrySet());

        // send request
        final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false);
        final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
        final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
        String requestURLString = request.url().toNormalform(true);

        // check redirection
        if (statusCode > 299 && statusCode < 310) {
            final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(),
                    responseHeader, requestURLString);

            if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                // we have two use cases here: loading from a crawl or just loading the url. Check this:
                if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                    // put redirect url on the crawler queue to repeat a double-check
                    /* We have to clone the request instance instead of modifying its URL directly,
                     * otherwise the stackCrawl() function would reject it, detecting it as already present in the activeWorkerEntries */
                    Request redirectedRequest = new Request(request.initiator(),
                            redirectionUrl,
                            request.referrerhash(),
                            request.name(),
                            request.appdate(),
                            request.profileHandle(),
                            request.depth(),
                            request.timezoneOffset());
                    String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
                    // in the end we must throw an exception (even if this is not an error), just to abort the current process
                    if (rejectReason != null) {
                        throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
                    }
                    throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
                }

                // if we are already doing a shutdown we don't need to retry crawling
                if (Thread.currentThread().isInterrupted()) {
@@ -410,15 +411,15 @@ public final class HTTPLoader {
                // retry crawling with new url
                request.redirectURL(redirectionUrl);
                return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
            }
            // we don't want to follow redirects
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
            throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
        } else if (responseBody == null) {
            // no response, reject file
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
            throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
        } else if (statusCode == 200 || statusCode == 203) {
            // the transfer is ok
            // we write the new cache entry to file system directly
@@ -427,8 +428,8 @@ public final class HTTPLoader {
            // check length again in case it was not possible to get the length before loading
            if (maxFileSize >= 0 && contentLength > maxFileSize) {
                this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
                throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
            }

            // create a new cache entry
@@ -442,9 +443,9 @@ public final class HTTPLoader {
            );
            return response;
        } else {
            // if the response does not have the right response type then reject the file
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
            throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
        }
    }
@@ -485,17 +486,17 @@ public final class HTTPLoader {
        final HTTPClient client = new HTTPClient(agent);
        client.setTimout(20000);
        client.setHeader(requestHeader.entrySet());
        final byte[] responseBody = client.GETbytes(request.url(), null, null, false);
        final int code = client.getHttpResponse().getStatusLine().getStatusCode();
        final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
        // FIXME: 30*-handling (bottom) is never reached
        // we always get the final content because httpClient.followRedirects = true
        if (responseBody != null && (code == 200 || code == 203)) {
            // the transfer is ok
            // statistics:
            ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);
            // we write the new cache entry to file system directly
@@ -513,7 +514,7 @@ public final class HTTPLoader {
        } else if (code > 299 && code < 310) {
            if (header.containsKey(HeaderFramework.LOCATION)) {
                // getting redirection URL
                String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                redirectionUrlString = redirectionUrlString.trim();

                if (redirectionUrlString.isEmpty()) {
@@ -535,7 +536,7 @@ public final class HTTPLoader {
            }
        } else {
            // if the response does not have the right response type then reject the file
            throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
        }
        return response;
    }

@@ -399,7 +399,7 @@ public final class LoaderDispatcher {
        // load resource from the internet
        StreamResponse response;
        if (protocol.equals("http") || protocol.equals("https")) {
-            response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
+            response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent);
        } else if (protocol.equals("ftp")) {
            response = this.ftpLoader.openInputStream(request, true);
        } else if (protocol.equals("smb")) {
