increased redirect depth by one

this makes sense if one redirect replaces http with https and another
replaces the www subdomain with the bare host name (and vice versa)
pull/402/head
Michael Peter Christen 4 years ago
parent d0abb0cedb
commit 9be36800a4
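
For illustration only, a minimal sketch of why a redirect depth of two is needed for the chain described in the commit message (http switched to https, then the www subdomain dropped). This is not the YaCy code path; follow(), the redirect map and the example URLs are hypothetical, but the budget handling mirrors openInputStream's logic: retryCount < 0 aborts, and each followed redirect recurses with retryCount - 1.

import java.io.IOException;
import java.util.Map;

public class RedirectDepthSketch {

    // Simplified stand-in for a loader that follows redirects recursively,
    // spending one unit of the retry budget per followed redirect.
    static String follow(String url, int retryCount, Map<String, String> redirects) throws IOException {
        if (retryCount < 0) {
            throw new IOException("retry counter exceeded for URL " + url);
        }
        String target = redirects.get(url);
        if (target == null) {
            return url; // no further redirect: this is the final location
        }
        return follow(target, retryCount - 1, redirects); // one hop consumes one retry
    }

    public static void main(String[] args) throws IOException {
        // Hypothetical two-hop chain: http -> https, then www -> bare host.
        Map<String, String> redirects = Map.of(
                "http://www.example.com/", "https://www.example.com/",
                "https://www.example.com/", "https://example.com/");

        // A budget of 2 survives both hops and prints https://example.com/ ...
        System.out.println(follow("http://www.example.com/", 2, redirects));
        // ... while a budget of 1 would abort before fetching the final URL (counter reaches -1).
    }
}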

@@ -98,14 +98,13 @@ public final class HTTPLoader {
      * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
      * @throws IOException when an error occurred
      */
-    public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
-            final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
-            throws IOException {
+    public StreamResponse openInputStream(
+            final Request request, CrawlProfile profile, final int retryCount,
+            final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent
+            ) throws IOException {
         if (retryCount < 0) {
-            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
-                    FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
-            throw new IOException(
-                    "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
+            throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
         }
         DigestURL url = request.url();
@@ -158,8 +157,7 @@ public final class HTTPLoader {
         if (statusCode > 299 && statusCode < 310) {
             client.finish();
-            final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline,
-                    responseHeader, requestURLString);
+            final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString);
             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                 // we have two use cases here: loading from a crawl or just
@@ -196,15 +194,20 @@ public final class HTTPLoader {
                             "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
                 }
+                // check if the redirected URL is the same as the requested URL
+                // this shortcuts a time-out using retryCount
+                if (redirectionUrl.equals(url)) {
+                    this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1);
+                    throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
+                }
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
                 return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
             }
             // we don't want to follow redirects
-            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
-                    FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
-            throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline
-                    + "' for URL '" + requestURLString + "'$");
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
+            throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
         } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
             // the transfer is ok
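
The equality check added in the hunk above fails fast when a server redirects a URL to itself; without it, such a self-redirect would only stop once the retry counter runs out. A hedged sketch of that shortcut, extending the hypothetical follow() helper from the sketch near the top (same assumptions, not actual YaCy code):

    // Same recursion as follow(), plus the fail-fast check: a redirect that points
    // back to the requested URL is reported immediately instead of recursing until
    // retryCount drops below zero.
    static String followWithShortcut(String url, int retryCount, Map<String, String> redirects) throws IOException {
        if (retryCount < 0) {
            throw new IOException("retry counter exceeded for URL " + url);
        }
        String target = redirects.get(url);
        if (target == null) {
            return url;
        }
        if (target.equals(url)) {
            throw new IOException("redirect to same url: " + url); // fail fast, keep the budget for real hops
        }
        return followWithShortcut(target, retryCount - 1, redirects);
    }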
@@ -397,8 +400,6 @@ public final class HTTPLoader {
                 throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
             }
-            throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
         }
         // if we are already doing a shutdown we don't need to retry crawling

@@ -399,7 +399,7 @@ public final class LoaderDispatcher {
             // load resource from the internet
             StreamResponse response;
             if (protocol.equals("http") || protocol.equals("https")) {
-                response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
+                response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent);
             } else if (protocol.equals("ftp")) {
                 response = this.ftpLoader.openInputStream(request, true);
             } else if (protocol.equals("smb")) {
