increased redirect depth by one

this makes sense if one redirect replaces http with https and another
replaces the www subdomain with the bare host name (and vice versa)
pull/402/head
Michael Peter Christen 4 years ago
parent d0abb0cedb
commit 9be36800a4
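
For illustration only, a minimal sketch of why a redirect depth of two is needed for the chain described in the commit message (http switched to https, then the www subdomain dropped). This is not the YaCy code path; follow(), the redirect map and the example URLs are hypothetical, but the budget handling mirrors openInputStream's logic: retryCount < 0 aborts, and each followed redirect recurses with retryCount - 1.

import java.io.IOException;
import java.util.Map;

public class RedirectDepthSketch {

    // Simplified stand-in for a loader that follows redirects recursively,
    // spending one unit of the retry budget per followed redirect.
    static String follow(String url, int retryCount, Map<String, String> redirects) throws IOException {
        if (retryCount < 0) {
            throw new IOException("retry counter exceeded for URL " + url);
        }
        String target = redirects.get(url);
        if (target == null) {
            return url; // no further redirect: this is the final location
        }
        return follow(target, retryCount - 1, redirects); // one hop consumes one retry
    }

    public static void main(String[] args) throws IOException {
        // Hypothetical two-hop chain: http -> https, then www -> bare host.
        Map<String, String> redirects = Map.of(
                "http://www.example.com/", "https://www.example.com/",
                "https://www.example.com/", "https://example.com/");

        // A budget of 2 survives both hops and prints https://example.com/ ...
        System.out.println(follow("http://www.example.com/", 2, redirects));
        // ... while a budget of 1 would abort before fetching the final URL (counter reaches -1).
    }
}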

@@ -98,14 +98,13 @@ public final class HTTPLoader {
      * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
      * @throws IOException when an error occurred
      */
-    public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
-            final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
-            throws IOException {
+    public StreamResponse openInputStream(
+            final Request request, CrawlProfile profile, final int retryCount,
+            final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent
+            ) throws IOException {
         if (retryCount < 0) {
-            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
-                    FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
-            throw new IOException(
-                    "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
+            throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
         }
         DigestURL url = request.url();
@@ -158,8 +157,7 @@ public final class HTTPLoader {
         if (statusCode > 299 && statusCode < 310) {
             client.finish();
-            final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline,
-                    responseHeader, requestURLString);
+            final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString);
             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                 // we have two use cases here: loading from a crawl or just
@@ -196,15 +194,20 @@ public final class HTTPLoader {
                             "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
                 }
+                // check if the redirected URL is the same as the requested URL
+                // this shortcuts a time-out using retryCount
+                if (redirectionUrl.equals(url)) {
+                    this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1);
+                    throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
+                }
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
                 return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
             }
             // we don't want to follow redirects
-            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
-                    FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
-            throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline
-                    + "' for URL '" + requestURLString + "'$");
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
+            throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
         } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
             // the transfer is ok
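
The equality check added in the hunk above fails fast when a server redirects a URL to itself; without it, such a self-redirect would only stop once the retry counter runs out. A hedged sketch of that shortcut, extending the hypothetical follow() helper from the sketch near the top (same assumptions, not actual YaCy code):

    // Same recursion as follow(), plus the fail-fast check: a redirect that points
    // back to the requested URL is reported immediately instead of recursing until
    // retryCount drops below zero.
    static String followWithShortcut(String url, int retryCount, Map<String, String> redirects) throws IOException {
        if (retryCount < 0) {
            throw new IOException("retry counter exceeded for URL " + url);
        }
        String target = redirects.get(url);
        if (target == null) {
            return url;
        }
        if (target.equals(url)) {
            throw new IOException("redirect to same url: " + url); // fail fast, keep the budget for real hops
        }
        return followWithShortcut(target, retryCount - 1, redirects);
    }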
@@ -397,8 +400,6 @@ public final class HTTPLoader {
                 throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
             }
-            throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
         }
         // if we are already doing a shutdown we don't need to retry crawling

@@ -399,7 +399,7 @@ public final class LoaderDispatcher {
             // load resource from the internet
             StreamResponse response;
             if (protocol.equals("http") || protocol.equals("https")) {
-                response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
+                response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent);
             } else if (protocol.equals("ftp")) {
                 response = this.ftpLoader.openInputStream(request, true);
             } else if (protocol.equals("smb")) {
