increased redirect depth by one

This makes sense when one redirect switches the protocol from http to https and
a second redirect adds or removes the www subdomain.
pull/402/head
Michael Peter Christen 4 years ago
parent d0abb0cedb
commit 9be36800a4

@ -85,8 +85,8 @@ public final class HTTPLoader {
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start); Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
return doc; return doc;
} }
/** /**
* Open an input stream on a requested HTTP resource. When the resource content size is small * Open an input stream on a requested HTTP resource. When the resource content size is small
* (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}, fully load it and use a ByteArrayInputStream instance. * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}, fully load it and use a ByteArrayInputStream instance.
* @param request * @param request
@ -98,228 +98,231 @@ public final class HTTPLoader {
* @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream. * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
* @throws IOException when an error occurred * @throws IOException when an error occurred
*/ */
public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount, public StreamResponse openInputStream(
final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) final Request request, CrawlProfile profile, final int retryCount,
throws IOException { final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent
if (retryCount < 0) { ) throws IOException {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, if (retryCount < 0) {
FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException( throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
"retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$"); }
} DigestURL url = request.url();
DigestURL url = request.url();
final String host = url.getHost();
final String host = url.getHost(); if (host == null || host.length() < 2) {
if (host == null || host.length() < 2) { throw new IOException("host is not well-formed: '" + host + "'");
throw new IOException("host is not well-formed: '" + host + "'"); }
} final String path = url.getFile();
final String path = url.getFile(); int port = url.getPort();
int port = url.getPort(); final boolean ssl = url.getProtocol().equals("https");
final boolean ssl = url.getProtocol().equals("https"); if (port < 0)
if (port < 0) port = (ssl) ? 443 : 80;
port = (ssl) ? 443 : 80;
// check if url is in blacklist
// check if url is in blacklist final String hostlow = host.toLowerCase(Locale.ROOT);
final String hostlow = host.toLowerCase(Locale.ROOT); if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
"url in blacklist", -1); throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$"); }
}
// resolve yacy and yacyh domains
// resolve yacy and yacyh domains final AlternativeDomainNames yacyResolver = this.sb.peers;
final AlternativeDomainNames yacyResolver = this.sb.peers; if (yacyResolver != null) {
if (yacyResolver != null) { final String yAddress = yacyResolver.resolve(host);
final String yAddress = yacyResolver.resolve(host); if (yAddress != null) {
if (yAddress != null) { url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
url = new DigestURL(url.getProtocol() + "://" + yAddress + path); }
} }
}
// create a request header
// create a request header final RequestHeader requestHeader = createRequestheader(request, agent);
final RequestHeader requestHeader = createRequestheader(request, agent);
// HTTP-Client
// HTTP-Client final HTTPClient client = new HTTPClient(agent);
final HTTPClient client = new HTTPClient(agent); client.setRedirecting(false); // we want to handle redirection
client.setRedirecting(false); // we want to handle redirection // ourselves, so we don't index pages
// ourselves, so we don't index pages // twice
// twice client.setTimout(this.socketTimeout);
client.setTimout(this.socketTimeout); client.setHeader(requestHeader.entrySet());
client.setHeader(requestHeader.entrySet());
// send request
// send request client.GET(url, false);
client.GET(url, false); final StatusLine statusline = client.getHttpResponse().getStatusLine();
final StatusLine statusline = client.getHttpResponse().getStatusLine(); final int statusCode = statusline.getStatusCode();
final int statusCode = statusline.getStatusCode(); final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); String requestURLString = request.url().toNormalform(true);
String requestURLString = request.url().toNormalform(true);
// check redirection
// check redirection if (statusCode > 299 && statusCode < 310) {
if (statusCode > 299 && statusCode < 310) { client.finish();
client.finish();
final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString);
final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline,
responseHeader, requestURLString); if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// we have two use cases here: loading from a crawl or just
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { // loading the url. Check this:
// we have two use cases here: loading from a crawl or just if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
// loading the url. Check this: // put redirect url on the crawler queue to repeat a
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { // double-check
// put redirect url on the crawler queue to repeat a /* We have to clone the request instance and not to modify directly its URL,
// double-check * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
/* We have to clone the request instance and not to modify directly its URL,
* otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
Request redirectedRequest = new Request(request.initiator(), Request redirectedRequest = new Request(request.initiator(),
redirectionUrl, redirectionUrl,
request.referrerhash(), request.referrerhash(),
request.name(), request.name(),
request.appdate(), request.appdate(),
request.profileHandle(), request.profileHandle(),
request.depth(), request.depth(),
request.timezoneOffset()); request.timezoneOffset());
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
if(rejectReason != null) { if(rejectReason != null) {
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason); throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
} }
// in the end we must throw an exception (even if this is // in the end we must throw an exception (even if this is
// not an error, just to abort the current process // not an error, just to abort the current process
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
+ redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
} }
// if we are already doing a shutdown we don't need to retry // if we are already doing a shutdown we don't need to retry
// crawling // crawling
if (Thread.currentThread().isInterrupted()) { if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException( throw new IOException(
"CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
} }
// retry crawling with new url // check if the redirected URL is the same as the requested URL
request.redirectURL(redirectionUrl); // this shortcuts a time-out using retryCount
return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); if (redirectionUrl.equals(url)) {
} this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1);
// we don't want to follow redirects throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, }
FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline // retry crawling with new url
+ "' for URL '" + requestURLString + "'$"); request.redirectURL(redirectionUrl);
} else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) { return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
// the transfer is ok }
// we don't want to follow redirects
/* this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
* When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
*/ } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
long contentLength = client.getHttpResponse().getEntity().getContentLength(); // the transfer is ok
InputStream contentStream;
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) { /*
byte[] content = null; * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local
try { */
content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); long contentLength = client.getHttpResponse().getEntity().getContentLength();
Cache.store(url, responseHeader, content); InputStream contentStream;
} catch (final IOException e) { if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); byte[] content = null;
} finally { try {
client.finish(); content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize);
} Cache.store(url, responseHeader, content);
} catch (final IOException e) {
contentStream = new ByteArrayInputStream(content); this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e);
} else { } finally {
/* client.finish();
* Content length may already be known now : check it before opening a stream }
*/
if (maxFileSize >= 0 && contentLength > maxFileSize) { contentStream = new ByteArrayInputStream(content);
throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes"); } else {
} /*
/* * Content length may already be known now : check it before opening a stream
* Create a HTTPInputStream delegating to */
* client.getContentstream(). Close method will ensure client is if (maxFileSize >= 0 && contentLength > maxFileSize) {
* properly closed. throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes");
*/ }
contentStream = new HTTPInputStream(client); /*
/* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */ * Create a HTTPInputStream delegating to
if(maxFileSize >= 0) { * client.getContentstream(). Close method will ensure client is
contentStream = new StrictLimitInputStream(contentStream, maxFileSize, * properly closed.
"Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize)); */
} contentStream = new HTTPInputStream(client);
} /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */
if(maxFileSize >= 0) {
return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream); contentStream = new StrictLimitInputStream(contentStream, maxFileSize,
} else { "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize));
client.finish(); }
// if the response has not the right response type then reject file }
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline } else {
+ "' for URL '" + requestURLString + "'$"); client.finish();
} // if the response has not the right response type then reject file
} this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
/** throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline
* Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null. + "' for URL '" + requestURLString + "'$");
* @return redirect URL }
* @throws IOException when an error occured }
*/
private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, /**
final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString) * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null.
throws IOException { * @return redirect URL
// read redirection URL * @throws IOException when an error occured
String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); */
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim(); private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url,
final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString)
if (redirectionUrlString.isEmpty()) { throws IOException {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, // read redirection URL
FailCategory.TEMPORARY_NETWORK_FAILURE, String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
"no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode()); redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline
+ "' for URL '" + requestURLString + "'$"); if (redirectionUrlString.isEmpty()) {
} this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.TEMPORARY_NETWORK_FAILURE,
// normalize URL "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode());
final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline
+ "' for URL '" + requestURLString + "'$");
// restart crawling with new url }
this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL "
+ requestURLString); // normalize URL
this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);
this.sb.webStructure.generateCitationReference(url, redirectionUrl); // restart crawling with new url
this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL "
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { + requestURLString);
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));
FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode());
} this.sb.webStructure.generateCitationReference(url, redirectionUrl);
return redirectionUrl;
} if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
/** FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode());
* Create request header for loading content. }
* @param request search request return redirectionUrl;
* @param agent agent identification information }
* @return a request header
* @throws IOException when an error occured /**
*/ * Create request header for loading content.
private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent) * @param request search request
throws IOException { * @param agent agent identification information
final RequestHeader requestHeader = new RequestHeader(); * @return a request header
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); * @throws IOException when an error occured
if (request.referrerhash() != null) { */
private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent)
throws IOException {
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
if (request.referrerhash() != null) {
DigestURL refererURL = this.sb.getURL(request.referrerhash()); DigestURL refererURL = this.sb.getURL(request.referrerhash());
if (refererURL != null) { if (refererURL != null) {
requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
} }
} }
requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE,
this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
requestHeader.put(HeaderFramework.ACCEPT_CHARSET, requestHeader.put(HeaderFramework.ACCEPT_CHARSET,
this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, requestHeader.put(HeaderFramework.ACCEPT_ENCODING,
this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
return requestHeader; return requestHeader;
} }
private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@ -347,10 +350,10 @@ public final class HTTPLoader {
// resolve yacy and yacyh domains // resolve yacy and yacyh domains
final AlternativeDomainNames yacyResolver = this.sb.peers; final AlternativeDomainNames yacyResolver = this.sb.peers;
if(yacyResolver != null) { if(yacyResolver != null) {
final String yAddress = yacyResolver.resolve(host); final String yAddress = yacyResolver.resolve(host);
if(yAddress != null) { if(yAddress != null) {
url = new DigestURL(url.getProtocol() + "://" + yAddress + path); url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
} }
} }
// take a file from the net // take a file from the net
@ -366,41 +369,39 @@ public final class HTTPLoader {
client.setHeader(requestHeader.entrySet()); client.setHeader(requestHeader.entrySet());
// send request // send request
final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false); final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false);
final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
String requestURLString = request.url().toNormalform(true); String requestURLString = request.url().toNormalform(true);
// check redirection // check redirection
if (statusCode > 299 && statusCode < 310) { if (statusCode > 299 && statusCode < 310) {
final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(), final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(),
responseHeader, requestURLString); responseHeader, requestURLString);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// we have two use cases here: loading from a crawl or just loading the url. Check this: // we have two use cases here: loading from a crawl or just loading the url. Check this:
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
// put redirect url on the crawler queue to repeat a double-check // put redirect url on the crawler queue to repeat a double-check
/* We have to clone the request instance and not to modify directly its URL, /* We have to clone the request instance and not to modify directly its URL,
* otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
Request redirectedRequest = new Request(request.initiator(), Request redirectedRequest = new Request(request.initiator(),
redirectionUrl, redirectionUrl,
request.referrerhash(), request.referrerhash(),
request.name(), request.name(),
request.appdate(), request.appdate(),
request.profileHandle(), request.profileHandle(),
request.depth(), request.depth(),
request.timezoneOffset()); request.timezoneOffset());
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
// in the end we must throw an exception (even if this is not an error, just to abort the current process // in the end we must throw an exception (even if this is not an error, just to abort the current process
if(rejectReason != null) { if(rejectReason != null) {
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason); throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
} }
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
}
}
// if we are already doing a shutdown we don't need to retry crawling // if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) { if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
@ -410,15 +411,15 @@ public final class HTTPLoader {
// retry crawling with new url // retry crawling with new url
request.redirectURL(redirectionUrl); request.redirectURL(redirectionUrl);
return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
} }
// we don't want to follow redirects // we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} else if (responseBody == null) { } else if (responseBody == null) {
// no response, reject file // no response, reject file
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode); this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} else if (statusCode == 200 || statusCode == 203) { } else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok // the transfer is ok
// we write the new cache entry to file system directly // we write the new cache entry to file system directly
@ -427,8 +428,8 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading // check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) { if (maxFileSize >= 0 && contentLength > maxFileSize) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$"); throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
} }
// create a new cache entry // create a new cache entry
@ -442,9 +443,9 @@ public final class HTTPLoader {
); );
return response; return response;
} else { } else {
// if the response has not the right response type then reject file // if the response has not the right response type then reject file
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} }
} }
@ -485,17 +486,17 @@ public final class HTTPLoader {
final HTTPClient client = new HTTPClient(agent); final HTTPClient client = new HTTPClient(agent);
client.setTimout(20000); client.setTimout(20000);
client.setHeader(requestHeader.entrySet()); client.setHeader(requestHeader.entrySet());
final byte[] responseBody = client.GETbytes(request.url(), null, null, false); final byte[] responseBody = client.GETbytes(request.url(), null, null, false);
final int code = client.getHttpResponse().getStatusLine().getStatusCode(); final int code = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
// FIXME: 30*-handling (bottom) is never reached // FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true // we always get the final content because httpClient.followRedirects = true
if (responseBody != null && (code == 200 || code == 203)) { if (responseBody != null && (code == 200 || code == 203)) {
// the transfer is ok // the transfer is ok
//statistics: //statistics:
ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length); ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);
// we write the new cache entry to file system directly // we write the new cache entry to file system directly
@ -513,7 +514,7 @@ public final class HTTPLoader {
} else if (code > 299 && code < 310) { } else if (code > 299 && code < 310) {
if (header.containsKey(HeaderFramework.LOCATION)) { if (header.containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL // getting redirection URL
String redirectionUrlString = header.get(HeaderFramework.LOCATION); String redirectionUrlString = header.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim(); redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.isEmpty()) { if (redirectionUrlString.isEmpty()) {
@ -535,7 +536,7 @@ public final class HTTPLoader {
} }
} else { } else {
// if the response has not the right response type then reject file // if the response has not the right response type then reject file
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
} }
return response; return response;
} }

@ -399,7 +399,7 @@ public final class LoaderDispatcher {
// load resource from the internet // load resource from the internet
StreamResponse response; StreamResponse response;
if (protocol.equals("http") || protocol.equals("https")) { if (protocol.equals("http") || protocol.equals("https")) {
response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent); response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp")) { } else if (protocol.equals("ftp")) {
response = this.ftpLoader.openInputStream(request, true); response = this.ftpLoader.openInputStream(request, true);
} else if (protocol.equals("smb")) { } else if (protocol.equals("smb")) {

Loading…
Cancel
Save