From 9be36800a4dc0d2c816c47084e34146a6cef0177 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 20 Dec 2020 19:44:16 +0100 Subject: [PATCH] increased redirect depth by one this makes sense if one redirect replaces http with https and another replaces www subdomain by without (and vice versa) --- .../yacy/crawler/retrieval/HTTPLoader.java | 521 +++++++++--------- .../net/yacy/repository/LoaderDispatcher.java | 2 +- 2 files changed, 262 insertions(+), 261 deletions(-) diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 1ba2d793e..0703bc85e 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -85,8 +85,8 @@ public final class HTTPLoader { Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start); return doc; } - - /** + + /** * Open an input stream on a requested HTTP resource. When the resource content size is small * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}, fully load it and use a ByteArrayInputStream instance. * @param request @@ -98,228 +98,231 @@ public final class HTTPLoader { * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream. * @throws IOException when an error occurred */ - public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount, - final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) - throws IOException { - if (retryCount < 0) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); - throw new IOException( - "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$"); - } - DigestURL url = request.url(); - - final String host = url.getHost(); - if (host == null || host.length() < 2) { - throw new IOException("host is not well-formed: '" + host + "'"); - } - final String path = url.getFile(); - int port = url.getPort(); - final boolean ssl = url.getProtocol().equals("https"); - if (port < 0) - port = (ssl) ? 443 : 80; - - // check if url is in blacklist - final String hostlow = host.toLowerCase(Locale.ROOT); - if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, - "url in blacklist", -1); - throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. 
URL is in blacklist.$"); - } - - // resolve yacy and yacyh domains - final AlternativeDomainNames yacyResolver = this.sb.peers; - if (yacyResolver != null) { - final String yAddress = yacyResolver.resolve(host); - if (yAddress != null) { - url = new DigestURL(url.getProtocol() + "://" + yAddress + path); - } - } - - // create a request header - final RequestHeader requestHeader = createRequestheader(request, agent); - - // HTTP-Client - final HTTPClient client = new HTTPClient(agent); - client.setRedirecting(false); // we want to handle redirection - // ourselves, so we don't index pages - // twice - client.setTimout(this.socketTimeout); - client.setHeader(requestHeader.entrySet()); - - // send request - client.GET(url, false); - final StatusLine statusline = client.getHttpResponse().getStatusLine(); - final int statusCode = statusline.getStatusCode(); - final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); - String requestURLString = request.url().toNormalform(true); - - // check redirection - if (statusCode > 299 && statusCode < 310) { - client.finish(); - - final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, - responseHeader, requestURLString); - - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - // we have two use cases here: loading from a crawl or just - // loading the url. Check this: - if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { - // put redirect url on the crawler queue to repeat a - // double-check - /* We have to clone the request instance and not to modify directly its URL, - * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ + public StreamResponse openInputStream( + final Request request, CrawlProfile profile, final int retryCount, + final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent + ) throws IOException { + if (retryCount < 0) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); + throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$"); + } + DigestURL url = request.url(); + + final String host = url.getHost(); + if (host == null || host.length() < 2) { + throw new IOException("host is not well-formed: '" + host + "'"); + } + final String path = url.getFile(); + int port = url.getPort(); + final boolean ssl = url.getProtocol().equals("https"); + if (port < 0) + port = (ssl) ? 443 : 80; + + // check if url is in blacklist + final String hostlow = host.toLowerCase(Locale.ROOT); + if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, + "url in blacklist", -1); + throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. 
URL is in blacklist.$"); + } + + // resolve yacy and yacyh domains + final AlternativeDomainNames yacyResolver = this.sb.peers; + if (yacyResolver != null) { + final String yAddress = yacyResolver.resolve(host); + if (yAddress != null) { + url = new DigestURL(url.getProtocol() + "://" + yAddress + path); + } + } + + // create a request header + final RequestHeader requestHeader = createRequestheader(request, agent); + + // HTTP-Client + final HTTPClient client = new HTTPClient(agent); + client.setRedirecting(false); // we want to handle redirection + // ourselves, so we don't index pages + // twice + client.setTimout(this.socketTimeout); + client.setHeader(requestHeader.entrySet()); + + // send request + client.GET(url, false); + final StatusLine statusline = client.getHttpResponse().getStatusLine(); + final int statusCode = statusline.getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); + String requestURLString = request.url().toNormalform(true); + + // check redirection + if (statusCode > 299 && statusCode < 310) { + client.finish(); + + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString); + + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { + // we have two use cases here: loading from a crawl or just + // loading the url. Check this: + if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { + // put redirect url on the crawler queue to repeat a + // double-check + /* We have to clone the request instance and not to modify directly its URL, + * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ Request redirectedRequest = new Request(request.initiator(), - redirectionUrl, - request.referrerhash(), - request.name(), - request.appdate(), - request.profileHandle(), - request.depth(), - request.timezoneOffset()); - String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); - if(rejectReason != null) { + redirectionUrl, + request.referrerhash(), + request.name(), + request.appdate(), + request.profileHandle(), + request.depth(), + request.timezoneOffset()); + String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); + if(rejectReason != null) { throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. 
Reason : " + rejectReason); - } - // in the end we must throw an exception (even if this is - // not an error, just to abort the current process - throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " - + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); - } - - // if we are already doing a shutdown we don't need to retry - // crawling - if (Thread.currentThread().isInterrupted()) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); - throw new IOException( - "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); - } - - // retry crawling with new url - request.redirectURL(redirectionUrl); - return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); - } - // we don't want to follow redirects - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); - throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline - + "' for URL '" + requestURLString + "'$"); - } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) { - // the transfer is ok - - /* - * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local - */ - long contentLength = client.getHttpResponse().getEntity().getContentLength(); - InputStream contentStream; - if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) { - byte[] content = null; - try { - content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); - Cache.store(url, responseHeader, content); - } catch (final IOException e) { - this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); - } finally { - client.finish(); - } - - contentStream = new ByteArrayInputStream(content); - } else { - /* - * Content length may already be known now : check it before opening a stream - */ - if (maxFileSize >= 0 && contentLength > maxFileSize) { - throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes"); - } - /* - * Create a HTTPInputStream delegating to - * client.getContentstream(). Close method will ensure client is - * properly closed. - */ - contentStream = new HTTPInputStream(client); - /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */ - if(maxFileSize >= 0) { - contentStream = new StrictLimitInputStream(contentStream, maxFileSize, - "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize)); - } - } - - return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream); - } else { - client.finish(); - // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); - throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline - + "' for URL '" + requestURLString + "'$"); - } - } - - /** - * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null. 
- * @return redirect URL - * @throws IOException when an error occured - */ - private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, - final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString) - throws IOException { - // read redirection URL - String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); - redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim(); - - if (redirectionUrlString.isEmpty()) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, - "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode()); - throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline - + "' for URL '" + requestURLString + "'$"); - } - - // normalize URL - final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); - - // restart crawling with new url - this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL " - + requestURLString); - this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); - - this.sb.webStructure.generateCitationReference(url, redirectionUrl); - - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode()); - } - return redirectionUrl; - } - - /** - * Create request header for loading content. - * @param request search request - * @param agent agent identification information - * @return a request header - * @throws IOException when an error occured - */ - private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent) - throws IOException { - final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); - if (request.referrerhash() != null) { + } + // in the end we must throw an exception (even if this is + // not an error, just to abort the current process + throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); + } + + // if we are already doing a shutdown we don't need to retry + // crawling + if (Thread.currentThread().isInterrupted()) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); + throw new IOException( + "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); + } + + // check if the redirected URL is the same as the requested URL + // this shortcuts a time-out using retryCount + if (redirectionUrl.equals(url)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1); + throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". 
Processing aborted.$"); + } + + // retry crawling with new url + request.redirectURL(redirectionUrl); + return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); + } + // we don't want to follow redirects + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); + throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$"); + } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) { + // the transfer is ok + + /* + * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local + */ + long contentLength = client.getHttpResponse().getEntity().getContentLength(); + InputStream contentStream; + if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) { + byte[] content = null; + try { + content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); + Cache.store(url, responseHeader, content); + } catch (final IOException e) { + this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); + } finally { + client.finish(); + } + + contentStream = new ByteArrayInputStream(content); + } else { + /* + * Content length may already be known now : check it before opening a stream + */ + if (maxFileSize >= 0 && contentLength > maxFileSize) { + throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes"); + } + /* + * Create a HTTPInputStream delegating to + * client.getContentstream(). Close method will ensure client is + * properly closed. + */ + contentStream = new HTTPInputStream(client); + /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */ + if(maxFileSize >= 0) { + contentStream = new StrictLimitInputStream(contentStream, maxFileSize, + "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize)); + } + } + + return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream); + } else { + client.finish(); + // if the response has not the right response type then reject file + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline + + "' for URL '" + requestURLString + "'$"); + } + } + + /** + * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null. + * @return redirect URL + * @throws IOException when an error occured + */ + private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, + final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString) + throws IOException { + // read redirection URL + String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); + redirectionUrlString = redirectionUrlString == null ? 
"" : redirectionUrlString.trim(); + + if (redirectionUrlString.isEmpty()) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, + "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode()); + throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline + + "' for URL '" + requestURLString + "'$"); + } + + // normalize URL + final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); + + // restart crawling with new url + this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL " + + requestURLString); + this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); + + this.sb.webStructure.generateCitationReference(url, redirectionUrl); + + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode()); + } + return redirectionUrl; + } + + /** + * Create request header for loading content. + * @param request search request + * @param agent agent identification information + * @return a request header + * @throws IOException when an error occured + */ + private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent) + throws IOException { + final RequestHeader requestHeader = new RequestHeader(); + requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); + if (request.referrerhash() != null) { DigestURL refererURL = this.sb.getURL(request.referrerhash()); if (refererURL != null) { requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); } - } - - requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); - requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, - this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); - requestHeader.put(HeaderFramework.ACCEPT_CHARSET, - this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); - requestHeader.put(HeaderFramework.ACCEPT_ENCODING, - this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); - return requestHeader; - } + } + + requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); + requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, + this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); + requestHeader.put(HeaderFramework.ACCEPT_CHARSET, + this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); + requestHeader.put(HeaderFramework.ACCEPT_ENCODING, + this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); + return requestHeader; + } private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { @@ -347,10 +350,10 @@ public final class HTTPLoader { // resolve yacy and yacyh domains final AlternativeDomainNames yacyResolver = this.sb.peers; if(yacyResolver != null) { - final String yAddress = yacyResolver.resolve(host); - if(yAddress != null) { - url = new DigestURL(url.getProtocol() + "://" + yAddress + path); - } + final String yAddress = yacyResolver.resolve(host); + if(yAddress != null) { + url = new DigestURL(url.getProtocol() + "://" + yAddress + path); + } } // take a file from the net @@ 
-366,41 +369,39 @@ public final class HTTPLoader { client.setHeader(requestHeader.entrySet()); // send request - final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false); + final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false); final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); String requestURLString = request.url().toNormalform(true); // check redirection - if (statusCode > 299 && statusCode < 310) { + if (statusCode > 299 && statusCode < 310) { - final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(), - responseHeader, requestURLString); + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(), + responseHeader, requestURLString); - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - // we have two use cases here: loading from a crawl or just loading the url. Check this: - if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { + // we have two use cases here: loading from a crawl or just loading the url. Check this: + if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { // put redirect url on the crawler queue to repeat a double-check - /* We have to clone the request instance and not to modify directly its URL, - * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ + /* We have to clone the request instance and not to modify directly its URL, + * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ Request redirectedRequest = new Request(request.initiator(), - redirectionUrl, - request.referrerhash(), - request.name(), - request.appdate(), - request.profileHandle(), - request.depth(), - request.timezoneOffset()); - String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); - // in the end we must throw an exception (even if this is not an error, just to abort the current process - if(rejectReason != null) { + redirectionUrl, + request.referrerhash(), + request.name(), + request.appdate(), + request.profileHandle(), + request.depth(), + request.timezoneOffset()); + String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); + // in the end we must throw an exception (even if this is not an error, just to abort the current process + if(rejectReason != null) { throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. 
Reason : " + rejectReason); - } + } throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); - + } - } - // if we are already doing a shutdown we don't need to retry crawling if (Thread.currentThread().isInterrupted()) { this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); @@ -410,15 +411,15 @@ public final class HTTPLoader { // retry crawling with new url request.redirectURL(redirectionUrl); return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); - } + } // we don't want to follow redirects this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); } else if (responseBody == null) { - // no response, reject file + // no response, reject file this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode); throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); - } else if (statusCode == 200 || statusCode == 203) { + } else if (statusCode == 200 || statusCode == 203) { // the transfer is ok // we write the new cache entry to file system directly @@ -427,8 +428,8 @@ public final class HTTPLoader { // check length again in case it was not possible to get the length before loading if (maxFileSize >= 0 && contentLength > maxFileSize) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); - throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$"); + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); + throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. 
(GET)$"); } // create a new cache entry @@ -442,9 +443,9 @@ public final class HTTPLoader { ); return response; - } else { + } else { // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); } } @@ -485,17 +486,17 @@ public final class HTTPLoader { final HTTPClient client = new HTTPClient(agent); client.setTimout(20000); client.setHeader(requestHeader.entrySet()); - final byte[] responseBody = client.GETbytes(request.url(), null, null, false); + final byte[] responseBody = client.GETbytes(request.url(), null, null, false); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); + final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true - if (responseBody != null && (code == 200 || code == 203)) { + if (responseBody != null && (code == 200 || code == 203)) { // the transfer is ok - //statistics: - ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length); + //statistics: + ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length); // we write the new cache entry to file system directly @@ -513,7 +514,7 @@ public final class HTTPLoader { } else if (code > 299 && code < 310) { if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL - String redirectionUrlString = header.get(HeaderFramework.LOCATION); + String redirectionUrlString = header.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); if (redirectionUrlString.isEmpty()) { @@ -535,7 +536,7 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file - throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } return response; } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 03feb047d..cccec50c3 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -399,7 +399,7 @@ public final class LoaderDispatcher { // load resource from the internet StreamResponse response; if (protocol.equals("http") || protocol.equals("https")) { - response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent); + response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent); } else if (protocol.equals("ftp")) { response = this.ftpLoader.openInputStream(request, true); } else if (protocol.equals("smb")) {