diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 4f427a1bf..09a6c9c3d 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -28,6 +28,8 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import org.apache.http.StatusLine; + import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.protocol.ClientIdentification; @@ -142,14 +144,16 @@ public final class HTTPLoader { // send request client.GET(url, false); - final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final StatusLine statusline = client.getHttpResponse().getStatusLine(); + final int statusCode = statusline.getStatusCode(); final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); String requestURLString = request.url().toNormalform(true); // check redirection if (statusCode > 299 && statusCode < 310) { - - final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client, statusCode, + client.finish(); + + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString); if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { @@ -182,7 +186,7 @@ public final class HTTPLoader { // we don't want to follow redirects this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); - throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$"); } else if (statusCode == 200 || statusCode == 203) { // the transfer is ok @@ -192,12 +196,14 @@ public final class HTTPLoader { */ long contentLength = client.getHttpResponse().getEntity().getContentLength(); if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) { - byte[] content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); - + byte[] content = null; try { + content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); Cache.store(url, responseHeader, content); } catch (final IOException e) { this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); + } finally { + client.finish(); } return new ByteArrayInputStream(content); @@ -209,10 +215,11 @@ public final class HTTPLoader { */ return new HTTPInputStream(client); } else { + client.finish(); // if the response has not the right response type then reject file this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); - throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline + "' for URL '" + requestURLString + "'$"); } } @@ -223,7 +230,7 @@ public final class HTTPLoader { * @throws IOException when an error occured */ private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, - final HTTPClient client, final int statusCode, final ResponseHeader responseHeader, String requestURLString) + final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString) throws IOException { // read redirection URL String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); @@ -232,8 +239,8 @@ public final class HTTPLoader { if (redirectionUrlString.isEmpty()) { this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, - "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode); - throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode()); + throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$"); } @@ -241,7 +248,7 @@ public final class HTTPLoader { final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); // restart crawling with new url - this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL " + requestURLString); this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); @@ -249,7 +256,7 @@ public final class HTTPLoader { if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode); + FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode()); } return redirectionUrl; } @@ -335,7 +342,7 @@ public final class HTTPLoader { // check redirection if (statusCode > 299 && statusCode < 310) { - final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client, statusCode, + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(), responseHeader, requestURLString); if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {