From dca9e16f51788689a8e17595464f8b69733e1cd9 Mon Sep 17 00:00:00 2001 From: f1ori Date: Sun, 21 Nov 2010 22:46:12 +0000 Subject: [PATCH] * don't index pages, which redirect, twice * therefore auto-redirection of HTTPClient for crawling is disabled and the old code is reactivated git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7332 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../anomic/crawler/retrieval/HTTPLoader.java | 71 ++++++++++--------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index c0bb1ef8f..97aa38ab1 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -109,45 +109,16 @@ public final class HTTPLoader { // HTTP-Client final HTTPClient client = new HTTPClient(); + client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice client.setTimout(socketTimeout); client.setHeader(requestHeader.entrySet()); // send request final byte[] responseBody = client.GETbytes(request.url().toString(), maxFileSize); final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); - // FIXME: 30*-handling (bottom) is never reached - // we always get the final content because httpClient.followRedirects = true - - if (responseBody == null) { - // no response, reject file - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)"); - throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); - } else if (code == 200 || code == 203) { - // the transfer is ok - - // we write the new cache entry to file system directly - long contentLength = responseBody.length; - 
ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength); - - // check length again in case it was not possible to get the length before loading - if (maxFileSize > 0 && contentLength > maxFileSize) { - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded"); - throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)"); - } - // create a new cache entry - final Map mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); - response = new Response( - request, - requestHeader, - header, - Integer.toString(code), - mp == null ? null : new CrawlProfile(mp), - responseBody - ); - - return response; - } else if (code > 299 && code < 310) { + if (code > 299 && code < 310) { + // redirection (content may be empty) if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL String redirectionUrlString = header.get(HeaderFramework.LOCATION); @@ -181,13 +152,45 @@ public final class HTTPLoader { // retry crawling with new url request.redirectURL(redirectionUrl); return load(request, retryCount - 1, maxFileSize); + } else { + // no redirection url provided + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided"); + throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } - } else { + } else if (responseBody == null) { + // no response, reject file + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)"); + throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + } else if (code == 200 || code == 203) { + // the transfer is ok + + // we write the 
new cache entry to file system directly + long contentLength = responseBody.length; + ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength); + + // check length again in case it was not possible to get the length before loading + if (maxFileSize > 0 && contentLength > maxFileSize) { + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded"); + throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)"); + } + + // create a new cache entry + final Map mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); + response = new Response( + request, + requestHeader, + header, + Integer.toString(code), + mp == null ? null : new CrawlProfile(mp), + responseBody + ); + + return response; + } else { // if the response has not the right response type then reject file sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } - return response; } public static Response load(final Request request) throws IOException {