From e7e381d11016e2dc829d9f50c32c5282c9d00b44 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 15 May 2012 12:25:46 +0200 Subject: [PATCH] added configuration to switch off redirection following in crawler --- defaults/yacy.init | 1 + .../anomic/crawler/retrieval/HTTPLoader.java | 86 ++++++++++--------- .../net/yacy/search/SwitchboardConstants.java | 1 + 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 8e64c6ca6..9c0ad2a23 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -698,6 +698,7 @@ crawler.http.acceptEncoding=gzip crawler.http.acceptLanguage=en-us,en;q=0.5 crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7 crawler.http.maxFileSize=10485760 +crawler.http.FollowRedirects=true # ftp crawler specific settings; size in bytes crawler.ftp.maxFileSize=10485760 diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 3e5d81a2f..d68ccc743 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -38,6 +38,7 @@ import net.yacy.kelondro.io.ByteCount; import net.yacy.kelondro.logging.Log; import net.yacy.repository.Blacklist; import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segments; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.Latency; @@ -127,13 +128,15 @@ public final class HTTPLoader { client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice client.setTimout(this.socketTimeout); client.setHeader(requestHeader.entrySet()); - // send request - final byte[] responseBody = client.GETbytes(url, maxFileSize); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); - final int code = client.getHttpResponse().getStatusLine().getStatusCode(); - if (code > 299 && code < 310) { - // 
redirection (content may be empty) + // send request + final byte[] responseBody = client.GETbytes(url, maxFileSize); + final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + + if (code > 299 && code < 310) { + // redirection (content may be empty) + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL String redirectionUrlString = header.get(HeaderFramework.LOCATION); @@ -172,40 +175,45 @@ public final class HTTPLoader { this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided", code); throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } - } else if (responseBody == null) { - // no response, reject file - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code); - throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); - } else if (code == 200 || code == 203) { - // the transfer is ok - - // we write the new cache entry to file system directly - final long contentLength = responseBody.length; - ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength); - - // check length again in case it was not possible to get the length before loading - if (maxFileSize > 0 && contentLength > maxFileSize) { - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code); - throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + 
"' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)"); - } - - // create a new cache entry - final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); - response = new Response( - request, - requestHeader, - header, - Integer.toString(code), - profile, - responseBody - ); - - return response; - } else { - // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code); - throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + } else { + // we don't want to follow redirects + this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", code); + throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + } + } else if (responseBody == null) { + // no response, reject file + this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code); + throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + } else if (code == 200 || code == 203) { + // the transfer is ok + + // we write the new cache entry to file system directly + final long contentLength = responseBody.length; + ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength); + + // check length again in case it was not possible to get the length before loading + if (maxFileSize > 0 && contentLength > maxFileSize) { + this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, 
FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code); + throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)"); } + + // create a new cache entry + final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); + response = new Response( + request, + requestHeader, + header, + Integer.toString(code), + profile, + responseBody + ); + + return response; + } else { + // if the response has not the right response type then reject file + this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code); + throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + } } public static Response load(final Request request) throws IOException { diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 01bc22d01..b59d3d395 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -296,6 +296,7 @@ public final class SwitchboardConstants { *

Name of the setting how many active crawler-threads may maximal be running on the same time

*/ public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; + public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; public static final String YACY_MODE_DEBUG = "yacyDebugMode"; /**