From 9cfe89c8fc83182582e78a1b83eac9ea7613f8ae Mon Sep 17 00:00:00 2001 From: lotus Date: Mon, 13 Jul 2009 19:55:13 +0000 Subject: [PATCH] * process content-length as soon as it is received * corrected indentation git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6206 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/HTTPLoader.java | 60 +++++++++-------- source/de/anomic/http/httpClient.java | 15 ++++- .../de/anomic/http/httpClientGetMethod.java | 65 +++++++++++++++++++ 3 files changed, 111 insertions(+), 29 deletions(-) create mode 100644 source/de/anomic/http/httpClientGetMethod.java diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 4d6484c06..fdf506e79 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -152,7 +152,9 @@ public final class HTTPLoader { httpResponse res = null; try { // send request - res = client.GET(entry.url().toString()); + res = client.GET(entry.url().toString(), maxFileSize); + // FIXME: 30*-handling (bottom) is never reached + // we always get the final content because httpClient.followRedirects = true if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { // the transfer is ok @@ -162,32 +164,36 @@ public final class HTTPLoader { // request has been placed and result has been returned. work off response //try { - if (!Parser.supportsMime(res.getResponseHeader().mime())) { - // if the response has not the right file type then reject file - sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type"); - throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); - } else { - // get the content length and check if the length is allowed - long contentLength = res.getResponseHeader().getContentLength(); - if (maxFileSize >= 0 && contentLength > maxFileSize) { - sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded"); - throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes."); - } - - // we write the new cache entry to file system directly - res.setAccountingName("CRAWLER"); - final byte[] responseBody = res.getData(); - contentLength = responseBody.length; - - // check length again in case it was not possible to get the length before loading - if (maxFileSize >= 0 && contentLength > maxFileSize) { - sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded"); - throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes."); - } - - htCache.setCacheArray(responseBody); - } - return htCache; + + // if the response has not the right file type then reject file + if (!Parser.supportsMime(res.getResponseHeader().mime())) { + sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type"); + throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); + } + + /* + // check if the content length is allowed + long contentLength = res.getResponseHeader().getContentLength(); + if (maxFileSize >= 0 && contentLength > maxFileSize) { + sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded"); + throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (HEAD)"); + } + */ + + // we write the new cache entry to file system directly + res.setAccountingName("CRAWLER"); + final byte[] responseBody = res.getData(); + long contentLength = responseBody.length; + + // check length again in case it was not possible to get the length before loading + if (maxFileSize >= 0 && contentLength > maxFileSize) { + sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded"); + throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)"); + } + + htCache.setCacheArray(responseBody); + + return htCache; /* } catch (final SocketException e) { // this may happen if the client suddenly closes its connection diff --git a/source/de/anomic/http/httpClient.java b/source/de/anomic/http/httpClient.java index 85a2ac880..84450a807 100644 --- a/source/de/anomic/http/httpClient.java +++ b/source/de/anomic/http/httpClient.java @@ -48,7 +48,6 @@ import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.ByteArrayRequestEntity; -import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.HeadMethod; import org.apache.commons.httpclient.methods.InputStreamRequestEntity; import org.apache.commons.httpclient.methods.PostMethod; @@ -257,7 +256,19 @@ public class httpClient { * @throws IOException */ public httpResponse GET(final String uri) throws IOException { - final HttpMethod get = new GetMethod(uri); + return GET(uri, Long.MAX_VALUE); + } + + /** + * This method GETs a page from the server. + * + * @param uri The URI to the page which should be GET. + * @param maxfilesize the maximum allowed filesize (else IOException) + * @return InputStream of content (body) + * @throws IOException + */ + public httpResponse GET(final String uri, long maxfilesize) throws IOException { + final HttpMethod get = new httpClientGetMethod(uri, maxfilesize); get.setFollowRedirects(followRedirects); get.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES); return execute(get); diff --git a/source/de/anomic/http/httpClientGetMethod.java b/source/de/anomic/http/httpClientGetMethod.java new file mode 100644 index 000000000..56e55c316 --- /dev/null +++ b/source/de/anomic/http/httpClientGetMethod.java @@ -0,0 +1,65 @@ +// httpClientGetMethod.java +// (C) 2009 by David Wieditz; lotus@users.berlios.de +// first published 13.7.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $ +// $LastChangedRevision: 4558 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.http; + +import java.io.IOException; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpState; +import org.apache.commons.httpclient.methods.GetMethod; + +/** + * this class implements the ability for a maxfilesize + * @author lotus + * + */ +public class httpClientGetMethod extends GetMethod { + + private long maxfilesize = Long.MAX_VALUE; + + public httpClientGetMethod(String uri, long maxfilesize) { + super(uri); + this.maxfilesize = maxfilesize; + } + + @Override + protected void readResponseHeaders(HttpState state, HttpConnection conn) throws IOException, HttpException { + super.readResponseHeaders(state, conn); + + // already processing the header to be able to throw an exception + Header contentlengthHeader = getResponseHeader("content-length"); + long contentlength = 0; + if (contentlengthHeader != null) { + try { contentlength = Long.parseLong(contentlengthHeader.getValue()); } catch (NumberFormatException e) { } + } + if (contentlength > maxfilesize) { + throw new IOException("Content-Length " + contentlength + " larger than maxfilesize " + maxfilesize); + } + } +}