* process content-length as soon as it is received

* corrected indentation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6206 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
lotus 16 years ago
parent 5240d22773
commit 9cfe89c8fc

@ -152,7 +152,9 @@ public final class HTTPLoader {
httpResponse res = null;
try {
// send request
res = client.GET(entry.url().toString());
res = client.GET(entry.url().toString(), maxFileSize);
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// the transfer is ok
@ -162,32 +164,36 @@ public final class HTTPLoader {
// request has been placed and result has been returned. work off response
//try {
if (!Parser.supportsMime(res.getResponseHeader().mime())) {
// if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
} else {
// get the content length and check if the length is allowed
long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
}
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
contentLength = responseBody.length;
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
}
htCache.setCacheArray(responseBody);
}
return htCache;
// if the response has not the right file type then reject file
if (!Parser.supportsMime(res.getResponseHeader().mime())) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
}
/*
// check if the content length is allowed
long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (HEAD)");
}
*/
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
long contentLength = responseBody.length;
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
htCache.setCacheArray(responseBody);
return htCache;
/*
} catch (final SocketException e) {
// this may happen if the client suddenly closes its connection

@ -48,7 +48,6 @@ import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.ByteArrayRequestEntity;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.HeadMethod;
import org.apache.commons.httpclient.methods.InputStreamRequestEntity;
import org.apache.commons.httpclient.methods.PostMethod;
@ -257,7 +256,19 @@ public class httpClient {
* @throws IOException
*/
public httpResponse GET(final String uri) throws IOException {
    // no size limit requested: delegate to the size-limited variant with the
    // largest possible limit instead of building a separate request here
    return GET(uri, Long.MAX_VALUE);
}
/**
* This method GETs a page from the server.
*
* @param uri The URI of the page to GET.
* @param maxfilesize the maximum allowed file size in bytes; an IOException is thrown if the announced Content-Length exceeds it
* @return the response to the GET request
* @throws IOException
*/
public httpResponse GET(final String uri, long maxfilesize) throws IOException {
final HttpMethod get = new httpClientGetMethod(uri, maxfilesize);
get.setFollowRedirects(followRedirects);
get.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
return execute(get);

@ -0,0 +1,65 @@
// httpClientGetMethod.java
// (C) 2009 by David Wieditz; lotus@users.berlios.de
// first published 13.7.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $
// $LastChangedRevision: 4558 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.http;
import java.io.IOException;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.methods.GetMethod;
/**
 * A {@link GetMethod} with a maximum file size: the Content-Length response
 * header is evaluated directly after the response headers have been read, so
 * that an oversized response can be rejected with an exception at that point.
 * <p>
 * A negative maxfilesize disables the check ("no limit"), which matches the
 * {@code maxFileSize >= 0 && contentLength > maxFileSize} checks of the caller.
 *
 * @author lotus
 */
public class httpClientGetMethod extends GetMethod {

    /** maximum allowed Content-Length in bytes; a negative value means "no limit" */
    private final long maxfilesize;

    /**
     * @param uri the URI to GET
     * @param maxfilesize the maximum allowed Content-Length in bytes,
     *        negative (or {@code Long.MAX_VALUE}) for no limit
     */
    public httpClientGetMethod(String uri, long maxfilesize) {
        super(uri);
        this.maxfilesize = maxfilesize;
    }

    /**
     * Reads the response headers and then checks the announced Content-Length
     * against the configured limit.
     *
     * @throws IOException if the Content-Length header announces more bytes than maxfilesize allows
     */
    @Override
    protected void readResponseHeaders(HttpState state, HttpConnection conn) throws IOException, HttpException {
        super.readResponseHeaders(state, conn);

        // negative limit = unlimited; without this guard every response would be
        // rejected, because an unknown length counts as 0 and 0 > -1
        if (maxfilesize < 0) {
            return;
        }

        // already processing the header to be able to throw an exception
        final Header contentlengthHeader = getResponseHeader("content-length");
        if (contentlengthHeader == null) {
            // length not announced; nothing to check at this point
            return;
        }

        final long contentlength;
        try {
            contentlength = Long.parseLong(contentlengthHeader.getValue());
        } catch (NumberFormatException ignored) {
            // deliberately best-effort: an unparsable Content-Length is treated
            // like a missing one and does not reject the response here
            return;
        }

        if (contentlength > maxfilesize) {
            throw new IOException("Content-Length " + contentlength + " larger than maxfilesize " + maxfilesize);
        }
    }
}
Loading…
Cancel
Save