From 63eadfdf844115548df983c311bee3afc5a7d7ea Mon Sep 17 00:00:00 2001 From: danielr Date: Tue, 24 Jun 2008 19:11:27 +0000 Subject: [PATCH] fixed unlimited FileSizeLimit git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4954 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/SettingsAck_p.java | 3 +++ source/de/anomic/crawler/HTTPLoader.java | 18 +++++++++++++----- .../http/httpdBoundedSizeOutputStream.java | 2 +- source/de/anomic/tools/diskUsage.java | 4 +--- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 73e752845..498c8d47d 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -587,6 +587,9 @@ public class SettingsAck_p { long maxHttpSize; try { maxHttpSize = Integer.valueOf(maxSizeStr).intValue(); + if(maxHttpSize < 0) { + maxHttpSize = -1; + } env.setConfig("crawler.http.maxFileSize", Long.toString(maxHttpSize)); } catch (NumberFormatException e) { prop.put("info", "30"); diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 9bd3e354a..3762c0036 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -59,6 +59,7 @@ import de.anomic.http.JakartaCommonsHttpClient; import de.anomic.http.JakartaCommonsHttpResponse; import de.anomic.http.httpHeader; import de.anomic.http.httpdBoundedSizeOutputStream; +import de.anomic.http.httpdByteCountOutputStream; import de.anomic.http.httpdLimitExceededException; import de.anomic.index.indexReferenceBlacklist; import de.anomic.plasma.plasmaHTCache; @@ -154,6 +155,7 @@ public final class HTTPLoader { // take a file from the net plasmaHTCache.Entry htCache = null; + final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE); try { // create a request header httpHeader requestHeader = new httpHeader(); @@ -217,10 +219,16 @@ public final class HTTPLoader { long contentLength = res.getResponseHeader().contentLength(); // check the maximum allowed file size - if (contentLength == -1) { - fos = new httpdBoundedSizeOutputStream(fos, sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE)); - } else if (contentLength > sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE)) { - this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE) + " bytes."); + if (contentLength == -1 || maxFileSize == -1) { + if(maxFileSize == -1) { + // unlimited + fos = new httpdByteCountOutputStream(fos); + } else { + // check filesize while loading page + fos = new httpdBoundedSizeOutputStream(fos, maxFileSize); + } + } else if (contentLength > maxFileSize) { + this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes."); sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED); return null; } @@ -322,7 +330,7 @@ public final class HTTPLoader { this.log.logInfo("CRAWLER Interruption detected because of server shutdown."); failreason = ErrorURL.DENIED_SERVER_SHUTDOWN; } else if (e instanceof httpdLimitExceededException) { - this.log.logWarning("CRAWLER Max file size limit '" + sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE) + "' exceeded while downloading URL " + entry.url()); + this.log.logWarning("CRAWLER Max file size limit '" + maxFileSize + "' exceeded while downloading URL " + entry.url()); failreason = ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED; } else if (e instanceof MalformedURLException) { this.log.logWarning("CRAWLER Malformed URL '" + entry.url().toString() + "' detected. "); diff --git a/source/de/anomic/http/httpdBoundedSizeOutputStream.java b/source/de/anomic/http/httpdBoundedSizeOutputStream.java index 872c310d6..602e85b57 100644 --- a/source/de/anomic/http/httpdBoundedSizeOutputStream.java +++ b/source/de/anomic/http/httpdBoundedSizeOutputStream.java @@ -48,7 +48,7 @@ import java.io.OutputStream; public class httpdBoundedSizeOutputStream extends httpdByteCountOutputStream { - protected long maxSize = 0; + protected final long maxSize; public httpdBoundedSizeOutputStream(OutputStream outputStream, long sizeLimit) { this(outputStream,0,sizeLimit); diff --git a/source/de/anomic/tools/diskUsage.java b/source/de/anomic/tools/diskUsage.java index 544cdd879..2b7ba7bf5 100644 --- a/source/de/anomic/tools/diskUsage.java +++ b/source/de/anomic/tools/diskUsage.java @@ -499,13 +499,11 @@ nextLine: while((line = buffer.readLine()) != null) { output.add(line); } - log.logInfo("logpoint 4 output done of '"+ name +"'"); done = true; - } catch(final IOException ix) { log.logWarning("logpoint 5 " + ix.getMessage());} + } catch(final IOException ix) { log.logWarning("logpoint 4 " + ix.getMessage());} } public List getOutput(){ - log.logInfo("logpoint 6 getOutput() of '"+ name +"' requested"); while(!isDone()) { try { Thread.sleep(1);