diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index ebb064048..65369ce3a 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -248,9 +248,21 @@ public final class CrawlWorker extends AbstractCrawlWorker { // creating an output stream fos = new FileOutputStream(cacheFile); + // getting content length + long contentLength = (res.isGzipped()) ? res.getGzippedLength() : res.responseHeader.contentLength(); + + // check if the file is too large to keep it in memory + if (this.keepInMemory) { + // if the content length is unknown or larger than 5MB we + // do not keep resource in memory + // TODO: make MAX_KEEP_IN_MEMORY_SIZE configurable + if ((contentLength == -1) || (contentLength > 5 * 1024 * 1024)) { + this.keepInMemory = false; + } + } + // check the maximum allowed file size - if (this.maxFileSize > -1) { - long contentLength = (res.isGzipped()) ? res.getGzippedLength() : res.responseHeader.contentLength(); + if (this.maxFileSize > -1) { if (contentLength == -1) { fos = new httpdBoundedSizeOutputStream(fos,this.maxFileSize); } else if (contentLength > this.maxFileSize) {