From 1e84956721f53d7a794af6c9a6cad94d398f7563 Mon Sep 17 00:00:00 2001 From: luccioman Date: Tue, 11 Jul 2017 09:04:23 +0200 Subject: [PATCH] Support loading local files with a per request specified maximum size. Consistently with the HTTP loader implementation. --- .../yacy/crawler/retrieval/FileLoader.java | 42 ++++++++++++++++--- .../net/yacy/repository/LoaderDispatcher.java | 14 +++++-- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java index 5c07a5cd5..e18599ab8 100644 --- a/source/net/yacy/crawler/retrieval/FileLoader.java +++ b/source/net/yacy/crawler/retrieval/FileLoader.java @@ -31,6 +31,8 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; +import org.apache.commons.fileupload.util.LimitedInputStream; + import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; @@ -48,6 +50,9 @@ import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; public class FileLoader { + + /** Default maximum file size allowed for the crawler */ + public static final int DEFAULT_MAXFILESIZE = 100000000; private final Switchboard sb; private final ConcurrentLog log; @@ -56,7 +61,7 @@ public class FileLoader { public FileLoader(final Switchboard sb, final ConcurrentLog log) { this.sb = sb; this.log = log; - this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l); + this.maxFileSize = sb.getConfigInt("crawler.file.maxFileSize", DEFAULT_MAXFILESIZE); } /** @@ -77,13 +82,14 @@ public class FileLoader { } /** - * Open a stream on the requested file + * Open a stream on the requested file. When actual file size is over maxBytes, return a stream on metadata only (URL tokens). * * @param request the request to process * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens + * @param maxBytes max file size to load. -1 means no limit. * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream. */ - public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException { + public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable, final int maxBytes) throws IOException { DigestURL url = request.url(); if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol()); @@ -134,12 +140,13 @@ public class FileLoader { long size; try { size = url.length(); + responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(size)); } catch (final Exception e) { size = -1; } String parserError = null; if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) || - (size > this.maxFileSize && this.maxFileSize >= 0)) { + (size > maxBytes && maxBytes >= 0)) { // we know that we cannot process that file before loading // only the metadata is returned @@ -163,9 +170,21 @@ public class FileLoader { } // load the resource - final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); + InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); + + if(size < 0 && maxBytes >= 0) { + /* If content length is unknown for some reason, let's apply now the eventual size restriction */ + is = new LimitedInputStream(is, maxBytes) { + + @Override + protected void raiseError(long pSizeMax, long pCount) throws IOException { + throw new IOException( + "Too big file in File crawler for URL " + request.url().toString()); + } + }; + } - // create response with loaded content + // create response with stream open on content final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle())); Response response = new Response( request, @@ -176,4 +195,15 @@ public class FileLoader { null); return new StreamResponse(response, is); } + + /** + * Open a stream on the requested file + * + * @param request the request to process + * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens + * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream. + */ + public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException { + return openInputStream(request, acceptOnlyParseable, this.maxFileSize); + } } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index bc3a7d997..d55252544 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -400,7 +400,7 @@ public final class LoaderDispatcher { } else if (protocol.equals("smb")) { response = this.smbLoader.openInputStream(request, true); } else if (protocol.equals("file")) { - response = this.fileLoader.openInputStream(request, true); + response = this.fileLoader.openInputStream(request, true, maxFileSize); } else { throw new IOException("Unsupported protocol '" + protocol + "' in url " + url); } @@ -444,12 +444,18 @@ public final class LoaderDispatcher { * @return the crawler configured maximum size allowed to load for the protocol of the URL */ public int protocolMaxFileSize(final DigestURL url) { - if (url.isHTTP() || url.isHTTPS()) + if (url.isHTTP() || url.isHTTPS()) { return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); - if (url.isFTP()) + } + if (url.isFTP()) { return this.sb.getConfigInt("crawler.ftp.maxFileSize", (int) FTPLoader.DEFAULT_MAXFILESIZE); - if (url.isSMB()) + } + if (url.isSMB()) { return this.sb.getConfigInt("crawler.smb.maxFileSize", (int) SMBLoader.DEFAULT_MAXFILESIZE); + } + if(url.isFile()) { + return this.sb.getConfigInt("crawler.file.maxFileSize", FileLoader.DEFAULT_MAXFILESIZE); + } return Integer.MAX_VALUE; }