Support loading local files with a per request specified maximum size.

Consistently with the HTTP loader implementation.
8 years ago · 1e84956721
parent f369679d1c
commit 1e84956721
2 changed files with 46 additions and 10 deletions
--- a/source/net/yacy/crawler/retrieval/FileLoader.java
+++ b/source/net/yacy/crawler/retrieval/FileLoader.java
@ -31,6 +31,8 @@ import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;

+import org.apache.commons.fileupload.util.LimitedInputStream;
+
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
@ -48,6 +50,9 @@ import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;

 public class FileLoader {
+	
+	/** Default maximum file size allowed for the crawler */
+    public static final int DEFAULT_MAXFILESIZE = 100000000;

    private final Switchboard sb;
    private final ConcurrentLog log;
@ -56,7 +61,7 @@ public class FileLoader {
    public FileLoader(final Switchboard sb, final ConcurrentLog log) {
        this.sb = sb;
        this.log = log;
-        this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
+        this.maxFileSize = sb.getConfigInt("crawler.file.maxFileSize", DEFAULT_MAXFILESIZE);
    }

    /**
@ -77,13 +82,14 @@ public class FileLoader {
    }
    
    /**
-     * Open a stream on the requested file
+     * Open a stream on the requested file. When actual file size is over maxBytes, return a stream on metadata only (URL tokens).
     *
     * @param request the request to process
     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
+     * @param maxBytes max file size to load. -1 means no limit.
     * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
     */
-    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
+    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable, final int maxBytes) throws IOException {
        DigestURL url = request.url();
        if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());

@ -134,12 +140,13 @@ public class FileLoader {
        long size;
        try {
            size = url.length();
+            responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(size));
        } catch (final Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > this.maxFileSize && this.maxFileSize >= 0)) {
+            (size > maxBytes && maxBytes >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned

@ -163,9 +170,21 @@ public class FileLoader {
        }

        // load the resource
-        final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
+        InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
+        
+        if(size < 0 && maxBytes >= 0) {
+			/* If content length is unknown for some reason, let's apply now the eventual size restriction */
+        	is = new LimitedInputStream(is, maxBytes) {
+
+			@Override
+			protected void raiseError(long pSizeMax, long pCount) throws IOException {
+				throw new IOException(
+						"Too big file in File crawler for URL " + request.url().toString());
+				}
+			};
+        }

-        // create response with loaded content
+        // create response with stream open on content
        final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
        Response response = new Response(
                request,
@ -176,4 +195,15 @@ public class FileLoader {
                null);
        return new StreamResponse(response, is);
    }
+    
+    /**
+     * Open a stream on the requested file
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
+     * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
+     */
+    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
+    	return openInputStream(request, acceptOnlyParseable,  this.maxFileSize);
+    }
 }
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@ -400,7 +400,7 @@ public final class LoaderDispatcher {
        } else if (protocol.equals("smb")) {
            response = this.smbLoader.openInputStream(request, true);
        } else if (protocol.equals("file")) {
-            response = this.fileLoader.openInputStream(request, true);
+            response = this.fileLoader.openInputStream(request, true, maxFileSize);
        } else {
            throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
        }
@ -444,12 +444,18 @@ public final class LoaderDispatcher {
 	 * @return the crawler configured maximum size allowed to load for the protocol of the URL 
 	 */
    public int protocolMaxFileSize(final DigestURL url) {
-    	if (url.isHTTP() || url.isHTTPS())
+    	if (url.isHTTP() || url.isHTTPS()) {
    		return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-    	if (url.isFTP())
+    	}
+    	if (url.isFTP()) {
    		return this.sb.getConfigInt("crawler.ftp.maxFileSize", (int) FTPLoader.DEFAULT_MAXFILESIZE);
-    	if (url.isSMB())
+    	}
+    	if (url.isSMB()) {
    		return this.sb.getConfigInt("crawler.smb.maxFileSize", (int) SMBLoader.DEFAULT_MAXFILESIZE);
+    	}
+    	if(url.isFile()) {
+    		return this.sb.getConfigInt("crawler.file.maxFileSize", FileLoader.DEFAULT_MAXFILESIZE);
+    	}
    	return Integer.MAX_VALUE;
    }