Support loading local files with a maximum size specified per request.

This is consistent with the HTTP loader implementation.
pull/127/head
luccioman 8 years ago
parent f369679d1c
commit 1e84956721

@ -31,6 +31,8 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.commons.fileupload.util.LimitedInputStream;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
@ -48,6 +50,9 @@ import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
public class FileLoader {
/** Default maximum file size allowed for the crawler */
public static final int DEFAULT_MAXFILESIZE = 100000000;
private final Switchboard sb;
private final ConcurrentLog log;
@ -56,7 +61,7 @@ public class FileLoader {
/**
 * Create a file loader that keeps the given switchboard and log, and reads
 * its maximum file size from the "crawler.file.maxFileSize" configuration setting.
 *
 * @param sb the switchboard; also the source of the configuration value
 * @param log the log instance stored by this loader
 */
public FileLoader(final Switchboard sb, final ConcurrentLog log) {
this.sb = sb;
this.log = log;
// NOTE(review): the two assignments below look like the removed and the added line
// of the same diff hunk (default -1 versus DEFAULT_MAXFILESIZE) — confirm which one applies.
this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
this.maxFileSize = sb.getConfigInt("crawler.file.maxFileSize", DEFAULT_MAXFILESIZE);
}
/**
@ -77,13 +82,14 @@ public class FileLoader {
}
/**
* Open a stream on the requested file
* Open a stream on the requested file. When actual file size is over maxBytes, return a stream on metadata only (URL tokens).
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @param maxBytes max file size to load. -1 means no limit.
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable, final int maxBytes) throws IOException {
DigestURL url = request.url();
if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());
@ -134,12 +140,13 @@ public class FileLoader {
long size;
try {
size = url.length();
responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(size));
} catch (final Exception e) {
size = -1;
}
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > this.maxFileSize && this.maxFileSize >= 0)) {
(size > maxBytes && maxBytes >= 0)) {
// we know that we cannot process that file before loading
// only the metadata is returned
@ -163,9 +170,21 @@ public class FileLoader {
}
// load the resource
final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
if(size < 0 && maxBytes >= 0) {
/* If content length is unknown for some reason, let's apply now the eventual size restriction */
is = new LimitedInputStream(is, maxBytes) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(
"Too big file in File crawler for URL " + request.url().toString());
}
};
}
// create response with loaded content
// create response with stream open on content
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
@ -176,4 +195,15 @@ public class FileLoader {
null);
return new StreamResponse(response, is);
}
/**
 * Open a stream on the requested file, applying the maximum file size held by this loader.
 * This is a convenience overload that delegates to the variant taking an explicit size limit.
 *
 * @param request the request to process
 * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
 * @return a response with full metadata, embedding an open input stream on the content. The caller must close the stream.
 * @throws IOException when the delegated call fails
 */
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
    // Use this loader's own limit as the per-request maximum.
    final int sizeLimit = this.maxFileSize;
    return openInputStream(request, acceptOnlyParseable, sizeLimit);
}
}

@ -400,7 +400,7 @@ public final class LoaderDispatcher {
} else if (protocol.equals("smb")) {
response = this.smbLoader.openInputStream(request, true);
} else if (protocol.equals("file")) {
response = this.fileLoader.openInputStream(request, true);
response = this.fileLoader.openInputStream(request, true, maxFileSize);
} else {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
@ -444,12 +444,18 @@ public final class LoaderDispatcher {
* @return the crawler configured maximum size allowed to load for the protocol of the URL
*/
// Each recognised protocol reads its own "crawler.<protocol>.maxFileSize" setting,
// falling back to the DEFAULT_MAXFILESIZE constant of the matching loader class.
// NOTE(review): unbraced and braced variants of the same `if` appear back to back;
// this looks like removed and added diff lines shown together — confirm before reuse.
public int protocolMaxFileSize(final DigestURL url) {
if (url.isHTTP() || url.isHTTPS())
if (url.isHTTP() || url.isHTTPS()) {
return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (url.isFTP())
}
if (url.isFTP()) {
return this.sb.getConfigInt("crawler.ftp.maxFileSize", (int) FTPLoader.DEFAULT_MAXFILESIZE);
if (url.isSMB())
}
if (url.isSMB()) {
return this.sb.getConfigInt("crawler.smb.maxFileSize", (int) SMBLoader.DEFAULT_MAXFILESIZE);
}
if(url.isFile()) {
return this.sb.getConfigInt("crawler.file.maxFileSize", FileLoader.DEFAULT_MAXFILESIZE);
}
// No branch above matched: return the largest int value.
return Integer.MAX_VALUE;
}

Loading…
Cancel
Save