Respect the maxFileSize limit also when streaming HTTP content, where relevant.

The constraint is applied consistently with the full loading of HTTP
content into a byte array.
pull/127/head
luccioman 8 years ago
parent 4b72b29ea2
commit 433bdb7c0d

@ -28,6 +28,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
@ -45,6 +46,7 @@ import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.util.Formatter;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
@ -209,7 +211,7 @@ public final class HTTPLoader {
* When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we had better cache it if the cache is enabled and the url is not local
*/
long contentLength = client.getHttpResponse().getEntity().getContentLength();
final InputStream contentStream;
InputStream contentStream;
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
byte[] content = null;
try {
@ -223,12 +225,29 @@ public final class HTTPLoader {
contentStream = new ByteArrayInputStream(content);
} else {
/*
* Content length may already be known now: check it before opening a stream
*/
if (maxFileSize >= 0 && contentLength > maxFileSize) {
throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes");
}
/*
* Create a HTTPInputStream delegating to
* client.getContentstream(). Close method will ensure client is
* properly closed.
*/
contentStream = new HTTPInputStream(client);
/* The anticipated content length may not be known yet, or may be incorrect: apply now the same content size restriction as when loading into a byte array */
if(maxFileSize >= 0) {
contentStream = new LimitedInputStream(contentStream, maxFileSize) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(
"Content to download exceed maximum value of " + Formatter.bytesToString(pSizeMax));
}
};
}
}
return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);

@ -469,12 +469,12 @@ public final class LoaderDispatcher {
* @param cacheStrategy cache strategy to use
* @param blacklistType black list
* @param agent agent identification for HTTP requests
* @param maxFileSize max file size to load. -1 means no limit.
* @return a response with full meta data, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when url is malformed or blacklisted
*/
public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final int maxFileSize = protocolMaxFileSize(request.url());
BlacklistType blacklistType, final ClientIdentification.Agent agent, final int maxFileSize) throws IOException {
StreamResponse response;
Semaphore check = this.loaderSteering.get(request.url());
@ -509,6 +509,21 @@ public final class LoaderDispatcher {
return response;
}
/**
 * Open the URL as an InputStream from the web or the cache, applying the
 * default per-protocol configured maximum file size limit.
 * @param request must not be null
 * @param cacheStrategy cache strategy to use
 * @param blacklistType black list
 * @param agent agent identification for HTTP requests
 * @return a response with full meta data, embedding an open input stream on the content. Don't forget to close the stream.
 * @throws IOException when the url is malformed or blacklisted
 */
public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
		BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
	/* Delegate to the full implementation, with the per-protocol default size limit */
	return this.openInputStream(request, cacheStrategy, blacklistType, agent,
			protocolMaxFileSize(request.url()));
}
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {
@ -564,7 +579,8 @@ public final class LoaderDispatcher {
* @return on parsed document or null when an error occurred while parsing
* @throws IOException when the content can not be fetched or no parser support it
*/
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);

@ -113,7 +113,7 @@ public class ImageViewer {
* image url.
* @return an open input stream instance (don't forget to close it).
* @throws IOException
* when a read/write error occured.
* when a read/write error occurred.
*/
public InputStream openInputStream(final serverObjects post, final LoaderDispatcher loader,
final boolean auth, DigestURL url) throws IOException {
@ -123,8 +123,10 @@ public class ImageViewer {
String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
: ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
/* We do not apply here the crawler max file size limit,
* as the purpose of this stream is not to be parsed and indexed but to be directly rendered */
final StreamResponse response = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
BlacklistType.SEARCH, agent);
BlacklistType.SEARCH, agent, -1);
inStream = response.getContentStream();
} catch (final IOException e) {
/** No need to log full stack trace (in most cases resource is not available because of a network error) */

Loading…
Cancel
Save