Respect the maxFileSize limit also when streaming HTTP content, where relevant.

The constraint is applied consistently with the full loading of HTTP
content into a byte array.
pull/127/head
luccioman 8 years ago
parent 4b72b29ea2
commit 433bdb7c0d

@ -28,6 +28,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
@ -45,6 +46,7 @@ import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.util.Formatter;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
@ -209,7 +211,7 @@ public final class HTTPLoader {
* When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we had better cache it if the cache is enabled and the url is not local
*/
long contentLength = client.getHttpResponse().getEntity().getContentLength();
final InputStream contentStream;
InputStream contentStream;
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
byte[] content = null;
try {
@ -223,12 +225,29 @@ public final class HTTPLoader {
contentStream = new ByteArrayInputStream(content);
} else {
/*
* Content length may already be known now: check it before opening a stream
*/
if (maxFileSize >= 0 && contentLength > maxFileSize) {
throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes");
}
/*
* Create a HTTPInputStream delegating to
* client.getContentstream(). Close method will ensure client is
* properly closed.
*/
contentStream = new HTTPInputStream(client);
/* The anticipated content length may not be known yet, or may be incorrect: apply now the same content size restriction as when loading into a byte array */
if(maxFileSize >= 0) {
contentStream = new LimitedInputStream(contentStream, maxFileSize) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(
"Content to download exceed maximum value of " + Formatter.bytesToString(pSizeMax));
}
};
}
}
return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);

@ -469,12 +469,12 @@ public final class LoaderDispatcher {
* @param cacheStrategy cache strategy to use
* @param blacklistType black list
* @param agent agent identification for HTTP requests
* @param maxFileSize max file size to load. -1 means no limit.
* @return a response with full meta data, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when url is malformed or blacklisted
*/
public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final int maxFileSize = protocolMaxFileSize(request.url());
BlacklistType blacklistType, final ClientIdentification.Agent agent, final int maxFileSize) throws IOException {
StreamResponse response;
Semaphore check = this.loaderSteering.get(request.url());
@ -509,6 +509,21 @@ public final class LoaderDispatcher {
return response;
}
/**
 * Open the URL as an InputStream from the web or the cache, applying the
 * default per-protocol configured maximum file size limit.
 * @param request must not be null
 * @param cacheStrategy cache strategy to use
 * @param blacklistType black list
 * @param agent agent identification for HTTP requests
 * @return a response with full meta data, embedding an open input stream on the content. Don't forget to close the stream.
 * @throws IOException when the url is malformed or blacklisted
 */
public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
		BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
	/* Delegate to the full implementation, with the per-protocol default size limit */
	return this.openInputStream(request, cacheStrategy, blacklistType, agent,
			protocolMaxFileSize(request.url()));
}
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {
@ -564,7 +579,8 @@ public final class LoaderDispatcher {
* @return on parsed document or null when an error occurred while parsing
* @throws IOException when the content can not be fetched or no parser support it
*/
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);

@ -113,7 +113,7 @@ public class ImageViewer {
* image url.
* @return an open input stream instance (don't forget to close it).
* @throws IOException
* when a read/write error occured.
* when a read/write error occurred.
*/
public InputStream openInputStream(final serverObjects post, final LoaderDispatcher loader,
final boolean auth, DigestURL url) throws IOException {
@ -123,8 +123,10 @@ public class ImageViewer {
String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
: ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
/* We do not apply here the crawler max file size limit,
* as the purpose of this stream is not to be parsed and indexed but to be directly rendered */
final StreamResponse response = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
BlacklistType.SEARCH, agent);
BlacklistType.SEARCH, agent, -1);
inStream = response.getContentStream();
} catch (final IOException e) {
/** No need to log full stack trace (in most cases resource is not available because of a network error) */

Loading…
Cancel
Save