From 452a17a8d5edcfda47e68f3e326e8e1d40f27d34 Mon Sep 17 00:00:00 2001 From: luccioman Date: Wed, 12 Jul 2017 00:13:24 +0200 Subject: [PATCH] Finer control on bounded input streams with custom stream implementation --- .../yacy/cora/util/StreamLimitException.java | 47 +++++ .../cora/util/StrictLimitInputStream.java | 193 ++++++++++++++++++ .../yacy/crawler/retrieval/FileLoader.java | 13 +- .../yacy/crawler/retrieval/HTTPLoader.java | 12 +- source/net/yacy/document/TextParser.java | 11 +- .../document/parser/GenericXMLParser.java | 19 +- 6 files changed, 255 insertions(+), 40 deletions(-) create mode 100644 source/net/yacy/cora/util/StreamLimitException.java create mode 100644 source/net/yacy/cora/util/StrictLimitInputStream.java diff --git a/source/net/yacy/cora/util/StreamLimitException.java b/source/net/yacy/cora/util/StreamLimitException.java new file mode 100644 index 000000000..f10f9d654 --- /dev/null +++ b/source/net/yacy/cora/util/StreamLimitException.java @@ -0,0 +1,47 @@ +// StreamLimitException.java +// --------------------------- +// Copyright 2017 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.cora.util; + +import java.io.IOException; + +/** + * Used to indicate a limit on a stream has been reached or exceeded + * @author luccioman + * + */ +public class StreamLimitException extends IOException { + + /** Generated serialization ID */ + private static final long serialVersionUID = -804446385126524902L; + + public StreamLimitException() { + super(); + } + + public StreamLimitException(String message) { + super(message); + } + + + +} diff --git a/source/net/yacy/cora/util/StrictLimitInputStream.java b/source/net/yacy/cora/util/StrictLimitInputStream.java new file mode 100644 index 000000000..089146df0 --- /dev/null +++ b/source/net/yacy/cora/util/StrictLimitInputStream.java @@ -0,0 +1,193 @@ +// StrictLimitInputStream.java +// --------------------------- +// Copyright 2017 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.cora.util; + +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; + +import net.yacy.kelondro.util.Formatter; + +/** + * Strictly limit the number of bytes consumed on a wrapped input stream : + * doesn't allow exceeding the limit and throws an exception when it is reached. + * See also some alternatives to consider : + * + * + * @author luccioman + */ +public class StrictLimitInputStream extends FilterInputStream { + + /** + * Strict maximum bytes amount to consume on the wrapped stream. An + * exception is raised once consumed bytes is exactly equal to this value. + */ + private final long maxBytes; + + /** The current position in the wrapped stream */ + private long position = 0; + + /** The marked position */ + private long mark = -1; + + /** + * The error message to use when a StreamLimitException is eventually raised + */ + private final String limitErrorMessage; + + /** + * Wrap the given input stream and limit read bytes to maxBytes. + * + * @param inStream + * the input stream to wrap. Must not be null. + * @param maxBytes + * the maximum number of bytes to consume on the inStream. Must + * be greater than or equal to zero. + * @throws IllegalArgumentException + * when inStream is null, or maxBytes is lower than zero + */ + public StrictLimitInputStream(final InputStream inStream, final long maxBytes) { + this(inStream, maxBytes, Formatter.bytesToString(maxBytes) + " limit has been reached"); + } + + /** + * Wrap the given input stream and limit read bytes to maxBytes. + * + * @param inStream + * the input stream to wrap. Must not be null. + * @param maxBytes + * the maximum number of bytes to consume on the inStream. Must + * be greater than or equal to zero. 
+ * @param limitErrorMessage + * the custom error message to use when a StreamLimitException is + * eventually raised. May be null. + * @throws IllegalArgumentException + * when inStream is null, or maxBytes is lower than zero + */ + public StrictLimitInputStream(final InputStream inStream, final long maxBytes, final String limitErrorMessage) { + super(inStream); + if (inStream == null) { + throw new IllegalArgumentException("inStream parameter must not be null"); + } + if (maxBytes < 0) { + throw new IllegalArgumentException("maxBytes parameter must be greater or equals to zero"); + } + this.maxBytes = maxBytes; + this.limitErrorMessage = limitErrorMessage; + } + + /** + * {@inheritDoc} + * + * @throws StreamLimitException + * when the maxBytes limit has been reached + * @throws IOException + * when an I/O error occurs + */ + @Override + public int read() throws IOException { + if (this.position >= this.maxBytes) { + throw new StreamLimitException(this.limitErrorMessage); + } + final int result = this.in.read(); + this.position++; + return result; + } + + /** + * {@inheritDoc} + * + * @throws StreamLimitException + * when the maxBytes limit has been reached + */ + @Override + public int read(final byte[] b) throws IOException { + return this.read(b, 0, b.length); + } + + /** + * {@inheritDoc} + * + * @throws StreamLimitException + * when the maxBytes limit has been reached + */ + @Override + public int read(final byte[] b, final int off, final int len) throws IOException, StreamLimitException { + if (this.position >= this.maxBytes) { + throw new StreamLimitException(this.limitErrorMessage); + } + final long maxToRead = Math.min(len, this.maxBytes - this.position); + final int nbRead = this.in.read(b, off, (int) maxToRead); + + if (nbRead > 0) { + this.position += nbRead; + } + return nbRead; + } + + /** + * {@inheritDoc} + * + * @throws StreamLimitException + * when the maxBytes limit has been reached + */ + @Override + public long skip(final long n) throws 
IOException { + if (this.position >= this.maxBytes) { + throw new StreamLimitException(this.limitErrorMessage); + } + final long toSkip = Math.min(n, this.maxBytes - this.position); + final long nbSkipped = this.in.skip(toSkip); + this.position += nbSkipped; + return nbSkipped; + } + + /* We do not override available() even when position has reached maxBytes : limit + reached must be signaled to the caller through a StreamLimitException + when reading */ + + @Override + public synchronized void reset() throws IOException { + this.in.reset(); + /* + * Rely on the wrapped input stream to check and throw an exception if + * the mark is invalid + */ + this.position = this.mark; + } + + @Override + public synchronized void mark(final int readlimit) { + this.in.mark(readlimit); + this.mark = this.position; + } +} diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java index e18599ab8..7d24528b5 100644 --- a/source/net/yacy/crawler/retrieval/FileLoader.java +++ b/source/net/yacy/crawler/retrieval/FileLoader.java @@ -31,8 +31,6 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; -import org.apache.commons.fileupload.util.LimitedInputStream; - import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; @@ -44,6 +42,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.cora.util.StrictLimitInputStream; import net.yacy.crawler.data.CrawlProfile; import net.yacy.document.TextParser; import net.yacy.kelondro.util.FileUtils; @@ -174,14 +173,8 @@ public class FileLoader { if(size < 0 && maxBytes >= 0) { /* If content length is unknown for some reason, let's apply now the eventual size restriction */ - is = new LimitedInputStream(is, maxBytes) { - - @Override - protected 
void raiseError(long pSizeMax, long pCount) throws IOException { - throw new IOException( - "Too big file in File crawler for URL " + request.url().toString()); - } - }; + is = new StrictLimitInputStream(is, maxBytes, + "Too big file in File crawler for URL " + request.url().toString()); } // create response with stream open on content diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 1c88ff313..ad9c9034c 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -28,7 +28,6 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import org.apache.commons.fileupload.util.LimitedInputStream; import org.apache.http.HttpStatus; import org.apache.http.StatusLine; @@ -41,6 +40,7 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.HTTPInputStream; +import net.yacy.cora.util.StrictLimitInputStream; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.CrawlProfile; @@ -239,14 +239,8 @@ public final class HTTPLoader { contentStream = new HTTPInputStream(client); /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */ if(maxFileSize >= 0) { - contentStream = new LimitedInputStream(contentStream, maxFileSize) { - - @Override - protected void raiseError(long pSizeMax, long pCount) throws IOException { - throw new IOException( - "Content to download exceed maximum value of " + Formatter.bytesToString(pSizeMax)); - } - }; + contentStream = new StrictLimitInputStream(contentStream, maxFileSize, + "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize)); } } diff --git a/source/net/yacy/document/TextParser.java 
b/source/net/yacy/document/TextParser.java index f3dcd89c5..bb6c6fb2b 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -34,13 +34,13 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import org.apache.commons.fileupload.util.LimitedInputStream; import org.apache.commons.io.input.CloseShieldInputStream; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.CommonPattern; +import net.yacy.cora.util.StrictLimitInputStream; import net.yacy.document.parser.GenericXMLParser; import net.yacy.document.parser.apkParser; import net.yacy.document.parser.audioTagParser; @@ -396,14 +396,7 @@ public final class TextParser { docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes); } else { /* Parser do not support partial parsing within limits : let's control it here*/ - InputStream limitedSource = new LimitedInputStream(sourceStream, maxBytes) { - - @Override - protected void raiseError(long pSizeMax, long pCount) throws IOException { - throw new IOException("Reached maximum bytes to parse : " + maxBytes); - - } - }; + InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes); docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource); } return docs; diff --git a/source/net/yacy/document/parser/GenericXMLParser.java b/source/net/yacy/document/parser/GenericXMLParser.java index 0d37c8c1f..0673260e6 100644 --- a/source/net/yacy/document/parser/GenericXMLParser.java +++ b/source/net/yacy/document/parser/GenericXMLParser.java @@ -22,7 +22,6 @@ package net.yacy.document.parser; -import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Date; @@ -35,7 +34,6 @@ import 
javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import org.apache.commons.fileupload.util.LimitedInputStream; import org.apache.commons.io.input.XmlStreamReader; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -43,6 +41,8 @@ import org.xml.sax.SAXException; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.util.StreamLimitException; +import net.yacy.cora.util.StrictLimitInputStream; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -172,14 +172,7 @@ public class GenericXMLParser extends AbstractParser implements Parser { final Set detectedURLs = new HashSet<>(); final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs, maxLinks); - InputStream limitedSource = new LimitedInputStream(source, maxBytes) { - - @Override - protected void raiseError(long pSizeMax, long pCount) throws IOException { - throw new IOException(new SizeLimitExceededException("Reached maximum bytes to parse : " + maxBytes)); - - } - }; + StrictLimitInputStream limitedSource = new StrictLimitInputStream(source, maxBytes); /* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration * (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters), */ @@ -191,12 +184,14 @@ public class GenericXMLParser extends AbstractParser implements Parser { boolean limitExceeded = false; try { saxParser.parse(saxSource, saxHandler); - } catch(SAXException | IOException e) { + } catch(SAXException e) { if(!(e.getCause() instanceof SizeLimitExceededException)) { - /* Only transmit to upper layer exceptions that are not caused by the maxLinks or maxBytes limits being 
reached */ + /* Only transmit to upper layer exceptions that are not caused by the maxLinks limit being reached */ throw e; } limitExceeded = true; + } catch(StreamLimitException e) { + limitExceeded = true; }