Finer control on bounded input streams with custom stream implementation

pull/127/head
luccioman 8 years ago
parent f8f1959ebb
commit 452a17a8d5

@ -0,0 +1,47 @@
// StreamLimitException.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.cora.util;
import java.io.IOException;
/**
 * Signals that a limit set on a stream has been reached or exceeded.
 * As an {@link IOException} subtype, it can be handled by generic I/O error
 * handlers, or caught specifically to tell a limit overrun apart from other
 * I/O failures.
 *
 * @author luccioman
 */
public class StreamLimitException extends IOException {

    /** Generated serialization ID */
    private static final long serialVersionUID = -804446385126524902L;

    /**
     * Create an exception with the given detail message.
     *
     * @param message
     *            the detail message describing the limit that was hit. May be
     *            null.
     */
    public StreamLimitException(String message) {
        super(message);
    }

    /**
     * Create an exception without any detail message.
     */
    public StreamLimitException() {
        /* The parent no-argument constructor is implicitly called */
    }
}

@ -0,0 +1,193 @@
// StrictLimitInputStream.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.cora.util;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.kelondro.util.Formatter;
/**
 * Strictly limit the number of bytes consumed on a wrapped input stream :
 * doesn't allow exceeding the limit and throws an exception when it is reached.
 * See also some alternatives to consider :
 * <ul>
 * <li>org.apache.commons.fileupload.util.LimitedInputStream : check the limit
 * only after reading, thus eventually allowing more bytes than the limit to be
 * read. Doesn't properly implement mark() and reset() (when resetting, total
 * count of consumed bytes is not reset)</li>
 * <li>com.google.common.io.ByteStreams.LimitedInputStream : doesn't throw an
 * exception on read() when the limit has been reached</li>
 * <li>org.apache.commons.io.input.BoundedInputStream : doesn't throw an
 * exception on read() when the limit has been reached</li>
 * </ul>
 * Note that only {@link #mark(int)} and {@link #reset()} are synchronized :
 * the read and skip methods are not.
 *
 * @author luccioman
 */
public class StrictLimitInputStream extends FilterInputStream {

    /**
     * Strict maximum bytes amount to consume on the wrapped stream. An
     * exception is raised on the next read or skip attempt once the consumed
     * bytes count is equal to this value.
     */
    private final long maxBytes;

    /** The number of bytes consumed so far on the wrapped stream through this instance */
    private long position = 0;

    /** The marked position, or -1 when mark() has not been called yet */
    private long mark = -1;

    /**
     * The error message to use when a StreamLimitException is eventually raised
     */
    private final String limitErrorMessage;

    /**
     * Wrap the given input stream and limit read bytes to maxBytes. The
     * message of the eventually raised StreamLimitException is built from the
     * formatted maxBytes value.
     *
     * @param inStream
     *            the input stream to wrap. Must not be null.
     * @param maxBytes
     *            the maximum number of bytes to consume on the inStream. Must
     *            be greater than or equal to zero.
     * @throws IllegalArgumentException
     *             when inStream is null, or maxBytes is lower than zero
     */
    public StrictLimitInputStream(final InputStream inStream, final long maxBytes) {
        this(inStream, maxBytes, Formatter.bytesToString(maxBytes) + " limit has been reached");
    }

    /**
     * Wrap the given input stream and limit read bytes to maxBytes.
     *
     * @param inStream
     *            the input stream to wrap. Must not be null.
     * @param maxBytes
     *            the maximum number of bytes to consume on the inStream. Must
     *            be greater than or equal to zero.
     * @param limitErrorMessage
     *            the custom error message to use when a StreamLimitException is
     *            eventually raised. May be null.
     * @throws IllegalArgumentException
     *             when inStream is null, or maxBytes is lower than zero
     */
    public StrictLimitInputStream(final InputStream inStream, final long maxBytes, final String limitErrorMessage) {
        super(inStream);
        if (inStream == null) {
            throw new IllegalArgumentException("inStream parameter must not be null");
        }
        if (maxBytes < 0) {
            throw new IllegalArgumentException("maxBytes parameter must be greater or equals to zero");
        }
        this.maxBytes = maxBytes;
        this.limitErrorMessage = limitErrorMessage;
    }

    /**
     * {@inheritDoc}
     *
     * @throws StreamLimitException
     *             when the maxBytes limit has been reached
     * @throws IOException
     *             when an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        if (this.position >= this.maxBytes) {
            throw new StreamLimitException(this.limitErrorMessage);
        }
        final int result = this.in.read();
        if (result >= 0) {
            /*
             * Only count bytes that were really consumed : -1 signals the end
             * of the wrapped stream and must not move the position, otherwise
             * repeated reads at end of stream would end up raising a
             * StreamLimitException on a stream smaller than the limit
             */
            this.position++;
        }
        return result;
    }

    /**
     * {@inheritDoc}
     *
     * @throws StreamLimitException
     *             when the maxBytes limit has been reached
     */
    @Override
    public int read(final byte[] b) throws IOException {
        return this.read(b, 0, b.length);
    }

    /**
     * {@inheritDoc}
     * At most the number of bytes remaining before the limit is requested on
     * the wrapped stream, so the limit can never be exceeded.
     *
     * @throws StreamLimitException
     *             when the maxBytes limit has been reached
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException, StreamLimitException {
        if (this.position >= this.maxBytes) {
            throw new StreamLimitException(this.limitErrorMessage);
        }
        /* The result fits in an int as it is never greater than len */
        final long maxToRead = Math.min(len, this.maxBytes - this.position);
        final int nbRead = this.in.read(b, off, (int) maxToRead);
        if (nbRead > 0) {
            this.position += nbRead;
        }
        return nbRead;
    }

    /**
     * {@inheritDoc}
     * At most the number of bytes remaining before the limit is skipped on the
     * wrapped stream.
     *
     * @throws StreamLimitException
     *             when the maxBytes limit has been reached
     */
    @Override
    public long skip(final long n) throws IOException {
        if (this.position >= this.maxBytes) {
            throw new StreamLimitException(this.limitErrorMessage);
        }
        final long toSkip = Math.min(n, this.maxBytes - this.position);
        final long nbSkipped = this.in.skip(toSkip);
        this.position += nbSkipped;
        return nbSkipped;
    }

    /*
     * We do not override available() even when position has reached maxBytes :
     * limit reached must be signaled to the caller through a
     * StreamLimitException when reading
     */

    /**
     * Reset the wrapped stream, then restore the consumed bytes count to its
     * value at the time of the last call to mark().
     *
     * @throws IOException
     *             when the wrapped stream fails to reset
     */
    @Override
    public synchronized void reset() throws IOException {
        /*
         * Rely on the wrapped input stream to check and throw an exception if
         * the mark is invalid. When it throws, the position is left unchanged.
         */
        this.in.reset();
        /*
         * Some streams (ByteArrayInputStream for instance) accept reset()
         * without any previous mark() and go back to their initial position :
         * in that case restart counting from zero rather than from the -1
         * "no mark" value, which would allow one byte more than the limit
         */
        this.position = this.mark < 0 ? 0 : this.mark;
    }

    /**
     * Mark the current position on the wrapped stream and remember the current
     * consumed bytes count, so that it can be restored by reset().
     *
     * @param readlimit
     *            the read limit passed as is to the wrapped stream
     */
    @Override
    public synchronized void mark(final int readlimit) {
        this.in.mark(readlimit);
        this.mark = this.position;
    }
}

@ -31,8 +31,6 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.commons.fileupload.util.LimitedInputStream;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
@ -44,6 +42,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.TextParser;
import net.yacy.kelondro.util.FileUtils;
@ -174,14 +173,8 @@ public class FileLoader {
if(size < 0 && maxBytes >= 0) {
/* If content length is unknown for some reason, let's apply now the eventual size restriction */
is = new LimitedInputStream(is, maxBytes) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(
"Too big file in File crawler for URL " + request.url().toString());
}
};
is = new StrictLimitInputStream(is, maxBytes,
"Too big file in File crawler for URL " + request.url().toString());
}
// create response with stream open on content

@ -28,7 +28,6 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
@ -41,6 +40,7 @@ import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.HTTPInputStream;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
@ -239,14 +239,8 @@ public final class HTTPLoader {
contentStream = new HTTPInputStream(client);
/* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */
if(maxFileSize >= 0) {
contentStream = new LimitedInputStream(contentStream, maxFileSize) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(
"Content to download exceed maximum value of " + Formatter.bytesToString(pSizeMax));
}
};
contentStream = new StrictLimitInputStream(contentStream, maxFileSize,
"Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize));
}
}

@ -34,13 +34,13 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.parser.GenericXMLParser;
import net.yacy.document.parser.apkParser;
import net.yacy.document.parser.audioTagParser;
@ -396,14 +396,7 @@ public final class TextParser {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser do not support partial parsing within limits : let's control it here*/
InputStream limitedSource = new LimitedInputStream(sourceStream, maxBytes) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException("Reached maximum bytes to parse : " + maxBytes);
}
};
InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
}
return docs;

@ -22,7 +22,6 @@
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
@ -35,7 +34,6 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@ -43,6 +41,8 @@ import org.xml.sax.SAXException;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.StreamLimitException;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -172,14 +172,7 @@ public class GenericXMLParser extends AbstractParser implements Parser {
final Set<AnchorURL> detectedURLs = new HashSet<>();
final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs, maxLinks);
InputStream limitedSource = new LimitedInputStream(source, maxBytes) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(new SizeLimitExceededException("Reached maximum bytes to parse : " + maxBytes));
}
};
StrictLimitInputStream limitedSource = new StrictLimitInputStream(source, maxBytes);
/* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration
* (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters), */
@ -191,12 +184,14 @@ public class GenericXMLParser extends AbstractParser implements Parser {
boolean limitExceeded = false;
try {
saxParser.parse(saxSource, saxHandler);
} catch(SAXException | IOException e) {
} catch(SAXException e) {
if(!(e.getCause() instanceof SizeLimitExceededException)) {
/* Only transmit to upper layer exceptions that are not caused by the maxLinks or maxBytes limits being reached */
/* Only transmit to upper layer exceptions that are not caused by the maxLinks limit being reached */
throw e;
}
limitExceeded = true;
} catch(StreamLimitException e) {
limitExceeded = true;
}

Loading…
Cancel
Save