diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index a2126cb4f..1678544be 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -271,24 +271,35 @@ public final class TextParser { canStream = true; } } + } else if(sourceStream instanceof ByteArrayInputStream) { + /* Also check if we have a ByteArrayInputStream as source to prevent useless bytes duplication in a new byte array */ + canStream = true; } - // if we do not have more than one non generic parser or the content size is over MaxInt (2GB) or is over the totally available memory + // if we do not have more than one non generic parser, or the content size is over MaxInt (2GB), or is over the totally available memory, + // or stream is already in memory as a ByteArrayInputStream // then we use only stream-oriented parser. if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) { try { /* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource * and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */ int rewindSize = 10 * 1024; - final BufferedInputStream bufferedStream = new BufferedInputStream(sourceStream, rewindSize); + final InputStream markableStream; + if(sourceStream instanceof ByteArrayInputStream) { + /* No nead to use a wrapping buffered stream when the source is already entirely in memory. + * What's more, ByteArrayInputStream has no read limit when marking.*/ + markableStream = sourceStream; + } else { + markableStream = new BufferedInputStream(sourceStream, rewindSize); + } /* Mark now to allow resetting the buffered stream to the beginning of the stream */ - bufferedStream.mark(rewindSize); + markableStream.mark(rewindSize); /* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */ for(Parser parser : idioms) { /* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream * and so let us eventually reuse the same opened stream with other parsers on parser failure */ - CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(bufferedStream); + CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream); try { return parseSource(location, mimeType, parser, charset, scraper, timezoneOffset, @@ -296,7 +307,7 @@ public final class TextParser { } catch (Parser.Failure e) { /* Try to reset the marked stream. If the failed parser has consumed too many bytes : * too bad, the marks is invalid and process fails now with an IOException */ - bufferedStream.reset(); + markableStream.reset(); if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException && (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) { @@ -309,7 +320,7 @@ public final class TextParser { * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/ gzipParser gzParser = (gzipParser)parser; - nonCloseInputStream = new CloseShieldInputStream(bufferedStream); + nonCloseInputStream = new CloseShieldInputStream(markableStream); Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser); @@ -323,7 +334,7 @@ public final class TextParser { return new Document[] { maindoc }; } catch(Exception e1) { /* Try again to reset the marked stream if the failed parser has not consumed too many bytes */ - bufferedStream.reset(); + markableStream.reset(); } } }