From e0f400a0bdfbd07cbf124e2cbfc25bee3281a21b Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Tue, 11 Jul 2017 09:06:37 +0200
Subject: [PATCH] Support trying multiple parsers even when streaming on large
 resources.

---
 source/net/yacy/document/TextParser.java | 77 +++++++++++++++---------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index 5a1e6b898..f3dcd89c5 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -35,6 +35,7 @@ import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.commons.fileupload.util.LimitedInputStream;
+import org.apache.commons.io.input.CloseShieldInputStream;
 
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
@@ -260,53 +261,65 @@ public final class TextParser {
         assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
 
         boolean canStream = false;
-        Parser streamParser = idioms.iterator().next();
         if(idioms.size() == 1) {
         	canStream = true;
         } else if(idioms.size() == 2) {
-        	/* When there are only 2 available parsers, stream oriented parsing can still be applied when one of the 2 parsers is the generic one*/
+        	/* When there are only 2 available parsers, stream oriented parsing can still be applied when one of the 2 parsers is the generic one */
         	for(Parser idiom : idioms) {
         		if(idiom instanceof genericParser) {
         			canStream = true;
-        		} else {
-        			/* stream oriented parsing will be performed by the non generic parser */
-        			streamParser = idiom;
-        		}
-        	}
-        } else if(idioms.size() > 2) {
-			/* Prefer the first available non generic parser */
-        	for(Parser idiom : idioms) {
-        		if(!(idiom instanceof genericParser)) {
-        			streamParser = idiom;
-        			break;
         		}
         	}
         }
         
         // if we do not have more than one non generic parser or the content size is over MaxInt (2GB) or is over the totally available memory
-        // then we use only one stream-oriented parser.
-        if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
-            // use a specific stream-oriented parser
-            return parseSource(location, mimeType, streamParser, charset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
-        }
+        // then we use only stream-oriented parser.
+		if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
+			try {
+				/* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
+				 * and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
+				int rewindSize = 10 * 1024;
+				final BufferedInputStream bufferedStream = new BufferedInputStream(sourceStream, rewindSize);
+				/* Mark now to allow resetting the buffered stream to the beginning of the stream */
+				bufferedStream.mark(rewindSize);
+				
+				/* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
+				for(Parser parser : idioms) {
+					/* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream 
+					 * and so let us eventually reuse the same opened stream with other parsers on parser failure */
+					CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
+					
+					try {
+						return parseSource(location, mimeType, parser, charset, scraper, timezoneOffset,
+								nonCloseInputStream, maxLinks, maxBytes);
+					} catch (Parser.Failure e) {
+						/* Try to reset the marked stream. If the failed parser has consumed too many bytes : 
+						 * too bad, the marks is invalid and process fails now with an IOException */
+						bufferedStream.reset();
+					}
+				}
+			} catch (IOException e) {
+				throw new Parser.Failure("Error reading source", location);
+			}
+		}
 
         // in case that we know more parsers we first transform the content into a byte[] and use that as base
         // for a number of different parse attempts.
+		
+		int maxBytesToRead = -1;
+		if(maxBytes < Integer.MAX_VALUE) {
+			/* Load at most maxBytes + 1 :
+		       - to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure
+		       - but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
+			maxBytesToRead = (int)maxBytes + 1;
+		}
+		if(contentLength >= 0 && contentLength < maxBytesToRead) {
+			maxBytesToRead = (int)contentLength;
+		}
         
-        /* Content length may be known from headers : check it now */
-        if(contentLength >= 0 && contentLength > maxBytes) {
-        	throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
-        }
         byte[] b = null;
         try {
-            b = FileUtils.read(sourceStream, (int) contentLength);
-            
-            /* Check content size now if contentLength was unknown */
-            if(contentLength < 0) {
-            	if(b.length > maxBytes) {
-            		throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);		
-            	}
-            }
+            b = FileUtils.read(sourceStream, maxBytesToRead);
         } catch (final IOException e) {
             throw new Parser.Failure(e.getMessage(), location);
         }
@@ -447,6 +460,10 @@ public final class TextParser {
                 	if(parser.isParseWithLimitsSupported()) {
                 		docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, bis, maxLinks, maxBytes);
                 	} else {
+                        /* Partial parsing is not supported by this parser : check content length now */
+                       	if(sourceArray.length > maxBytes) {
+                       		throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);		
+                       	}
                 		docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
                 	}
                 } catch (final Parser.Failure e) {