Support parsing gzip files from servers with redundant headers.

Some web servers send both a 'Content-Encoding : "gzip"' and a 'Content-Type : "application/x-gzip"' HTTP header for their ".gz" files. Failing on such resources was annoying: although non-conforming, they are not so uncommon (see RFC 7231 section 3.1.2.2 for the "Content-Encoding" header specification: https://tools.ietf.org/html/rfc7231#section-3.1.2.2).
8 years ago · 5a646540cc
parent 11a7f923d4
commit 5a646540cc
3 changed files with 166 additions and 50 deletions
--- a/source/net/yacy/crawler/retrieval/StreamResponse.java
+++ b/source/net/yacy/crawler/retrieval/StreamResponse.java
@ -30,7 +30,6 @@ import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
-import net.yacy.document.VocabularyScraper;

 /**
 * A crawler load response, holding content as a stream.
@ -90,31 +89,7 @@ public class StreamResponse {
 	 *             when no parser support the content
 	 */
 	public Document[] parse() throws Parser.Failure {
-		final String supportError = TextParser.supports(this.response.url(),
-				this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
-		if (supportError != null) {
-			throw new Parser.Failure("no parser support:" + supportError, this.response.url());
-		}
-		try {
-			return TextParser.parseSource(this.response.url(),
-					this.response.getResponseHeader() == null ? null
-							: this.response.getResponseHeader().getContentType(),
-					this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
-							: this.response.getResponseHeader().getCharacterEncoding(),
-					new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
-					this.response.getRequest().depth(), this.response.size(), this.contentStream);
-		} catch (final Exception e) {
-			return null;
-		} finally {
-			if (this.contentStream != null) {
-				try {
-					this.contentStream.close();
-				} catch (IOException ignored) {
-					log.warn("Could not close content stream on url " + this.response.url());
-				}
-			}
-		}
-
+		return parseWithLimits(Integer.MAX_VALUE, Long.MAX_VALUE);
 	}
 	
 	/**
@ -151,9 +126,11 @@ public class StreamResponse {
 					: this.response.getResponseHeader().getCharacterEncoding();
 			
 			return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
-					this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks,
-					maxBytes);
-		} catch (final Exception e) {
+						this.response.getRequest().timezoneOffset(), this.response.getRequest().depth(),
+						this.response.size(), this.contentStream, maxLinks, maxBytes);
+		} catch(Parser.Failure e) {
+			throw e;
+		}catch (final Exception e) {
 			return null;
 		} finally {
 			if (this.contentStream != null) {
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -49,6 +49,7 @@ import net.yacy.document.parser.csvParser;
 import net.yacy.document.parser.docParser;
 import net.yacy.document.parser.genericParser;
 import net.yacy.document.parser.gzipParser;
+import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.linkScraperParser;
 import net.yacy.document.parser.mmParser;
@ -296,6 +297,35 @@ public final class TextParser {
 						/* Try to reset the marked stream. If the failed parser has consumed too many bytes : 
 						 * too bad, the marks is invalid and process fails now with an IOException */
 						bufferedStream.reset();
+						
+						if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException 
+								&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
+							/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
+							 * let's have a chance to parse the stream as uncompressed. */
+							 /* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip", 
+							  * and "Content-type" with value such as "application/gzip".
+							 * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
+							 * that's why the gzipparser fails opening the stream. 
+							 * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
+							gzipParser gzParser = (gzipParser)parser; 
+						
+							nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
+							
+							Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+
+							try {
+								Document[] docs = gzParser.parseCompressedInputStream(location,
+										charset, timezoneOffset, depth,
+										nonCloseInputStream, maxLinks, maxBytes);
+								if (docs != null) {
+									maindoc.addSubDocuments(docs);
+								}
+								return new Document[] { maindoc };
+							} catch(Exception e1) {
+								/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
+								bufferedStream.reset();
+							}
+						}
 					}
 				}
 			} catch (IOException e) {
@ -345,6 +375,7 @@ public final class TextParser {
     * @param mimeType the mime type of the source, if known
     * @param charset the charset name of the source, if known
     * @param timezoneOffset the local time zone offset
+     * @param depth the current depth of the crawl
     * @param contentLength the length of the source, if known (else -1 should be used)
     * @param source a input stream
     * @param maxLinks the maximum total number of links to parse and add to the result documents
@ -353,9 +384,9 @@ public final class TextParser {
     * @throws Parser.Failure when the parser processing failed
     */
 	public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
-			final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks,
+			final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
 			long maxBytes) throws Parser.Failure{
-		return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength,
+		return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
 				sourceStream, maxLinks, maxBytes);
 	}
    
@ -400,6 +431,8 @@ public final class TextParser {
            	docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
            }
            return docs;
+        } catch(Parser.Failure e) {
+        	throw e;
        } catch (final Exception e) {
            throw new Parser.Failure("parser failed: " + parser.getName(), location);
        }
@ -460,8 +493,38 @@ public final class TextParser {
                		docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
                	}
                } catch (final Parser.Failure e) {
-                    failedParser.put(parser, e);
-                    //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
+					if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException && 
+							(parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
+						/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
+						 * let's have a chance to parse the stream as uncompressed. */
+						 /* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip", 
+						  * and "Content-type" with value such as "application/gzip".
+						 * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
+						 * that's why the gzipparser fails opening the stream. 
+						 * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
+						gzipParser gzParser = (gzipParser)parser;
+						
+						bis = new ByteArrayInputStream(sourceArray);
+					
+						Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+
+						try {
+							docs = gzParser.parseCompressedInputStream(location,
+									charset, timezoneOffset, depth,
+									bis, maxLinks, maxBytes);
+							if (docs != null) {
+								maindoc.addSubDocuments(docs);
+							}
+							docs = new Document[] { maindoc };
+							break;
+						} catch(Parser.Failure e1) {
+							failedParser.put(parser, e1);
+						} catch(Exception e2) {
+							failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
+						}
+					} else {
+						failedParser.put(parser, e);
+					}
                } catch (final Exception e) {
                    failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
                    //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
@ -638,8 +701,21 @@ public final class TextParser {
        return ext2mime.get(ext.toLowerCase(Locale.ROOT));
    }

-    private static String normalizeMimeType(String mimeType) {
-        if (mimeType == null) return "application/octet-stream";
+	/**
+	 * Normalize a media type information string (can be a HTTP "Content-Type"
+	 * response header) : convert to lower case, remove any supplementary
+	 * parameters such as the encoding (charset name), and provide a default
+	 * value when null.
+	 * 
+	 * @param mimeType
+	 *            raw information about media type, eventually provided by a
+	 *            HTTP "Content-Type" response header
+	 * @return a non null media type in lower case
+	 */
+    public static String normalizeMimeType(String mimeType) {
+        if (mimeType == null) {
+        	return "application/octet-stream";
+        }
        mimeType = mimeType.toLowerCase(Locale.ROOT);
        final int pos = mimeType.indexOf(';');
        return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@ -31,9 +31,12 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.MalformedURLException;
 import java.util.Date;
 import java.util.zip.GZIPInputStream;

+import org.apache.commons.compress.compressors.gzip.GzipUtils;
+
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.document.AbstractParser;
@ -42,13 +45,14 @@ import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
-import org.apache.commons.compress.compressors.gzip.GzipUtils;

 /**
 * Parses a gz archive.
 * Unzips and parses the content and adds it to the created main document
 */
 public class gzipParser extends AbstractParser implements Parser {
+	
+	private static final int DEFAULT_DEPTH = 999;

    public gzipParser() {
        super("GNU Zip Compressed Archive Parser");
@ -75,12 +79,18 @@ public class gzipParser extends AbstractParser implements Parser {
        Document maindoc = null;
        GZIPInputStream zippedContent = null;
        FileOutputStream out = null;
+        try {
+            zippedContent = new GZIPInputStream(source);
+        } catch(IOException e) {
+        	/* Use a GZIPOpeningStreamException to signal to the caller that the error occurred directly on stream opening,
+        	 * so that it can possibly apply special error handling */
+			throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
+					new GZIPOpeningStreamException());
+        }
        try {
            int read = 0;
            final byte[] data = new byte[1024];

-            zippedContent = new GZIPInputStream(source);
-
            tempFile = File.createTempFile("gunzip","tmp");

            // creating a temp file to store the uncompressed data
@ -112,11 +122,11 @@ public class gzipParser extends AbstractParser implements Parser {
        	}
        }
        try {
-            maindoc = createMainDocument(location, mimeType, charset);
+            maindoc = createMainDocument(location, mimeType, charset, this);
            // creating a new parser class to parse the unzipped content
            final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
            final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
-            Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
+            Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
            if (docs != null) maindoc.addSubDocuments(docs);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -134,15 +144,16 @@ public class gzipParser extends AbstractParser implements Parser {
     * @param location the parsed resource URL
     * @param mimeType the media type of the resource
     * @param charset the charset name if known
+     * @param parser an instance of gzipParser that is registered as the parser origin of the document
     * @return a Document instance
     */
-	private Document createMainDocument(final DigestURL location, final String mimeType, final String charset) {
+	public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
 		final String filename = location.getFileName();
 		Document maindoc = new Document(
 		        location,
 		        mimeType,
 		        charset,
-		        this,
+		        parser,
 		        null,
 		        null,
 		        AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
@ -159,6 +170,41 @@ public class gzipParser extends AbstractParser implements Parser {
 		        new Date());
 		return maindoc;
 	}
+	
+	/**
+	 * Parse content in an open stream uncompressing on the fly a gzipped resource.
+	 * The content media type is guessed from the extension of the uncompressed file name.
+	 * @param location the URL of the gzipped resource
+	 * @param charset the charset name if known
+	 * @param timezoneOffset the local time zone offset
+	 * @param depth the current depth of the crawl
+	 * @param compressedInStream an open stream uncompressing on the fly the compressed content
+	 * @param maxLinks the maximum total number of links to parse and add to the result documents
+	 * @param maxBytes the maximum number of content bytes to process
+	 * @return a list of documents that result from parsing the source, with
+	 *         empty or null text.
+	 * @throws Parser.Failure
+	 *             when the parser processing failed, or when the URL of the uncompressed
+	 *             content could not be built (the MalformedURLException is kept as cause)
+	 */
+	public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
+			final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
+        // Derive the media type of the uncompressed content from the extension of its file name
+		final String compressedFileName = location.getFileName();
+        final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
+        final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
+        try {
+        	/* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */
+    		final String locationPath = location.getPath();
+        	final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
+			final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
+			
+	        /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
+	        return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
+		} catch (MalformedURLException e) {
+			throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location, e);
+		}
+	}
    
    @Override
    public boolean isParseWithLimitsSupported() {
@ -177,21 +223,38 @@ public class gzipParser extends AbstractParser implements Parser {
        	 * before an eventual OutOfMemory occurs */
            zippedContent = new GZIPInputStream(source);
        } catch(IOException e) {
-        	throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
+        	/* Use a GZIPOpeningStreamException to signal to the caller that the error occurred directly on stream opening,
+        	 * so that it can possibly apply special error handling */
+			throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
+					new GZIPOpeningStreamException());
        }
        try {
-            maindoc = createMainDocument(location, mimeType, charset);
-            // creating a new parser class to parse the unzipped content
-            final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
-            final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
+            maindoc = createMainDocument(location, mimeType, charset, this);
            
-            /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
-            Document[] docs = TextParser.parseWithLimits(location, mime, charset, timezoneOffset, -1, zippedContent, maxLinks, maxBytes);
-            if (docs != null) maindoc.addSubDocuments(docs);
+            Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
+            if (docs != null) {
+            	maindoc.addSubDocuments(docs);
+            }
        } catch (final Exception e) {
            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
        }
        return maindoc == null ? null : new Document[]{maindoc};
    }

+    /**
+     * Used to signal an error occurred when opening a gzipped input stream.
+     * Declared static : it uses no state of an enclosing gzipParser instance.
+     */
+    public static class GZIPOpeningStreamException extends Exception {
+		/** The serialization ID */
+		private static final long serialVersionUID = 2824038185373304636L;
+		/** Creates an exception without detail message. */
+		public GZIPOpeningStreamException() {
+    		super();
+    	}
+		/** @param message the detail message describing the opening failure */
+    	public GZIPOpeningStreamException(final String message) {
+    		super(message);
+    	}
+    }
 }