@@ -49,6 +49,7 @@ import net.yacy.document.parser.csvParser;
 import net.yacy.document.parser.docParser;
 import net.yacy.document.parser.genericParser;
 import net.yacy.document.parser.gzipParser;
+import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.linkScraperParser;
 import net.yacy.document.parser.mmParser;
@@ -296,6 +297,35 @@ public final class TextParser {
                     /* Try to reset the marked stream. If the failed parser has consumed too many bytes :
                      * too bad, the marks is invalid and process fails now with an IOException */
                     bufferedStream.reset();
+                    if (parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
+                            && (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
+                        /* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
+                         * let's have a chance to parse the stream as uncompressed. */
+                        /* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
+                         * and "Content-type" with value such as "application/gzip".
+                         * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
+                         * that's why the gzipparser fails opening the stream.
+                         * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2) */
+                        gzipParser gzParser = (gzipParser) parser;
+                        nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
+                        Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                        try {
+                            Document[] docs = gzParser.parseCompressedInputStream(location,
+                                    charset, timezoneOffset, depth,
+                                    nonCloseInputStream, maxLinks, maxBytes);
+                            if (docs != null) {
+                                maindoc.addSubDocuments(docs);
+                            }
+                            return new Document[] { maindoc };
+                        } catch (Exception e1) {
+                            /* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
+                            bufferedStream.reset();
+                        }
+                    }
                 }
             }
         } catch (IOException e) {
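
For context on why this fallback exists: when a server signals gzip twice ("Content-Encoding: gzip" plus a gzip "Content-Type"), the HTTP client has usually already decompressed the body in transit, so the bytes handed to the gzip parser no longer start with a GZIP header and opening them as gzip fails. The following minimal, self-contained sketch shows that failure mode and the mark/reset retry idea used above; it is illustration only (class and method names are made up), not YaCy code.

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class GzipFallbackSketch {

    /** Read everything, first trying GZIP, then falling back to the raw bytes. */
    static byte[] readPossiblyGzipped(InputStream in) throws IOException {
        BufferedInputStream buffered = new BufferedInputStream(in);
        buffered.mark(Integer.MAX_VALUE); // remember the start so we can retry
        try {
            // GZIPInputStream reads and checks the header in its constructor and
            // throws an IOException (ZipException) when the magic bytes are missing.
            return readAll(new GZIPInputStream(buffered));
        } catch (IOException notGzip) {
            // Opening failed (e.g. the transport layer already decompressed the body):
            // rewind and treat the stream as uncompressed, as the hunk above does.
            buffered.reset();
            return readAll(buffered);
        }
    }

    static byte[] readAll(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] chunk = new byte[4096];
        int n;
        while ((n = in.read(chunk)) != -1) out.write(chunk, 0, n);
        return out.toByteArray();
    }

    public static void main(String[] args) throws IOException {
        byte[] plain = "<html><body>hello</body></html>".getBytes("UTF-8");
        // A body that really is gzip-compressed opens fine...
        ByteArrayOutputStream gz = new ByteArrayOutputStream();
        try (GZIPOutputStream gzOut = new GZIPOutputStream(gz)) { gzOut.write(plain); }
        System.out.println(new String(readPossiblyGzipped(new ByteArrayInputStream(gz.toByteArray())), "UTF-8"));
        // ...while an already-decompressed body makes GZIPInputStream fail and takes the fallback path.
        System.out.println(new String(readPossiblyGzipped(new ByteArrayInputStream(plain)), "UTF-8"));
    }
}
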
@@ -345,6 +375,7 @@ public final class TextParser {
      * @param mimeType the mime type of the source, if known
      * @param charset the charset name of the source, if known
      * @param timezoneOffset the local time zone offset
+     * @param depth the current depth of the crawl
      * @param contentLength the length of the source, if known (else -1 should be used)
      * @param source a input stream
      * @param maxLinks the maximum total number of links to parse and add to the result documents
@@ -353,9 +384,9 @@ public final class TextParser {
      * @throws Parser.Failure when the parser processing failed
      */
     public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
-            final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks,
+            final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
             long maxBytes) throws Parser.Failure {
-        return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength,
+        return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
                 sourceStream, maxLinks, maxBytes);
     }
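
A possible way to call the widened signature, shown only as a sketch: the helper below and its limit values are made up, and the import paths are assumed from the surrounding YaCy code. The one real point is that the crawl depth is now forwarded instead of the previous hard-coded 0.

import java.io.InputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;

public class ParseWithLimitsExample {
    /**
     * Hypothetical helper: parse one fetched resource with limits, forwarding the
     * crawl depth (new in this change) instead of the previous hard-coded 0.
     */
    static Document[] parseFetched(DigestURL location, String mimeType, String charset,
            int depth, InputStream body) throws Parser.Failure {
        final int timezoneOffset = 0;       // assumed local offset
        final long contentLength = -1;      // unknown, as the javadoc above suggests
        final int maxLinks = 1000;          // arbitrary example limits
        final long maxBytes = 1024L * 1024L;
        return TextParser.parseWithLimits(location, mimeType, charset,
                timezoneOffset, depth, contentLength, body, maxLinks, maxBytes);
    }
}
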
@@ -400,6 +431,8 @@ public final class TextParser {
                 docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
             }
             return docs;
+        } catch (Parser.Failure e) {
+            throw e;
         } catch (final Exception e) {
             throw new Parser.Failure("parser failed: " + parser.getName(), location);
         }
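
The two added lines rethrow a Parser.Failure unchanged instead of letting the generic catch re-wrap it, so the original message and cause stay visible to callers — presumably what lets the e.getCause() instanceof GZIPOpeningStreamException checks elsewhere in this patch work. A small self-contained illustration of that rethrow-before-wrap pattern, using a stand-in exception class rather than YaCy's Parser.Failure:

public class RethrowBeforeWrap {
    static class ParseFailure extends Exception {
        ParseFailure(String msg, Throwable cause) { super(msg, cause); }
    }

    static void parse(boolean failSpecifically) throws ParseFailure {
        try {
            if (failSpecifically) {
                // a failure that already carries a meaningful cause
                throw new ParseFailure("gzip header missing", new java.util.zip.ZipException("Not in GZIP format"));
            }
            throw new IllegalStateException("some unexpected bug");
        } catch (ParseFailure e) {
            throw e; // preserve the original message and cause
        } catch (Exception e) {
            throw new ParseFailure("parser failed", e); // wrap only the unexpected case
        }
    }

    public static void main(String[] args) {
        for (boolean specific : new boolean[] { true, false }) {
            try {
                parse(specific);
            } catch (ParseFailure e) {
                System.out.println(e.getMessage() + " / cause: " + e.getCause());
            }
        }
    }
}
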
@@ -460,8 +493,38 @@ public final class TextParser {
                 docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
             }
         } catch (final Parser.Failure e) {
-            failedParser.put(parser, e);
-            //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
+            if (parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&
+                    (parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
+                /* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
+                 * let's have a chance to parse the stream as uncompressed. */
+                /* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
+                 * and "Content-type" with value such as "application/gzip".
+                 * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
+                 * that's why the gzipparser fails opening the stream.
+                 * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2) */
+                gzipParser gzParser = (gzipParser) parser;
+                bis = new ByteArrayInputStream(sourceArray);
+                Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                try {
+                    docs = gzParser.parseCompressedInputStream(location,
+                            charset, timezoneOffset, depth,
+                            bis, maxLinks, maxBytes);
+                    if (docs != null) {
+                        maindoc.addSubDocuments(docs);
+                    }
+                    docs = new Document[] { maindoc };
+                    break;
+                } catch (Parser.Failure e1) {
+                    failedParser.put(parser, e1);
+                } catch (Exception e2) {
+                    failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
+                }
+            } else {
+                failedParser.put(parser, e);
+            }
         } catch (final Exception e) {
             failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
             //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
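
In this byte-array code path the whole body already sits in sourceArray, so the retry simply wraps it in a fresh ByteArrayInputStream. As an aside, whether such a fallback would be needed can also be seen from the GZIP magic number; the sketch below is illustration only and not part of the patch, which relies on GZIPOpeningStreamException instead.

import java.util.zip.GZIPInputStream;

/** Illustration only: a payload served with a gzip Content-Type but already
 *  decompressed in transit no longer starts with the GZIP magic bytes 0x1f 0x8b. */
public class GzipMagicCheck {
    static boolean looksGzipped(byte[] body) {
        return body.length >= 2
                && (body[0] & 0xff) == (GZIPInputStream.GZIP_MAGIC & 0xff)         // 0x1f
                && (body[1] & 0xff) == ((GZIPInputStream.GZIP_MAGIC >> 8) & 0xff); // 0x8b
    }

    public static void main(String[] args) {
        System.out.println(looksGzipped(new byte[] { 0x1f, (byte) 0x8b, 8, 0 })); // true: real gzip header
        System.out.println(looksGzipped("<html></html>".getBytes()));             // false: already decompressed
    }
}
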
@@ -638,8 +701,21 @@ public final class TextParser {
         return ext2mime.get(ext.toLowerCase(Locale.ROOT));
     }
 
-    private static String normalizeMimeType(String mimeType) {
-        if (mimeType == null) return "application/octet-stream";
+    /**
+     * Normalize a media type information string (can be a HTTP "Content-Type"
+     * response header) : convert to lower case, remove any supplementary
+     * parameters such as the encoding (charset name), and provide a default
+     * value when null.
+     *
+     * @param mimeType
+     *            raw information about media type, eventually provided by a
+     *            HTTP "Content-Type" response header
+     * @return a non null media type in lower case
+     */
+    public static String normalizeMimeType(String mimeType) {
+        if (mimeType == null) {
+            return "application/octet-stream";
+        }
         mimeType = mimeType.toLowerCase(Locale.ROOT);
         final int pos = mimeType.indexOf(';');
         return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
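
Since normalizeMimeType is now public, callers can use it to strip Content-Type parameters before any mime lookup. A quick sketch of the behaviour documented above (the input strings are made-up examples):

import net.yacy.document.TextParser;

/** Quick check of the normalization rules documented above. */
public class NormalizeMimeTypeDemo {
    public static void main(String[] args) {
        System.out.println(TextParser.normalizeMimeType("text/HTML; charset=UTF-8")); // text/html
        System.out.println(TextParser.normalizeMimeType(" application/xhtml+xml "));  // application/xhtml+xml
        System.out.println(TextParser.normalizeMimeType(null));                       // application/octet-stream
    }
}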