From 5a646540ccecd7e4d1386902b89e2348493d05ce Mon Sep 17 00:00:00 2001
From: luccioman
Date: Sun, 16 Jul 2017 14:46:46 +0200
Subject: [PATCH] Support parsing gzip files from servers with redundant
 headers.

Some web servers provide both 'Content-Encoding: "gzip"' and
'Content-Type: "application/x-gzip"' HTTP headers on their ".gz" files.
Failing on such resources was annoying, because they are not so uncommon,
even though they do not conform to the specification (see RFC 7231
section 3.1.2.2 for the "Content-Encoding" header:
https://tools.ietf.org/html/rfc7231#section-3.1.2.2).
---
 .../crawler/retrieval/StreamResponse.java     | 35 ++-----
 source/net/yacy/document/TextParser.java      | 88 ++++++++++++++++--
 .../net/yacy/document/parser/gzipParser.java  | 93 ++++++++++++++++---
 3 files changed, 166 insertions(+), 50 deletions(-)
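Reviewer note, illustration only: the failure mode addressed by this patch
can be reproduced with JDK classes alone. When the HTTP client has already
decoded the body because of the "Content-Encoding: gzip" header, the bytes
handed to the parser no longer start with the gzip magic number (0x1f 0x8b),
so constructing a GZIPInputStream fails immediately. A minimal sketch of a
magic-byte probe (hypothetical helper, nothing below is YaCy code):

    import java.io.BufferedInputStream;
    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.zip.GZIPInputStream;

    /** Hypothetical helper: detect whether a stream still carries
     *  gzip-compressed bytes or was already decoded by the HTTP layer. */
    public class GzipProbe {

        /** Wraps the stream in a GZIPInputStream only when the gzip magic
         *  bytes are present, otherwise returns it rewound to its first byte. */
        public static InputStream openMaybeGzipped(final InputStream source) throws IOException {
            final BufferedInputStream buffered = new BufferedInputStream(source);
            buffered.mark(2);
            final int b1 = buffered.read();
            final int b2 = buffered.read();
            buffered.reset(); // rewind so either reader starts at the first byte
            return (b1 == 0x1f && b2 == 0x8b) ? new GZIPInputStream(buffered) : buffered;
        }

        public static void main(final String[] args) throws IOException {
            // A body already decoded by the HTTP client: no gzip magic left.
            final InputStream body = new ByteArrayInputStream("plain text".getBytes("UTF-8"));
            System.out.println(openMaybeGzipped(body) instanceof GZIPInputStream); // false
        }
    }

The patch takes a different route: it lets GZIPInputStream fail, tags the
failure with a GZIPOpeningStreamException cause, and retries the marked
stream as uncompressed content.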
diff --git a/source/net/yacy/crawler/retrieval/StreamResponse.java b/source/net/yacy/crawler/retrieval/StreamResponse.java
index 7f99b0135..d52614e4f 100644
--- a/source/net/yacy/crawler/retrieval/StreamResponse.java
+++ b/source/net/yacy/crawler/retrieval/StreamResponse.java
@@ -30,7 +30,6 @@ import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
-import net.yacy.document.VocabularyScraper;
 
 /**
  * A crawler load response, holding content as a stream.
@@ -90,31 +89,7 @@ public class StreamResponse {
      * when no parser supports the content
      */
     public Document[] parse() throws Parser.Failure {
-        final String supportError = TextParser.supports(this.response.url(),
-                this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
-        if (supportError != null) {
-            throw new Parser.Failure("no parser support:" + supportError, this.response.url());
-        }
-        try {
-            return TextParser.parseSource(this.response.url(),
-                    this.response.getResponseHeader() == null ? null
-                            : this.response.getResponseHeader().getContentType(),
-                    this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
-                            : this.response.getResponseHeader().getCharacterEncoding(),
-                    new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
-                    this.response.getRequest().depth(), this.response.size(), this.contentStream);
-        } catch (final Exception e) {
-            return null;
-        } finally {
-            if (this.contentStream != null) {
-                try {
-                    this.contentStream.close();
-                } catch (IOException ignored) {
-                    log.warn("Could not close content stream on url " + this.response.url());
-                }
-            }
-        }
-
+        return parseWithLimits(Integer.MAX_VALUE, Long.MAX_VALUE);
     }
 
     /**
@@ -151,9 +126,11 @@ public class StreamResponse {
                     : this.response.getResponseHeader().getCharacterEncoding();
 
             return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
-                    this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks,
-                    maxBytes);
-        } catch (final Exception e) {
+                    this.response.getRequest().timezoneOffset(), this.response.getRequest().depth(),
+                    this.response.size(), this.contentStream, maxLinks, maxBytes);
+        } catch (final Parser.Failure e) {
+            throw e;
+        } catch (final Exception e) {
             return null;
         } finally {
             if (this.contentStream != null) {
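Reviewer note, illustration only: with the new "depth" parameter threaded
through, StreamResponse.parse() is now simply parseWithLimits with unbounded
limits. A hedged sketch of calling the extended TextParser.parseWithLimits
signature (the URL, stream content and limit values are made up, and this
assumes DigestURL offers a plain String constructor):

    import java.io.ByteArrayInputStream;
    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.document.Document;
    import net.yacy.document.TextParser;

    public class ParseWithLimitsExample {
        public static void main(final String[] args) throws Exception {
            final DigestURL location = new DigestURL("http://example.com/sample.html"); // made-up URL
            final InputStream content = new ByteArrayInputStream(
                    "<html><body>sample</body></html>".getBytes(StandardCharsets.UTF_8));
            /* contentLength -1 = unknown; depth 0 = document at the crawl root;
             * at most 1000 links and 1 MB of content are processed */
            final Document[] docs = TextParser.parseWithLimits(location, "text/html",
                    StandardCharsets.UTF_8.name(), 0, 0, -1L, content, 1000, 1024L * 1024L);
            System.out.println(docs.length);
        }
    }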
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index bb6c6fb2b..a2126cb4f 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -49,6 +49,7 @@ import net.yacy.document.parser.csvParser;
 import net.yacy.document.parser.docParser;
 import net.yacy.document.parser.genericParser;
 import net.yacy.document.parser.gzipParser;
+import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.linkScraperParser;
 import net.yacy.document.parser.mmParser;
@@ -296,6 +297,35 @@ public final class TextParser {
                     /* Try to reset the marked stream. If the failed parser has consumed too many bytes:
                      * too bad, the mark is invalid and the process now fails with an IOException */
                     bufferedStream.reset();
+
+                    if (parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
+                            && (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
+                        /* The gzip parser failed directly when opening the content stream: before falling back to the generic parser,
+                         * give the stream a chance to be parsed as uncompressed content. */
+                        /* Indeed, this can be the case of a misconfigured web server providing both a "Content-Encoding" header with value "gzip"
+                         * and a "Content-Type" header with a value such as "application/gzip".
+                         * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
+                         * which is why the gzipParser fails when opening the stream.
+                         * (see RFC 7231 section 3.1.2.2 for the "Content-Encoding" header specification: https://tools.ietf.org/html/rfc7231#section-3.1.2.2) */
+                        gzipParser gzParser = (gzipParser) parser;
+
+                        nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
+
+                        Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+
+                        try {
+                            Document[] docs = gzParser.parseCompressedInputStream(location,
+                                    charset, timezoneOffset, depth,
+                                    nonCloseInputStream, maxLinks, maxBytes);
+                            if (docs != null) {
+                                maindoc.addSubDocuments(docs);
+                            }
+                            return new Document[] { maindoc };
+                        } catch (final Exception e1) {
+                            /* Try again to reset the marked stream, if the failed parser has not consumed too many bytes */
+                            bufferedStream.reset();
+                        }
+                    }
                 }
             }
         } catch (IOException e) {
@@ -345,6 +375,7 @@
      * @param mimeType the mime type of the source, if known
      * @param charset the charset name of the source, if known
      * @param timezoneOffset the local time zone offset
+     * @param depth the current depth of the crawl
      * @param contentLength the length of the source, if known (else -1 should be used)
      * @param source an input stream
      * @param maxLinks the maximum total number of links to parse and add to the result documents
@@ -353,9 +384,9 @@
      * @throws Parser.Failure when the parser processing failed
      */
     public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
-            final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks,
+            final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
             long maxBytes) throws Parser.Failure {
-        return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength,
+        return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
                 sourceStream, maxLinks, maxBytes);
     }
@@ -400,6 +431,8 @@
                 docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
             }
             return docs;
+        } catch (final Parser.Failure e) {
+            throw e;
         } catch (final Exception e) {
             throw new Parser.Failure("parser failed: " + parser.getName(), location);
         }
@@ -460,8 +493,38 @@
                 docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
             }
         } catch (final Parser.Failure e) {
-            failedParser.put(parser, e);
-            //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
+            if (parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
+                    && (parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
+                /* The gzip parser failed directly when opening the content stream: before falling back to the generic parser,
+                 * give the stream a chance to be parsed as uncompressed content. */
+                /* Indeed, this can be the case of a misconfigured web server providing both a "Content-Encoding" header with value "gzip"
+                 * and a "Content-Type" header with a value such as "application/gzip".
+                 * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
+                 * which is why the gzipParser fails when opening the stream.
+                 * (see RFC 7231 section 3.1.2.2 for the "Content-Encoding" header specification: https://tools.ietf.org/html/rfc7231#section-3.1.2.2) */
+                gzipParser gzParser = (gzipParser) parser;
+
+                bis = new ByteArrayInputStream(sourceArray);
+
+                Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+
+                try {
+                    docs = gzParser.parseCompressedInputStream(location,
+                            charset, timezoneOffset, depth,
+                            bis, maxLinks, maxBytes);
+                    if (docs != null) {
+                        maindoc.addSubDocuments(docs);
+                    }
+                    docs = new Document[] { maindoc };
+                    break;
+                } catch (final Parser.Failure e1) {
+                    failedParser.put(parser, e1);
+                } catch (final Exception e2) {
+                    failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
+                }
+            } else {
+                failedParser.put(parser, e);
+            }
         } catch (final Exception e) {
             failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
             //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
@@ -638,8 +701,21 @@
         return ext2mime.get(ext.toLowerCase(Locale.ROOT));
     }
 
-    private static String normalizeMimeType(String mimeType) {
-        if (mimeType == null) return "application/octet-stream";
+    /**
+     * Normalize a media type information string (which can be an HTTP "Content-Type"
+     * response header): convert it to lower case, remove any supplementary
+     * parameters such as the encoding (charset name), and provide a default
+     * value when null.
+     *
+     * @param mimeType raw information about the media type, possibly provided by an
+     *            HTTP "Content-Type" response header
+     * @return a non-null media type in lower case
+     */
+    public static String normalizeMimeType(String mimeType) {
+        if (mimeType == null) {
+            return "application/octet-stream";
+        }
         mimeType = mimeType.toLowerCase(Locale.ROOT);
         final int pos = mimeType.indexOf(';');
         return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
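Reviewer note, illustration only: the retry logic added above hinges on
mark()/reset() of the buffered stream. A self-contained model of that
control flow using JDK classes only, with the real parsing replaced by
reading the stream into a string:

    import java.io.BufferedInputStream;
    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.zip.GZIPInputStream;

    public class GzipFallbackDemo {

        static String readAll(final InputStream in) throws IOException {
            final StringBuilder text = new StringBuilder();
            int b;
            while ((b = in.read()) >= 0) text.append((char) b);
            return text.toString();
        }

        public static void main(final String[] args) throws IOException {
            // Body already decompressed by the HTTP client despite a ".gz" URL:
            final BufferedInputStream buffered = new BufferedInputStream(
                    new ByteArrayInputStream("hello".getBytes("UTF-8")));
            buffered.mark(1024); // remember the start so a retry is possible
            String text;
            try {
                // First attempt: treat the stream as gzip, as gzipParser does.
                text = readAll(new GZIPInputStream(buffered));
            } catch (final IOException notGzip) {
                // Equivalent of catching a GZIPOpeningStreamException cause:
                // rewind and consume the very same bytes as uncompressed content.
                buffered.reset();
                text = readAll(buffered);
            }
            System.out.println(text); // prints: hello
        }
    }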
diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java
index 2ffc2270f..f1a43e0a2 100644
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@@ -31,9 +31,12 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.MalformedURLException;
 import java.util.Date;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.commons.compress.compressors.gzip.GzipUtils;
+
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.document.AbstractParser;
@@ -42,13 +45,14 @@ import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
-import org.apache.commons.compress.compressors.gzip.GzipUtils;
 
 /**
  * Parses a gz archive.
  * Unzips and parses the content and adds it to the created main document.
  */
 public class gzipParser extends AbstractParser implements Parser {
+
+    private static final int DEFAULT_DEPTH = 999;
 
     public gzipParser() {
         super("GNU Zip Compressed Archive Parser");
@@ -75,12 +79,18 @@
         Document maindoc = null;
         GZIPInputStream zippedContent = null;
         FileOutputStream out = null;
+        try {
+            zippedContent = new GZIPInputStream(source);
+        } catch (IOException e) {
+            /* Use a GZIPOpeningStreamException to signal the caller that the error occurred directly on stream opening,
+             * so that special error handling may possibly be applied */
+            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
+                    new GZIPOpeningStreamException());
+        }
         try {
             int read = 0;
             final byte[] data = new byte[1024];
-            zippedContent = new GZIPInputStream(source);
-
             tempFile = File.createTempFile("gunzip", "tmp"); // creating a temp file to store the uncompressed data
@@ -112,11 +122,11 @@
             }
         }
         try {
-            maindoc = createMainDocument(location, mimeType, charset);
+            maindoc = createMainDocument(location, mimeType, charset, this);
             // creating a new parser class to parse the unzipped content
             final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
             final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
-            Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
+            Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
             if (docs != null) maindoc.addSubDocuments(docs);
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
@@ -134,15 +144,16 @@
      * @param location the parsed resource URL
      * @param mimeType the media type of the resource
      * @param charset the charset name if known
+     * @param parser an instance of gzipParser to be registered as the parser origin of the document
      * @return a Document instance
      */
-    private Document createMainDocument(final DigestURL location, final String mimeType, final String charset) {
+    public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
         final String filename = location.getFileName();
         Document maindoc = new Document(
                 location,
                 mimeType,
                 charset,
-                this,
+                parser,
                 null,
                 null,
                 AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
@@ -159,6 +170,41 @@
                 new Date());
         return maindoc;
     }
+
+    /**
+     * Parse the content of an open stream that uncompresses a gzipped resource on the fly.
+     * @param location the URL of the gzipped resource
+     * @param charset the charset name if known
+     * @param timezoneOffset the local time zone offset
+     * @param depth the current depth of the crawl
+     * @param compressedInStream an open stream uncompressing the compressed content on the fly
+     * @param maxLinks the maximum total number of links to parse and add to the result documents
+     * @param maxBytes the maximum number of content bytes to process
+     * @return a list of documents that result from parsing the source, with empty or null text
+     * @throws Parser.Failure when the parser processing failed
+     */
+    public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
+            final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
+        // creating a new parser class to parse the unzipped content
+        final String compressedFileName = location.getFileName();
+        final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
+        final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
+        try {
+            /* Use the uncompressed file name for sub parsers, to avoid unnecessarily applying the gzipParser again */
+            final String locationPath = location.getPath();
+            final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
+            final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
+
+            /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
+            return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
+        } catch (MalformedURLException e) {
+            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
+        }
+    }
 
     @Override
     public boolean isParseWithLimitsSupported() {
@@ -177,21 +223,38 @@
              * before an eventual OutOfMemory occurs */
             zippedContent = new GZIPInputStream(source);
         } catch (IOException e) {
-            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
+            /* Use a GZIPOpeningStreamException to signal the caller that the error occurred directly on stream opening,
+             * so that special error handling may possibly be applied */
+            throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
+                    new GZIPOpeningStreamException());
         }
         try {
-            maindoc = createMainDocument(location, mimeType, charset);
-            // creating a new parser class to parse the unzipped content
-            final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
-            final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
+            maindoc = createMainDocument(location, mimeType, charset, this);
 
-            /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
-            Document[] docs = TextParser.parseWithLimits(location, mime, charset, timezoneOffset, -1, zippedContent, maxLinks, maxBytes);
-            if (docs != null) maindoc.addSubDocuments(docs);
+            Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
+            if (docs != null) {
+                maindoc.addSubDocuments(docs);
+            }
         } catch (final Exception e) {
             throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
         }
         return maindoc == null ? null : new Document[]{maindoc};
     }
 
+    /**
+     * Signals that an error occurred when opening a gzipped input stream.
+     */
+    public static class GZIPOpeningStreamException extends Exception {
+
+        /** The serialization ID */
+        private static final long serialVersionUID = 2824038185373304636L;
+
+        public GZIPOpeningStreamException() {
+            super();
+        }
+
+        public GZIPOpeningStreamException(final String message) {
+            super(message);
+        }
+    }
 }
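Closing reviewer note on the design: GZIPOpeningStreamException carries no
state of its own; it only travels as the cause of a Parser.Failure, so that
callers can tell "could not even open the stream" apart from "failed in
mid-parse" without any change to the public Parser API. A self-contained
sketch of this marker-cause pattern (the class names below are stand-ins,
not YaCy types):

    /** Stand-in for Parser.Failure: a failure carrying an optional cause. */
    class Failure extends Exception {
        Failure(final String message, final Throwable cause) { super(message, cause); }
    }

    /** Marker exception: the error happened while opening the stream. */
    class OpeningStreamException extends Exception {
    }

    public class MarkerCauseDemo {

        static void open(final boolean looksLikeGzip) throws Failure {
            if (!looksLikeGzip) {
                // Attach the marker as the cause so callers can special-case it.
                throw new Failure("Unexpected error while parsing gzip file.",
                        new OpeningStreamException());
            }
        }

        public static void main(final String[] args) {
            try {
                open(false);
            } catch (final Failure e) {
                // The caller inspects the cause to decide whether retrying the
                // content as uncompressed makes sense, as TextParser now does.
                System.out.println(e.getCause() instanceof OpeningStreamException); // true
            }
        }
    }

An alternative would have been a dedicated Parser.Failure subclass; keeping
the marker in the cause chain leaves existing catch blocks untouched.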