As suggested by LA_FORGE on mantis 781 (http://mantis.tokeek.de/view.php?id=781)

pull/218/head
parent 8ce9c066bf
commit 685122363d
@@ -0,0 +1,10 @@
Licensing of XZ for Java
========================

All the files in this package have been written by Lasse Collin
and/or Igor Pavlov. All these files have been put into the
public domain. You can do whatever you want with these files.

This software is provided "as is", without any warranty.
Binary file not shown.
@@ -0,0 +1,187 @@
// AbstractCompressorParser.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorInputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;

/**
 * Base class for parsing compressed files, relying on Apache commons-compress
 * tools.
 */
public abstract class AbstractCompressorParser extends AbstractParser implements Parser {

    /** Crawl depth applied when parsing internal compressed content */
    protected static final int DEFAULT_DEPTH = 999;

    /**
     * @param name the human-readable name of the parser
     */
    public AbstractCompressorParser(final String name) {
        super(name);
    }

    /**
     * @param source an open input stream on a compressed source
     * @return a subclass of CompressorInputStream capable of uncompressing the
     *         source on the fly
     * @throws IOException when an error occurs while opening the compressed
     *                     stream
     */
    protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException;

    /**
     * Maps the given name of a compressed file to the name that the file should
     * have after uncompression. For example, for "file.txt.xz", "file.txt" is
     * returned.
     *
     * @param filename name of a compressed file
     * @return name of the corresponding uncompressed file
     */
    protected abstract String getUncompressedFilename(final String filename);

    @Override
    public Document[] parse(final DigestURL location, final String mimeType, final String charset,
            final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
            final InputStream source) throws Parser.Failure, InterruptedException {

        return parseWithLimits(location, mimeType, charset, ignoreClassNames, scraper, timezoneOffset, source,
                Integer.MAX_VALUE, Long.MAX_VALUE);
    }

    @Override
    public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
            final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
            final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
        Document maindoc;
        final CompressorInputStream compressedInStream;
        try {
            compressedInStream = createDecompressStream(source);
        } catch (final IOException | RuntimeException e) {
            throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
        }

        try {
            // create the main document for this archive, registered with the supplied URL and mime type
            maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);

            final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
                    AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
            if (docs != null) {
                maindoc.addSubDocuments(docs);
                if (docs.length > 0 && docs[0].isPartiallyParsed()) {
                    maindoc.setPartiallyParsed(true);
                }
            }
        } catch (final Parser.Failure e) {
            throw e;
        } catch (final IOException | RuntimeException e) {
            throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
        }
        return new Document[] { maindoc };
    }

    /**
     * Creates the main parsed document for the compressed document at the given
     * URL and media type.
     *
     * @param location the parsed resource URL
     * @param mimeType the media type of the resource
     * @param charset  the charset name if known
     * @param parser   the parser instance registered as the parser origin of the
     *                 document
     * @return a Document instance
     */
    protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
            final AbstractCompressorParser parser) {
        final String filename = location.getFileName();
        return new Document(location, mimeType, charset, parser, null, null,
                AbstractParser
                        .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
                null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
    }

    /**
     * Parses content from an open stream, uncompressing the compressed resource
     * on the fly.
     *
     * @param location           the URL of the compressed resource
     * @param charset            the charset name if known
     * @param ignoreClassNames   an optional set of CSS class names whose matching
     *                           HTML elements' content should be ignored
     * @param timezoneOffset     the local time zone offset
     * @param depth              the crawl depth applied to the compressed content
     * @param compressedInStream an open stream uncompressing the compressed
     *                           content on the fly
     * @param maxLinks           the maximum total number of links to parse and
     *                           add to the resulting documents
     * @param maxBytes           the maximum number of content bytes to process
     * @return a list of documents resulting from parsing the source, with empty
     *         or null text
     * @throws Parser.Failure when the parser processing failed
     */
    protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
            final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
            final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes)
            throws Parser.Failure {
        final String compressedFileName = location.getFileName();
        final String contentfilename = getUncompressedFilename(compressedFileName);
        final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
        try {
            /*
             * Use the uncompressed file name for sub-parsers so that this same
             * uncompressing parser is not needlessly applied again
             */
            final String locationPath = location.getPath();
            final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
                    + contentfilename;
            final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
                    location.getPort(), contentPath);

            /*
             * Rely on the supporting parsers to respect the maxLinks and maxBytes limits
             * on the compressed content
             */
            return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
                    -1, compressedInStream, maxLinks, maxBytes);
        } catch (final MalformedURLException e) {
            throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
        }
    }

    @Override
    public boolean isParseWithLimitsSupported() {
        return true;
    }

}
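The contract of this base class is small: a subclass supplies a codec-specific stream factory and a filename mapping, and the base class handles main-document creation and delegation to the sub-parsers. For illustration only (not part of this commit), here is a minimal sketch of a hypothetical subclass wiring another commons-compress codec through the same two hooks; GzipCompressorInputStream and GzipUtils are standard commons-compress classes, while the class name, parser name, and registered extensions are invented for the example (YaCy's actual gzip handling is a separate, pre-existing parser):

// Hypothetical sketch, not part of this commit: a gzip-backed subclass.
package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;

public class ExampleGzipParser extends AbstractCompressorParser {

    public ExampleGzipParser() {
        super("Example gzip Parser"); // illustrative name
        this.SUPPORTED_EXTENSIONS.add("gz");
        this.SUPPORTED_MIME_TYPES.add("application/gzip");
    }

    @Override
    protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
        // false: do not decode further concatenated gzip members
        return new GzipCompressorInputStream(source, false);
    }

    @Override
    protected String getUncompressedFilename(final String filename) {
        // e.g. "page.html.gz" -> "page.html"
        return GzipUtils.getUncompressedFilename(filename);
    }
}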
@@ -0,0 +1,66 @@
// XZParser.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZUtils;

import net.yacy.kelondro.util.MemoryControl;

/**
 * Parser for xz archives. Uncompresses and parses the content and adds it to
 * the created main parsed document.
 *
 * @see <a href="https://tukaani.org/xz/format.html">xz file format website</a>
 */
public class XZParser extends AbstractCompressorParser {

    public XZParser() {
        super("XZ Compressed Archive Parser");
        this.SUPPORTED_EXTENSIONS.add("xz");
        this.SUPPORTED_EXTENSIONS.add("txz");
        this.SUPPORTED_MIME_TYPES.add("application/x-xz");
    }

    @Override
    protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
        /*
         * Limit the memory dedicated to reading compressed blocks to at most 25% of
         * the available memory. Any stricter limit is expected to be handled by the
         * caller (see for example the crawler.[protocol].maxFileSize configuration
         * setting).
         */
        final long availableMemory = MemoryControl.available();
        final long maxKBytes = (long) (availableMemory * 0.25 / 1024.0);
        return new XZCompressorInputStream(source, false, (int) Math.min(Integer.MAX_VALUE, maxKBytes));
    }

    @Override
    protected String getUncompressedFilename(final String filename) {
        return XZUtils.getUncompressedFilename(filename);
    }

}
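To make the memory cap in createDecompressStream() above concrete: the third argument of the XZCompressorInputStream constructor is a limit in kibibytes, and commons-compress throws a MemoryLimitException when decoding the stream would need more. A self-contained sketch of the same arithmetic, with an assumed 1 GiB of available memory standing in for MemoryControl.available():

// Worked example of the 25% cap passed to XZCompressorInputStream (assumed figures).
public class XZMemoryCapExample {
    public static void main(final String[] args) {
        final long availableMemory = 1024L * 1024L * 1024L; // assume 1 GiB available
        // Same formula as XZParser: 25% of available memory, expressed in KiB
        final long maxKBytes = (long) (availableMemory * 0.25 / 1024.0);
        System.out.println(maxKBytes); // 262144 KiB, i.e. 256 MiB
        // Clamped to the int range before being handed to the constructor
        System.out.println((int) Math.min(Integer.MAX_VALUE, maxKBytes)); // 262144
    }
}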
@@ -0,0 +1,246 @@
// XZParserTest.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;

import org.junit.Test;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.VocabularyScraper;

/**
 * Unit tests for the {@link XZParser} class
 *
 * @author luccioman
 */
public class XZParserTest {

    /** Folder containing test files */
    private static final File TEST_FOLDER = new File("test", "parsertest");

    /**
     * Unit test for the XZParser.parse() function with some small xz test files.
     *
     * @throws Failure              when a file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParse() throws Failure, InterruptedException, IOException {
        final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
        final XZParser parser = new XZParser();

        for (final String fileName : fileNames) {
            final DigestURL location = new DigestURL("http://localhost/" + fileName);
            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
                final Document[] documents = parser.parse(location, "application/x-xz", StandardCharsets.UTF_8.name(),
                        new VocabularyScraper(), 0, inStream);
                assertNotNull("Parser result must not be null for file " + fileName, documents);
                assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                        documents[0].getTextString().contains("Maßkrügen"));
                final Collection<AnchorURL> anchors = documents[0].getAnchors();
                assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
                assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
                assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
            }
        }
    }

    /**
     * Tests parse() integration with the tar parser on a test txz archive.
     *
     * @throws Failure              when a file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParseTxz() throws Failure, InterruptedException, IOException {
        final String fileName = "umlaute_html_xml_txt_gnu.txz";
        final XZParser parser = new XZParser();

        final DigestURL location = new DigestURL("http://localhost/" + fileName);
        try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
            final Document[] documents = parser.parse(location, "application/x-xz", StandardCharsets.UTF_8.name(),
                    new VocabularyScraper(), 0, inStream);

            assertNotNull("Parser result must not be null for file " + fileName, documents);

            final String parsedText = documents[0].getTextString();
            assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
            assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                    parsedText.contains("Maßkrügen"));
            assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
            assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
            assertTrue(parsedText.contains("URL reference in raw text file"));
            assertTrue(parsedText.contains("UTF-8 encoded XML test file"));

            final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
            assertNotNull(detectedAnchors);
            assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
                    detectedAnchors.size());
            assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
            assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
        }
    }

    /**
     * Unit test for the XZParser.parseWithLimits() function with some small xz
     * test files whose content is within the limits.
     *
     * @throws Failure              when a file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParseWithLimits() throws Failure, InterruptedException, IOException {
        final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
        final XZParser parser = new XZParser();

        for (final String fileName : fileNames) {
            final DigestURL location = new DigestURL("http://localhost/" + fileName);
            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
                final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
                        StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000, 10000);
                assertNotNull("Parser result must not be null for file " + fileName, documents);
                assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                        documents[0].getTextString().contains("Maßkrügen"));
                final Collection<AnchorURL> anchors = documents[0].getAnchors();
                assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
                assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
                assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
                assertFalse("Parsed document must not be marked as partially parsed for file " + fileName,
                        documents[0].isPartiallyParsed());
            }
        }
    }

    /**
     * Unit test for XZParser.parseWithLimits() when the maxLinks limit is
     * exceeded.
     *
     * @throws Failure              when a file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
        final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
        final XZParser parser = new XZParser();

        /* maxLinks limit exceeded */
        for (final String fileName : fileNames) {
            final DigestURL location = new DigestURL("http://localhost/" + fileName);
            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
                final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
                        StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE);
                assertNotNull("Parser result must not be null for file " + fileName, documents);
                assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                        documents[0].getTextString().contains("Maßkrügen"));
                final Collection<AnchorURL> anchors = documents[0].getAnchors();
                assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
                assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
                        documents[0].isPartiallyParsed());
            }
        }
    }

    /**
     * Unit test for XZParser.parseWithLimits() when the maxBytes limit is
     * exceeded.
     *
     * @throws Failure              when a file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
        final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
        final XZParser parser = new XZParser();

        String fileName = fileNames[0];

        DigestURL location = new DigestURL("http://localhost/" + fileName);
        try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
            /*
             * The bytes limit is set to allow parsing the beginning text part, but to
             * stop before reaching the <a> tag
             */
            final long maxBytes = 258;
            final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
                    StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
            assertNotNull("Parser result must not be null for file " + fileName, documents);
            assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
            assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                    documents[0].getTextString().contains("Maßkrügen"));
            final Collection<AnchorURL> anchors = documents[0].getAnchors();
            assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
            assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
                    documents[0].isPartiallyParsed());
        }

        fileName = fileNames[1];
        location = new DigestURL("http://localhost/" + fileName);
        try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
            /*
             * The bytes limit is set to allow parsing the beginning of the text, but to
             * stop before reaching the URL
             */
            final long maxBytes = 65;
            final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
                    StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
            assertNotNull("Parser result must not be null for file " + fileName, documents);
            assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
            assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                    documents[0].getTextString().contains("Maßkrügen"));
            final Collection<AnchorURL> anchors = documents[0].getAnchors();
            assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
            assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
                    documents[0].isPartiallyParsed());
        }
    }

}
Binary file not shown.
Binary file not shown.
Binary file not shown.
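The binary files above are the committed .xz/.txz test fixtures used by XZParserTest. Should a fixture ever need to be regenerated, commons-compress can also write xz streams through XZCompressorOutputStream (backed by the bundled xz-java library); a minimal sketch, with the source path assumed for the example:

// Hypothetical fixture generator: compress a single test file to .xz
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream;

public class MakeXzFixture {
    public static void main(final String[] args) throws IOException {
        final String src = "test/parsertest/umlaute_linux.txt"; // assumed source path
        try (FileInputStream in = new FileInputStream(src);
                XZCompressorOutputStream out = new XZCompressorOutputStream(new FileOutputStream(src + ".xz"))) {
            final byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }
}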