From acab6a6defb3307d27fb97004f99e0be4be8f55f Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Mon, 14 Aug 2017 14:47:01 +0200
Subject: [PATCH 1/2] Also handle text content when parsing XML within limits.

---
 source/net/yacy/document/parser/GenericXMLParser.java  | 10 ++++++++--
 .../net/yacy/document/parser/GenericXMLParserTest.java |  8 ++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/source/net/yacy/document/parser/GenericXMLParser.java b/source/net/yacy/document/parser/GenericXMLParser.java
index 0673260e6..25d429143 100644
--- a/source/net/yacy/document/parser/GenericXMLParser.java
+++ b/source/net/yacy/document/parser/GenericXMLParser.java
@@ -193,11 +193,17 @@ public class GenericXMLParser extends AbstractParser implements Parser {
 			} catch(StreamLimitException e) {
 				limitExceeded = true;
 			}
+			
+			if (writer.isOverflow()) {
+				throw new Parser.Failure("Not enough Memory available for generic the XML parser : "
+						+ Formatter.bytesToString(availableMemory), location);
+			}
 
 
-			/* create the parsed document with empty text content */
+			/* Create the parsed document, possibly holding only part of the text and links */
+			final byte[] contentBytes = UTF8.getBytes(writer.toString());
 			Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
-					null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) };
+					null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
 			docs[0].setPartiallyParsed(limitExceeded);
 			return docs;
 		} catch (final Exception e) {
diff --git a/test/java/net/yacy/document/parser/GenericXMLParserTest.java b/test/java/net/yacy/document/parser/GenericXMLParserTest.java
index d4d6affe4..18b6cb438 100644
--- a/test/java/net/yacy/document/parser/GenericXMLParserTest.java
+++ b/test/java/net/yacy/document/parser/GenericXMLParserTest.java
@@ -390,6 +390,8 @@ public class GenericXMLParserTest {
 			assertEquals(1, documents.length);
 			assertFalse(documents[0].isPartiallyParsed());
 			
+			assertTrue(documents[0].getTextString().contains("And this is a relative link"));
+			
 			Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
 			assertNotNull(detectedAnchors);
 			assertEquals(5, detectedAnchors.size());
@@ -410,6 +412,9 @@ public class GenericXMLParserTest {
 			assertEquals(1, documents.length);
 			assertTrue(documents[0].isPartiallyParsed());
 			
+			assertTrue(documents[0].getTextString().contains("Home page"));
+			assertFalse(documents[0].getTextString().contains("And this is a relative link"));
+			
 			Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
 			assertNotNull(detectedAnchors);
 			assertEquals(2, detectedAnchors.size());
@@ -447,6 +452,9 @@ public class GenericXMLParserTest {
 			assertEquals(1, documents.length);
 			assertTrue(documents[0].isPartiallyParsed());
 			
+			assertTrue(documents[0].getTextString().contains("and this is a mention to a relative URL"));
+			assertFalse(documents[0].getTextString().contains("And this is a relative link to another"));
+			
 			Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
 			assertNotNull(detectedAnchors);
 			assertEquals(3, detectedAnchors.size());

From 780173008e5757572b9675f397b26a8d597ec3f4 Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Mon, 14 Aug 2017 14:57:58 +0200
Subject: [PATCH 2/2] Implemented partial stream parsing of tar archives.

Also added JUnit tests for the tar parser and fixed unwanted use of the
tar parser as a fallback on files included in a tar archive.
---
 .../net/yacy/document/parser/tarParser.java   | 187 +++++++++++--
 .../yacy/document/parser/tarParserTest.java   | 253 ++++++++++++++++++
 test/parsertest/umlaute_dc_xml_iso.xml        |   1 +
 test/parsertest/umlaute_dc_xml_utf8.xml       |   1 +
 test/parsertest/umlaute_html_iso.html         |   1 +
 test/parsertest/umlaute_html_utf8.html        |   3 +-
 test/parsertest/umlaute_html_xml_txt_gnu.tar  | Bin 0 -> 10240 bytes
 test/parsertest/umlaute_html_xml_txt_pax.tar  | Bin 0 -> 20480 bytes
 .../parsertest/umlaute_html_xml_txt_ustar.tar | Bin 0 -> 10240 bytes
 test/parsertest/umlaute_html_xml_txt_v7.tar   | Bin 0 -> 10240 bytes
 test/parsertest/umlaute_linux.txt             |   1 +
 11 files changed, 420 insertions(+), 27 deletions(-)
 create mode 100644 test/java/net/yacy/document/parser/tarParserTest.java
 create mode 100644 test/parsertest/umlaute_html_xml_txt_gnu.tar
 create mode 100644 test/parsertest/umlaute_html_xml_txt_pax.tar
 create mode 100644 test/parsertest/umlaute_html_xml_txt_ustar.tar
 create mode 100644 test/parsertest/umlaute_html_xml_txt_v7.tar

diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java
index 815497beb..d658364a4 100644
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@@ -29,11 +29,14 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.RandomAccessFile;
+import java.net.MalformedURLException;
 import java.util.Date;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+
 import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.document.AbstractParser;
@@ -43,9 +46,6 @@ import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
 
-import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
-
 // this is a new implementation of this parser idiom using multiple documents as result set
 /**
  * Parses the tar file and each contained file,
@@ -75,6 +75,8 @@ public class tarParser extends AbstractParser implements Parser {
 
         final String filename = location.getFileName();
         final String ext = MultiProtocolURL.getFileExtension(filename);
+        final DigestURL parentTarURL = createParentTarURL(location);
+        // TODO is this hack really useful ? These extensions are already handled by the gzipParser
         if (ext.equals("gz") || ext.equals("tgz")) {
             try {
                 source = new GZIPInputStream(source);
@@ -85,26 +87,8 @@ public class tarParser extends AbstractParser implements Parser {
         TarArchiveEntry entry;
         final TarArchiveInputStream tis = new TarArchiveInputStream(source);
         
-        // create maindoc for this bzip container
-        final Document maindoc = new Document(
-                    location,
-                    mimeType,
-                    charset,
-                    this,
-                    null,
-                    null,
-                    AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
-                    null,
-                    null,
-                    null,
-                    null,
-                    0.0d, 0.0d,
-                    (Object) null,
-                    null,
-                    null,
-                    null,
-                    false,
-                    new Date());
+        // create maindoc for this tar container
+        final Document maindoc = createMainDocument(location, mimeType, charset, this);
         // loop through the elements in the tar file and parse every single file inside
         while (true) {
             try {
@@ -118,8 +102,18 @@ public class tarParser extends AbstractParser implements Parser {
                 try {
                     tmp = FileUtils.createTempFile(this.getClass(), name);
                     FileUtils.copy(tis, tmp, entry.getSize());
-                    final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
-                    if (subDocs == null) continue;
+					/*
+					 * Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive. 
+					 * We use the tar file name as the parent sub path. Example : http://host/archive.tar/name.
+					 * Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the
+					 * extension of the URL is still ".tar", thus incorrectly making the tar parser
+					 * a possible parser for the sub resource.
+					 */
+                    final DigestURL subLocation = new DigestURL(parentTarURL, name);
+                    final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, scraper, timezoneOffset,	999, tmp);
+                    if (subDocs == null) {
+                    	continue;
+                    }
                     maindoc.addSubDocuments(subDocs);
                 } catch (final Parser.Failure e) {
                     AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
@@ -134,6 +128,147 @@ public class tarParser extends AbstractParser implements Parser {
         return new Document[]{maindoc};
     }
 
+	@Override
+	public boolean isParseWithLimitsSupported() {
+		return true;
+	}
+
+	@Override
+	public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
+			final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks,
+			final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException {
+
+		final DigestURL parentTarURL = createParentTarURL(location);
+
+		final TarArchiveInputStream tis = new TarArchiveInputStream(source);
+
+		// create maindoc for this tar container
+		final Document maindoc = createMainDocument(location, mimeType, charset, this);
+
+		// loop through the elements in the tar file and parse every single file inside
+		TarArchiveEntry entry;
+		int totalProcessedLinks = 0;
+		while (true) {
+			try {
+				entry = tis.getNextTarEntry();
+				if (entry == null) {
+					break;
+				}
+
+				/*
+				 * At this point at least one entry remains to be processed : check the bytes
+				 * limit now, because sub parsers applied on any previous entries may not
+				 * support partial parsing and would have thrown a Parser.Failure instead of
+				 * marking the document as partially parsed.
+				 */
+				if (tis.getBytesRead() >= maxBytes) {
+					maindoc.setPartiallyParsed(true);
+					break;
+				}
+
+				if (entry.isDirectory() || entry.getSize() <= 0) {
+					continue;
+				}
+				final String name = entry.getName();
+				final int idx = name.lastIndexOf('.');
+				final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : "");
+				try {
+					/*
+					 * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
+					 * compressed content
+					 */
+
+					/*
+					 * Create an appropriate sub location to prevent unwanted fallback to the
+					 * tarparser on resources included in the archive. We use the tar file name as
+					 * the parent sub path. Example : http://host/archive.tar/name. Indeed if we
+					 * create a sub location with a '#' separator such as
+					 * http://host/archive.tar#name, the extension of the URL is still ".tar", thus
+					 * incorrectly making the tar parser a possible parser for the sub resource.
+					 */
+					final DigestURL subLocation = new DigestURL(parentTarURL, name);
+					final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999,
+							entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead());
+
+					/*
+					 * If the parser(s) did not consume all bytes in the entry, these ones will be
+					 * skipped by the next call to getNextTarEntry()
+					 */
+					if (subDocs == null) {
+						continue;
+					}
+					maindoc.addSubDocuments(subDocs);
+					for (Document subDoc : subDocs) {
+						if (subDoc.getAnchors() != null) {
+							totalProcessedLinks += subDoc.getAnchors().size();
+						}
+					}
+					/*
+					 * Check if a limit has been exceeded (we are sure to pass here when maxLinks
+					 * has been exceeded, as this limit requires parser support for partial parsing
+					 * to be detected)
+					 */
+					if (subDocs[0].isPartiallyParsed()) {
+						maindoc.setPartiallyParsed(true);
+						break;
+					}
+				} catch (final Parser.Failure e) {
+					AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
+				}
+			} catch (final IOException e) {
+				AbstractParser.log.warn("tar parser:" + e.getMessage());
+				break;
+			}
+		}
+		return new Document[] { maindoc };
+	}
+
+	/**
+	 * Generate a parent URL to use for generating sub URLs on tar archive entries.
+	 * 
+	 * @param tarURL
+	 *            the URL of the tar archive
+	 * @return an URL ending with a "/" suitable as a base URL for archive entries
+	 */
+	private DigestURL createParentTarURL(final DigestURL tarURL) {
+		String locationStr = tarURL.toNormalform(false);
+		if (!locationStr.endsWith("/")) {
+			locationStr += "/";
+		}
+		DigestURL parentTarURL;
+		try {
+			parentTarURL = new DigestURL(locationStr);
+		} catch (MalformedURLException e1) {
+			/* This should not happen */
+			parentTarURL = tarURL;
+		}
+		return parentTarURL;
+	}
+
+	/**
+	 * Create the main resulting parsed document for a tar container
+	 * 
+	 * @param location
+	 *            the parsed resource URL
+	 * @param mimeType
+	 *            the media type of the resource
+	 * @param charset
+	 *            the charset name if known
+	 * @param parser
+	 *            the tarParser instance that is registered as the parser origin of
+	 *            the document
+	 * @return a Document instance
+	 */
+	public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
+			final tarParser parser) {
+		final String filename = location.getFileName();
+		final Document maindoc = new Document(location, mimeType, charset, parser, null, null,
+				AbstractParser
+						.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
+				null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
+		return maindoc;
+	}
+
     public final static boolean isTar(File f) {
         if (!f.exists() || f.length() < 0x105) return false;
         RandomAccessFile raf = null;
diff --git a/test/java/net/yacy/document/parser/tarParserTest.java b/test/java/net/yacy/document/parser/tarParserTest.java
new file mode 100644
index 000000000..124ac5ffd
--- /dev/null
+++ b/test/java/net/yacy/document/parser/tarParserTest.java
@@ -0,0 +1,253 @@
+// tarParserTest.java
+// ---------------------------
+// Copyright 2017 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.document.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Collection;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.Document;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Unit tests for the {@link tarParser} class
+ * 
+ * @author luccioman
+ *
+ */
+public class tarParserTest {
+
+	/** The test resources folder */
+	private final static File TEST_FOLDER = new File("test" + File.separator + "parsertest" + File.separator);
+
+	/**
+	 * All these test archives include two HTML test files in a sub folder, then an
+	 * XML test file and a text test file at the root
+	 */
+	private static final String[] TAR_FILE_NAMES = { "umlaute_html_xml_txt_gnu.tar", // created with tar option
+																						// --format=gnu
+			"umlaute_html_xml_txt_pax.tar", // created with tar option --format=pax
+			"umlaute_html_xml_txt_ustar.tar", // created with tar option --format=ustar
+			"umlaute_html_xml_txt_v7.tar", // created with tar option --format=v7
+	};
+
+	/** Tar parser test instance */
+	private tarParser parser;
+
+	@Before
+	public void setUp() {
+		this.parser = new tarParser();
+	}
+
+	/**
+	 * Unit test for the tarParser.parse() implementation with some test archives in
+	 * various common tar formats.
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParse() throws Exception {
+
+		for (String fileName : TAR_FILE_NAMES) {
+			FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try {
+				Document[] documents = this.parser.parse(location, "application/tar", null, new VocabularyScraper(), 0,
+						inStream);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+				final String parsedText = documents[0].getTextString();
+				assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+						parsedText.contains("Maßkrügen"));
+				assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+				assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+				assertTrue(parsedText.contains("URL reference in raw text file"));
+				assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+
+				final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+				assertNotNull(detectedAnchors);
+				assertEquals("Parsed URLs must contains all URLs from each test file included in the archive", 5,
+						detectedAnchors.size());
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+
+	/**
+	 * Test tarParser.parseWithLimits() with limits not reached.
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseWithLimitsNotReached() throws Exception {
+		for (String fileName : TAR_FILE_NAMES) {
+
+			FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+			/* Content within limits */
+			try {
+				Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
+						new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+				final String parsedText = documents[0].getTextString();
+				assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+						parsedText.contains("Maßkrügen"));
+				assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+				assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+				assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+				assertTrue(parsedText.contains("URL reference in raw text file"));
+
+				final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+				assertNotNull(detectedAnchors);
+				assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
+						detectedAnchors.size());
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+
+	/**
+	 * Test tarParser.parseWithLimits() with links limit exceeded
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseWithLimitsLinksExceeded() throws Exception {
+		for (String fileName : TAR_FILE_NAMES) {
+
+			FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+
+			/* Links limit exceeded from the third included file */
+			try {
+				Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
+						new VocabularyScraper(), 0, inStream, 2, Long.MAX_VALUE);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+				final String parsedText = documents[0].getTextString();
+				assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+						parsedText.contains("Maßkrügen"));
+				assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+				assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+				assertFalse(parsedText.contains("UTF-8 encoded XML test file"));
+				assertFalse(parsedText.contains("URL reference in raw text file"));
+
+				final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+				assertNotNull(detectedAnchors);
+				assertEquals("Parsed URLs must only contain URLs from test files withing links limit", 2,
+						detectedAnchors.size());
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+				assertFalse(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+				assertFalse(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+				assertFalse(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+
+	/**
+	 * Test tarParser.parseWithLimits() with bytes limit exceeded
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseWithLimitsBytesExceeded() throws Exception {
+		for (String fileName : TAR_FILE_NAMES) {
+
+			FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+
+			/* Bytes limit exceeded from the third included file. */
+			final long maxBytes;
+			if ("umlaute_html_xml_txt_pax.tar".equals(fileName)) {
+				/* pax tar format uses more bytes for extended headers */
+				maxBytes = 7000;
+			} else {
+				/*
+				 * Limit calculation : five 512 bytes tar records = 512 bytes tar header for the
+				 * html directory + (2 x (512 bytes tar header + html file content below 512
+				 * bytes, thus rounded to 512))
+				 */
+				maxBytes = 512 * 5;
+			}
+			try {
+				Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
+						new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+				final String parsedText = documents[0].getTextString();
+				assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+						parsedText.contains("Maßkrügen"));
+				assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+				assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+				assertFalse(parsedText.contains("URL reference in raw text file"));
+				assertFalse(parsedText.contains("UTF-8 encoded XML test file"));
+
+				final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+				assertNotNull(detectedAnchors);
+				assertEquals("Parsed URLs must only contain URLs from test files withing bytes limit", 2,
+						detectedAnchors.size());
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+				assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+				assertFalse(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+				assertFalse(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+				assertFalse(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+}
diff --git a/test/parsertest/umlaute_dc_xml_iso.xml b/test/parsertest/umlaute_dc_xml_iso.xml
index 3524be737..897862eec 100644
--- a/test/parsertest/umlaute_dc_xml_iso.xml
+++ b/test/parsertest/umlaute_dc_xml_iso.xml
@@ -3,6 +3,7 @@
 <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
          xmlns:dc="http://purl.org/dc/elements/1.1/">
   <rdf:Description>
+    <dc:title>ISO-8859-1 encoded XML test file</dc:title>
     <dc:description>In M�nchen steht ein Hofbr�uhaus, dort gibt es Bier in Ma�kr�gen</dc:description>
   </rdf:Description>
 </rdf:RDF>
\ No newline at end of file
diff --git a/test/parsertest/umlaute_dc_xml_utf8.xml b/test/parsertest/umlaute_dc_xml_utf8.xml
index 71744f3f1..785d80005 100644
--- a/test/parsertest/umlaute_dc_xml_utf8.xml
+++ b/test/parsertest/umlaute_dc_xml_utf8.xml
@@ -3,6 +3,7 @@
 <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
          xmlns:dc="http://purl.org/dc/elements/1.1/">
   <rdf:Description>
+    <dc:title>UTF-8 encoded XML test file</dc:title>
     <dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>
   </rdf:Description>
 </rdf:RDF>
\ No newline at end of file
diff --git a/test/parsertest/umlaute_html_iso.html b/test/parsertest/umlaute_html_iso.html
index de56c7116..1e18fde75 100644
--- a/test/parsertest/umlaute_html_iso.html
+++ b/test/parsertest/umlaute_html_iso.html
@@ -6,5 +6,6 @@
 <body>
 In M�nchen steht ein Hofbr�uhaus.
 Dort gibt es Bier aus Ma�kr�gen.<br>
+<a href="http://localhost/umlaute_html_iso.html">Example link in ISO-8859-1 encoded HTML</a>
 </body>
 </html>
diff --git a/test/parsertest/umlaute_html_utf8.html b/test/parsertest/umlaute_html_utf8.html
index 8954c5c6a..9fcbf9175 100644
--- a/test/parsertest/umlaute_html_utf8.html
+++ b/test/parsertest/umlaute_html_utf8.html
@@ -1,10 +1,11 @@
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
 <html>
 <head>
-<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
 </head>
 <body>
 In M&#252;nchen steht ein Hofbr&#228;uhaus.
 Dort gibt es Bier aus Ma&#223;kr&#252;gen.<br>
+<a href="http://localhost/umlaute_html_utf8.html">Example link in UTF-8 encoded HTML</a>
 </body>
 </html>
diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.tar b/test/parsertest/umlaute_html_xml_txt_gnu.tar
new file mode 100644
index 0000000000000000000000000000000000000000..e454d3ea4b03789d6351d91cfccece71f4589d3e
GIT binary patch
literal 10240
zcmeHK%}(1u5Y9PIu{Bapwb$#|asH7?Kp>HVAgJYp9D9>kaqM)xCgEv%>01=3hdf9x
zz4rw;bk@OX5NRSHv{c=dEZaM?v*T~(+Zp%yFu=Lz9Xbr#vT|@EwxuuLg-DDXDG<{%
zOq1A!oKAEC>d1Otw`SIpm^%!iT#&dcw_7J)x%lM@`y&3yFmMu1FT~45!n-9+JZ;Qr
z5#XX}F2+5}10m8a)4Gkn1(6r=H!a=FA$?;WtT6PS$G`Tfd9>3$Ki)<A?babWK6`t3
zu!9s8<M)LfjGOJ|Okip{L2c%QvCsV|bOMZb-zklJO=OC=QOASZka7q0IUlL?W8z=d
z6*mew4Y|s%MpQw!Z*_&!3CCh-6}f$f#gy02+IwnASj4xsI+1tP$RC8L^?J*&jB4oi
zX^3J@`y5d}MEg;<!$4$|s!88TVlCf{7)L$71M(QX^(jLzjarUSDpUtT8SM0EsMR_Q
zb^=!!?ba3Ha0TNaa-E<b#r!vfP#U`vXE+Kd3jA<@z=hOjL_;_7sFwm&!;bibneXr^
zDLO0C?{ABQgLgmIkE6}fU+{i4{|iQT{uc{CeBi$c$AIMjA4rau!~cWRBehhr$|_lY
z2;3~?|MxZ2|2AMZ4<&rYfct#^<>Nn1`h~p8O26k`OtRt+4+|Pke_Y4^qHSmOUpL__
zfF8gmy5#@Ukza42whhrnmz2d&)$0n;bY+&>e%PxkvzojO6>Z)oFt1xVY3`xZ^V9Zj
z3uU#_OFB>#-H1^wVm)j?%rP;rPE^kw(pZIERF8WaT!P-26_^g=3cSsW@OV7d#)Z@f
zDVNJwH?U!-prOWBA$KNf7;h<gl#Q7I&z-LyB`io+doHE{9YXUFW1<lZ+ai>1+@!I~
z{E_Hm#2n0eZiV|ippCl){R4DFoW>k={eaf+Tx-6_Td*V=mFe};Lu&dtJ*D0t?_Prf
zO+?F?ej2dpH5^mv9*ahW%UFPQ$LHA;M4_=hz-u%z{zLGRiN+^<js9cAg8!@j{|u7#
z|E8_Ozkp7RqG?P1KL+cEuKDrh>n0Lc%b%SbB50;5gKn6L-k3RKAkSn*9#l#H3YIq6
z^VG{f)Oo2d5s(N-1SA3y0f~S_Kq4R!kO)WwBmxoviGV~vA|Mfv2uK7Z0ulj<z)}Q$
E0&~tC5&!@I

literal 0
HcmV?d00001

diff --git a/test/parsertest/umlaute_html_xml_txt_pax.tar b/test/parsertest/umlaute_html_xml_txt_pax.tar
new file mode 100644
index 0000000000000000000000000000000000000000..3293be4ceef40fd2c367440b6106a8c5c84dc8b5
GIT binary patch
literal 20480
zcmeHM&2Hm15cb(m!D@k?DpAz0mDpKe<17*&=>{7I?VbWfVqzh(luFdvd757O7G0o+
zJV-CS_XYOQp&U7m<ovCjG)+f<L5m}j!%zJ9aYh>5_b1!j4|$erj^i>q6r;Gc(o2WS
zv8<LX;i{K9GmREAnPnN4#ay$cGiEz>i%eGP#Vwr`x$rY^=@#>AuQqiu648kFn5`R*
zXIh@8!8@{T+x0qau=-7_b!`or)~fAmNA7jZPqLjp@l+(~UE7wtXRclHzbd6kUUmM%
z5^E9riTx_OulOGq!JWh1XP?gFdULz8vzKEm-+x^Kic#zrf*(oaQ6Ww`+ESt*p?TQ;
zZ!-`B*#8~f)N%iRj#X0I|8385HP@*2@Y?=AwVKZ2{lC`sIlRX)mPMfC|BCru_i2v*
z)xKYq<*sR4_SOD>&41l$k!LxR{m=LR?sxqUn}bjLTV#8%w@da9-|g;f5=Eu-Cv%h1
z{-8fKSenkrAoG(v5>cA?F{N8SDjV&tTtKpb5CV!37d{yZajx=Tis*As3DQLHM5yB8
zoGaw&uBQk-5oM&YPJ*GI<y`a*2k+I6?2%seb)1G58||Hh?0x^nu#NR381jVVf)54Z
zQ9`!Ulj96bMrXYk`bDm_`)MY~X><(soV<&8MxdJP`SKxi{Y*X#{habd>mFw?61bZ2
zlb#|SUZXTl13w<7xuA;)p=@kT{Ly*LNgO3-1Y9V6COiq!kcTBuUFypzOnrx`l<3q6
z@q+G4{@<zpQJyX*|7E=QF#We26Ns-f+qF#8|L0gHHT}12&(R!XF2b9S|G?^LmT4^3
z|61GU@E*rl7J;VzFVFY7PxtEob^d3@e+UCm|LZ_L!Trn8|DA&mYNum+DqDOA__|R4
z|GG!||2)gSK34E61~m2l+Q<L2=+CVqt&#t&&OTvqG^xUW_%qk$v47|IFaI|3zq)Hx
z=YJj4Me4ulV*LMDn7`@ydW7$qOw)mev;6OThSx5wZl(X<G$1y5Q3$M_?ak@U@xO}u
zt5TYD$93!cHyH3A<Nqd>tFGz3g}t48hOd7R)%O%2zcQ7!C^_vZ)0q4%M6|OsK^C5@
zeeAy{2cHfGTYIF6ox|c7g5n^}xRz$8)PUDYnL%}?hQWyEDvY9rBGlj~<-O_!s*`*T
z?q)&wcs$m|X4whzJdf%IH4GJ=Q1gpK_!BkB-zaTTUDFQ2VAlS;$l|hf7*HPb5hNdS
z$}~n{SVGEy`#cY_=v?M8WF6Fo!J3Ff%r|ZZ^glyJMDSdYlPKn0I(ur?D4gp_CMqt!
zEIg!K{#Kq+ejs5ZK><mGJ^%98v+VK<98=0M<~x-xQwi1$pQlyu7B-e%4!7$+yv|~x
ziHTTxjFRII`~K_dkPFnAWttfOFOB;{I;hA0p5<xIe1`u<{9js~TTTbo&9)C29iREP
z5vcTkb!>0WNb>)#@4p7C^8Z%Fe`dL;|4k%UU-S3xf2=~dwD#f0T>{B;o<TO8%iMV8
zk3l>qQ}HBg<yV1)N%q;&Yp;HK*dro<2p|H803v`0AOeU0B7g`W0*C-2fCwN0hyWsh
x2p|H803v`0AOeU0B7g`W0*C-2fCwN0hyWsh2p|H803v`0AOeU0BJipZ_y<p14S)au

literal 0
HcmV?d00001

diff --git a/test/parsertest/umlaute_html_xml_txt_ustar.tar b/test/parsertest/umlaute_html_xml_txt_ustar.tar
new file mode 100644
index 0000000000000000000000000000000000000000..fb7c004e7481b15e1936a95a6d11b457985b2cd2
GIT binary patch
literal 10240
zcmeHK%}(1u5Y9PIu{Bapwb$#|`A?8aKp>HVAgJYp9D9>kaqM)xCgEv%>01=3hdf9x
zz4rw;bk@dc5NRSHv{c=dthIM%XJ@~eZ)ZH<BOm9Ux9QMImX(8!lq`L*7gM6;NP(E9
zVVa~=$mxWbAdal(H7nH~N8Dy$k{>%Ow_7)#o+CM2a-fyUUe4q|_&*-`cFgI8__&C9
zudIo;jTtHMD4OP?-?KPKP_s<y*8djKiSU0>*UcQ#H)g>KMgO_~>#tfzJDv06U9{h6
zAEM*4w}%HiNKr9<U)aI8)oG;@rlu3rVRjIC+zSKS$9VUh(#+RIq==5%E_5TxZ8YF~
ztkRFMciB*!FyJ)cD!&?21>Jr%6i%lci={Q>3~Uxr-Z<;*sb!%N-|Fgy?o~5?5TN$!
zEyFTufis{1iZ~r`M7;p*hrKR?DWhDA2X-83`Bum{>U&+7kI-9>G6ch@Z4047Z7766
zPoD-_y~|)HFqP3>LlFj7G4?~p_6K3ae?thRxjVH-W1k}53x)_xNNh$la6*^52~c%x
zizB4A!%-4+R)pW*o)QM$J+2ppo29?t{c8RfjO_j|7J&FVu}W|aNdEtU;CMOwKR7*7
z%Vn#glI54c%~Jk<Uqk(GLw55}!e<P)&-Y(8{?nvih^s91yUxWlOaAb(Ao29qb^I@u
zN?HEbO}Gny{|3<||DTThdJXk$@HV=nEP|}wP>7~0X=r;vzoDc#c^fj?yi8zjuX@tj
zL#OAbo!vIda;F=2At^c`qgu%N*Z`knVql%9t}~*M3cIK-cQtqfy)!K^97I+4nkV7O
zWTH(9i4sz&RIqMf!%#s&jjjT2Pt_pWQt~KksRGxTuOG+EPgc7Qram1(@eyI75e(ZR
zlx*Chk;A;PsAI$!jJi&hd)%kZy9xaRR79Lc9Q8b(*700wzQ|qBBnp+;_0mgf_Bpww
z-XQm0fdWND+n#+Ive`9UQ^_8SN`=W-fOW^_bO^lAT<_pD5*h!&dGS=^Q@%$2G1G$o
ztN#BClGXoaNr!&{otVTdN&Y_u>xZuS@#X6#0$0nQog5-4rYVDJn2Oq%*%KhoG$jwJ
zCVvG>i|l#m<sWLil$Qud1SA3y0f~S_Kq4R!kO)WwBmxoviGV~vA|Mfv2uK7Z0ulj<
KfJ9&^0zUz~fhy4e

literal 0
HcmV?d00001

diff --git a/test/parsertest/umlaute_html_xml_txt_v7.tar b/test/parsertest/umlaute_html_xml_txt_v7.tar
new file mode 100644
index 0000000000000000000000000000000000000000..6009dc8b59b8290ef0739ae05dfe1b332e5c8db4
GIT binary patch
literal 10240
zcmeHK%}(1u5Y9PIu{Bapwb$#|`A?8af<Ph#K~T#HIrb*8;@IhWO~TXk(zhs54|$MY
zdhZKx=&Xa&AW|a0L9MiwthIM%XJ)_IZ)bMM$3D(&_tD`gSym1jQnK{bRt$+JM+(F=
z4bvp00*se*qnJb1cHPQW`(ho*;*tert9@(Qh9ds)*tcU&FU8Mg%==|cd}U@+;88Tq
z)wrkoAVJMCt-JVJL?;k`QZPzZ4rS)RYbZ)q0vkDPU&CewHtKI$$9vt2lQuf&c8<`=
z`TL{8J*21@e=O`_-0HR#1E!`E)Ma)MdE5&F+sC;5L22geVyB3P+AcI>%55~{e4^6N
zv3J!_oG{=t;3~hKPzBxnH55)~9E+(n<P2>VQQkQ3?yF^?5#Q<Rh3<7Te;A<7+a1F)
zYJoGP0g5;sazwoV9fbWJgCV0_i-&d`Y57*jI2w387?03<k1_<^sACJELTx03LC=5&
zTD`}>61d7}zo7_+s~G#CWBbD};(sB8(rnM{@x-Ud_ks}u7ZRTl4V=)WZUR&t+hPd|
z-(e{UIvK=iTPlQu_m}HMVYc*7DDf!&3r2eW7YjO^6Tp9H(6-jR6}>6%({?LHvRC@=
z@c;1aSS^>Wib~cW0=H}V|8oQNe~j4eQwg6k;341t`1oIweq|e_so!-jXKC_>j}?iR
ze+o)G#{Xiel;(fkgtI^=hNTn9|Cb}bSwnpnqK&R7iy*5v6r$<MVrzTBprI^s@-Aew
zWtqU-e)Y7qkIpX6y6p~1bEg~kASpT_qgu!Y*np5@Vql%9t}~{Q3RYB?yBfTL-l-Po
z4x%djEtBwcI@P9yL<y->Dp)tLVW=RXM%MwiXKE1bD0!4V3k9yToIi<~pUie0Ono|r
z;v>RDBN)sglo)Q&$YI_@)G?wDdR?c=J?_)y{e=DrDk4rJj{2TY>v$=(oaC-(5{1hA
zX6+$0|C*dq?~wbTK!GBnW6!^g*!%{Lsl>*rQsFWdVBPb1(FIXxZVvDUiIo2kym+SZ
z8Q&oPoMFNL$L@ayN$Y>Jq{Cf6CvX?kCI6p;^;75k`u07GjjQF)Pmd53)09CqOhs+X
z>?x3Ewjd9xCbxpMMfP&*<)7-jl$Qud1SA3y0f~S_Kq4R!kO)WwBmxoviGV~vA|Mfv
S2uK7Z0ulj<fJ9&|0>1$iSrC>0

literal 0
HcmV?d00001

diff --git a/test/parsertest/umlaute_linux.txt b/test/parsertest/umlaute_linux.txt
index 62bb22c22..77033cbc9 100755
--- a/test/parsertest/umlaute_linux.txt
+++ b/test/parsertest/umlaute_linux.txt
@@ -1 +1,2 @@
 ﻿In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen
+URL reference in raw text file : http://localhost/umlaute_linux.txt