Michael Peter Christen 8 years ago
commit f45378c11c

@@ -193,11 +193,17 @@ public class GenericXMLParser extends AbstractParser implements Parser {
} catch(StreamLimitException e) {
limitExceeded = true;
}
if (writer.isOverflow()) {
throw new Parser.Failure("Not enough Memory available for generic the XML parser : "
+ Formatter.bytesToString(availableMemory), location);
}
/* create the parsed document with empty text content */
/* Create the parsed document with possibly only partial text and links */
final byte[] contentBytes = UTF8.getBytes(writer.toString());
Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) };
null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
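/* Mark the document as partially parsed when the stream limit was reached, so callers know the text and links may be truncated */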
docs[0].setPartiallyParsed(limitExceeded);
return docs;
} catch (final Exception e) {

@@ -29,11 +29,14 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
@@ -43,9 +46,6 @@ import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
// this is a new implementation of this parser idiom using multiple documents as a result set
/**
* Parses the tar file and each contained file,
@@ -75,6 +75,8 @@ public class tarParser extends AbstractParser implements Parser {
final String filename = location.getFileName();
final String ext = MultiProtocolURL.getFileExtension(filename);
final DigestURL parentTarURL = createParentTarURL(location);
// TODO: is this hack really useful? These extensions are already handled by the gzipParser
if (ext.equals("gz") || ext.equals("tgz")) {
try {
source = new GZIPInputStream(source);
@@ -85,26 +87,8 @@ public class tarParser extends AbstractParser implements Parser {
TarArchiveEntry entry;
final TarArchiveInputStream tis = new TarArchiveInputStream(source);
// create maindoc for this bzip container
final Document maindoc = new Document(
location,
mimeType,
charset,
this,
null,
null,
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null,
null,
null,
null,
0.0d, 0.0d,
(Object) null,
null,
null,
null,
false,
new Date());
// create maindoc for this tar container
final Document maindoc = createMainDocument(location, mimeType, charset, this);
// loop through the elements in the tar file and parse every single file inside
while (true) {
try {
@@ -118,8 +102,18 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) continue;
/*
* Create an appropriate sub location to prevent an unwanted fallback to the tarParser on resources included in the archive.
* We use the tar file name as the parent sub path. Example: http://host/archive.tar/name.
* If we created a sub location with a '#' separator such as http://host/archive.tar#name, the
* extension of the URL would still be ".tar", incorrectly making the tar parser
* a candidate parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) {
continue;
}
maindoc.addSubDocuments(subDocs);
} catch (final Parser.Failure e) {
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
@@ -134,6 +128,147 @@ public class tarParser extends AbstractParser implements Parser {
return new Document[]{maindoc};
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks,
final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException {
final DigestURL parentTarURL = createParentTarURL(location);
final TarArchiveInputStream tis = new TarArchiveInputStream(source);
// create maindoc for this tar container
final Document maindoc = createMainDocument(location, mimeType, charset, this);
// loop through the elements in the tar file and parse every single file inside
TarArchiveEntry entry;
int totalProcessedLinks = 0;
while (true) {
try {
entry = tis.getNextTarEntry();
if (entry == null) {
break;
}
/*
* At this point at least one more entry remains to be processed: check the
* bytes limit now, because sub parsers applied to previous entries may not
* support partial parsing and would have thrown a Parser.Failure instead of
* marking the document as partially parsed.
*/
if (tis.getBytesRead() >= maxBytes) {
maindoc.setPartiallyParsed(true);
break;
}
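/* Skip directory entries and entries without content: only regular files are parsed */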
if (entry.isDirectory() || entry.getSize() <= 0) {
continue;
}
final String name = entry.getName();
final int idx = name.lastIndexOf('.');
final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : "");
try {
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits
* on the archived content
*/
/*
* Create an appropriate sub location to prevent an unwanted fallback to the
* tarParser on resources included in the archive. We use the tar file name as
* the parent sub path. Example: http://host/archive.tar/name. If we created a
* sub location with a '#' separator such as http://host/archive.tar#name, the
* extension of the URL would still be ".tar", incorrectly making the tar
* parser a candidate parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999,
entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead());
/*
* If the parser(s) did not consume all bytes in the entry, the remaining
* bytes will be skipped by the next call to getNextTarEntry()
*/
if (subDocs == null) {
continue;
}
maindoc.addSubDocuments(subDocs);
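/* Count the links processed so far, so the remaining links budget can be passed to the parser of the next entry */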
for (Document subDoc : subDocs) {
if (subDoc.getAnchors() != null) {
totalProcessedLinks += subDoc.getAnchors().size();
}
}
/*
* Check whether a limit has been exceeded (we are guaranteed to pass here
* when maxLinks has been exceeded, since detecting that limit requires
* parser support for partial parsing)
*/
if (subDocs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
break;
}
} catch (final Parser.Failure e) {
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
}
} catch (final IOException e) {
AbstractParser.log.warn("tar parser:" + e.getMessage());
break;
}
}
return new Document[] { maindoc };
}
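/*
* A minimal usage sketch (hypothetical stream and limit values, mirroring the
* calls exercised in tarParserTest): parse at most 100 links and 10000 bytes,
* then check whether the result was truncated.
*
* final Document[] docs = parser.parseWithLimits(location, "application/tar",
*         null, new VocabularyScraper(), 0, inStream, 100, 10000L);
* final boolean truncated = docs[0].isPartiallyParsed();
*/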
/**
* Generate a parent URL to use for generating sub URLs on tar archive entries.
*
* @param tarURL
* the URL of the tar archive
* @return a URL ending with a "/", suitable as a base URL for archive entries
*/
private DigestURL createParentTarURL(final DigestURL tarURL) {
String locationStr = tarURL.toNormalform(false);
if (!locationStr.endsWith("/")) {
locationStr += "/";
}
DigestURL parentTarURL;
try {
parentTarURL = new DigestURL(locationStr);
} catch (MalformedURLException e1) {
/* This should not happen */
parentTarURL = tarURL;
}
return parentTarURL;
}
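/*
* A minimal sketch of the resulting resolution (hypothetical names): for a tar
* archive located at http://host/path/archive.tar this method yields
* http://host/path/archive.tar/ as the parent. Resolving an entry name against it,
*
* final DigestURL subLocation = new DigestURL(parentTarURL, "doc/readme.txt");
*
* gives http://host/path/archive.tar/doc/readme.txt, whose extension is no
* longer ".tar", so the tar parser is not selected again for the sub resource.
*/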
/**
* Create the main resulting parsed document for a tar container
*
* @param location
* the parsed resource URL
* @param mimeType
* the media type of the resource
* @param charset
* the charset name if known
* @param parser
an instance of tarParser that is registered as the parser origin of
the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final tarParser parser) {
final String filename = location.getFileName();
final Document maindoc = new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
return maindoc;
}
public final static boolean isTar(File f) {
if (!f.exists() || f.length() < 0x105) return false;
RandomAccessFile raf = null;

@@ -390,6 +390,8 @@ public class GenericXMLParserTest {
assertEquals(1, documents.length);
assertFalse(documents[0].isPartiallyParsed());
assertTrue(documents[0].getTextString().contains("And this is a relative link"));
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(5, detectedAnchors.size());
@@ -410,6 +412,9 @@ public class GenericXMLParserTest {
assertEquals(1, documents.length);
assertTrue(documents[0].isPartiallyParsed());
assertTrue(documents[0].getTextString().contains("Home page"));
assertFalse(documents[0].getTextString().contains("And this is a relative link"));
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(2, detectedAnchors.size());
@@ -447,6 +452,9 @@ public class GenericXMLParserTest {
assertEquals(1, documents.length);
assertTrue(documents[0].isPartiallyParsed());
assertTrue(documents[0].getTextString().contains("and this is a mention to a relative URL"));
assertFalse(documents[0].getTextString().contains("And this is a relative link to another"));
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(3, detectedAnchors.size());

@@ -0,0 +1,253 @@
// tarParserTest.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.util.Collection;
import org.junit.Before;
import org.junit.Test;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
/**
* Unit tests for the {@link tarParser} class
*
* @author luccioman
*
*/
public class tarParserTest {
/** The test resources folder */
private final static File TEST_FOLDER = new File("test" + File.separator + "parsertest" + File.separator);
/**
* All these test archives include two html test files in a sub folder, plus an
* xml and a text test file at the root
*/
private static final String[] TAR_FILE_NAMES = { "umlaute_html_xml_txt_gnu.tar", // created with tar option
// --format=gnu
"umlaute_html_xml_txt_pax.tar", // created with tar option --format=pax
"umlaute_html_xml_txt_ustar.tar", // created with tar option --format=ustar
"umlaute_html_xml_txt_v7.tar", // created with tar option --format=v7
};
/** Tar parser test instance */
private tarParser parser;
@Before
public void setUp() {
this.parser = new tarParser();
}
/**
* Unit test for the tarParser.parse() implementation with some test archives in
* various common tar formats.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParse() throws Exception {
for (String fileName : TAR_FILE_NAMES) {
FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
DigestURL location = new DigestURL("http://localhost/" + fileName);
try {
Document[] documents = this.parser.parse(location, "application/tar", null, new VocabularyScraper(), 0,
inStream);
assertNotNull("Parser result must not be null for file " + fileName, documents);
final String parsedText = documents[0].getTextString();
assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
parsedText.contains("Maßkrügen"));
assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
assertTrue(parsedText.contains("URL reference in raw text file"));
assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals("Parsed URLs must contains all URLs from each test file included in the archive", 5,
detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
} finally {
inStream.close();
}
}
}
/**
* Test tarParser.parseWithLimits() with limits not reached.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsNotReached() throws Exception {
for (String fileName : TAR_FILE_NAMES) {
FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
DigestURL location = new DigestURL("http://localhost/" + fileName);
/* Content within limits */
try {
Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE);
assertNotNull("Parser result must not be null for file " + fileName, documents);
final String parsedText = documents[0].getTextString();
assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
parsedText.contains("Maßkrügen"));
assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
assertTrue(parsedText.contains("URL reference in raw text file"));
final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
} finally {
inStream.close();
}
}
}
/**
* Test tarParser.parseWithLimits() with links limit exceeded
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsLinksExceeded() throws Exception {
for (String fileName : TAR_FILE_NAMES) {
FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
DigestURL location = new DigestURL("http://localhost/" + fileName);
/* Links limit exceeded from the third included file */
try {
Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
new VocabularyScraper(), 0, inStream, 2, Long.MAX_VALUE);
assertNotNull("Parser result must not be null for file " + fileName, documents);
final String parsedText = documents[0].getTextString();
assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
parsedText.contains("Maßkrügen"));
assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
assertFalse(parsedText.contains("UTF-8 encoded XML test file"));
assertFalse(parsedText.contains("URL reference in raw text file"));
final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals("Parsed URLs must only contain URLs from test files withing links limit", 2,
detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
assertFalse(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
assertFalse(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
assertFalse(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
} finally {
inStream.close();
}
}
}
/**
* Test tarParser.parseWithLimits() with bytes limit exceeded
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsBytesExceeded() throws Exception {
for (String fileName : TAR_FILE_NAMES) {
FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
DigestURL location = new DigestURL("http://localhost/" + fileName);
/* Bytes limit exceeded from the third included file. */
final long maxBytes;
if ("umlaute_html_xml_txt_pax.tar".equals(fileName)) {
/* pax tar format uses more bytes for extended headers */
maxBytes = 7000;
} else {
/*
* Limit calculation: five 512-byte tar records = one 512-byte tar header for
* the html directory + (2 x (one 512-byte tar header + html file content
* below 512 bytes, thus rounded up to 512 bytes))
*/
maxBytes = 512 * 5;
}
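/* In the non-pax case this gives maxBytes = 2560 bytes: enough to read the directory header and the two html entries, so the bytes check stops before the xml and text entries */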
try {
Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
assertNotNull("Parser result must not be null for file " + fileName, documents);
final String parsedText = documents[0].getTextString();
assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
parsedText.contains("Maßkrügen"));
assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
assertFalse(parsedText.contains("URL reference in raw text file"));
assertFalse(parsedText.contains("UTF-8 encoded XML test file"));
final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals("Parsed URLs must only contain URLs from test files withing bytes limit", 2,
detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
assertFalse(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
assertFalse(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
assertFalse(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
} finally {
inStream.close();
}
}
}
}

@@ -3,6 +3,7 @@
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description>
<dc:title>ISO-8859-1 encoded XML test file</dc:title>
<dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>
</rdf:Description>
</rdf:RDF>

@@ -3,6 +3,7 @@
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description>
<dc:title>UTF-8 encoded XML test file</dc:title>
<dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>
</rdf:Description>
</rdf:RDF>

@@ -6,5 +6,6 @@
<body>
In München steht ein Hofbräuhaus.
Dort gibt es Bier aus Maßkrügen.<br>
<a href="http://localhost/umlaute_html_iso.html">Example link in ISO-8859-1 encoded HTML</a>
</body>
</html>

@@ -1,10 +1,11 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
</head>
<body>
In M&#252;nchen steht ein Hofbr&#228;uhaus.
Dort gibt es Bier aus Ma&#223;kr&#252;gen.<br>
<a href="http://localhost/umlaute_html_utf8.html">Example link in UTF-8 encoded HTML</a>
</body>
</html>

@@ -1 +1,2 @@
In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen
URL reference in raw text file : http://localhost/umlaute_linux.txt
