From acab6a6defb3307d27fb97004f99e0be4be8f55f Mon Sep 17 00:00:00 2001 From: luccioman Date: Mon, 14 Aug 2017 14:47:01 +0200 Subject: [PATCH 1/2] Also handle text content when parsing XML within limits. --- source/net/yacy/document/parser/GenericXMLParser.java | 10 ++++++++-- .../net/yacy/document/parser/GenericXMLParserTest.java | 8 ++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/document/parser/GenericXMLParser.java b/source/net/yacy/document/parser/GenericXMLParser.java index 0673260e6..25d429143 100644 --- a/source/net/yacy/document/parser/GenericXMLParser.java +++ b/source/net/yacy/document/parser/GenericXMLParser.java @@ -193,11 +193,17 @@ public class GenericXMLParser extends AbstractParser implements Parser { } catch(StreamLimitException e) { limitExceeded = true; } + + if (writer.isOverflow()) { + throw new Parser.Failure("Not enough Memory available for generic the XML parser : " + + Formatter.bytesToString(availableMemory), location); + } - /* create the parsed document with empty text content */ + /* Create the parsed document with eventually only partial part of the text and links */ + final byte[] contentBytes = UTF8.getBytes(writer.toString()); Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "", - null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) }; + null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) }; docs[0].setPartiallyParsed(limitExceeded); return docs; } catch (final Exception e) { diff --git a/test/java/net/yacy/document/parser/GenericXMLParserTest.java b/test/java/net/yacy/document/parser/GenericXMLParserTest.java index d4d6affe4..18b6cb438 100644 --- a/test/java/net/yacy/document/parser/GenericXMLParserTest.java +++ b/test/java/net/yacy/document/parser/GenericXMLParserTest.java @@ -390,6 +390,8 @@ public class GenericXMLParserTest { assertEquals(1, documents.length); assertFalse(documents[0].isPartiallyParsed()); + assertTrue(documents[0].getTextString().contains("And this is a relative link")); + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(5, detectedAnchors.size()); @@ -410,6 +412,9 @@ public class GenericXMLParserTest { assertEquals(1, documents.length); assertTrue(documents[0].isPartiallyParsed()); + assertTrue(documents[0].getTextString().contains("Home page")); + assertFalse(documents[0].getTextString().contains("And this is a relative link")); + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(2, detectedAnchors.size()); @@ -447,6 +452,9 @@ public class GenericXMLParserTest { assertEquals(1, documents.length); assertTrue(documents[0].isPartiallyParsed()); + assertTrue(documents[0].getTextString().contains("and this is a mention to a relative URL")); + assertFalse(documents[0].getTextString().contains("And this is a relative link to another")); + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(3, detectedAnchors.size()); From 780173008e5757572b9675f397b26a8d597ec3f4 Mon Sep 17 00:00:00 2001 From: luccioman Date: Mon, 14 Aug 2017 14:57:58 +0200 Subject: [PATCH 2/2] Implemented partial stream parsing of tar archives. Also added JUnit tests for the tar parser and fixed unwanted use of the tar parser as a fallback on files included in a tar archive. --- .../net/yacy/document/parser/tarParser.java | 187 +++++++++++-- .../yacy/document/parser/tarParserTest.java | 253 ++++++++++++++++++ test/parsertest/umlaute_dc_xml_iso.xml | 1 + test/parsertest/umlaute_dc_xml_utf8.xml | 1 + test/parsertest/umlaute_html_iso.html | 1 + test/parsertest/umlaute_html_utf8.html | 3 +- test/parsertest/umlaute_html_xml_txt_gnu.tar | Bin 0 -> 10240 bytes test/parsertest/umlaute_html_xml_txt_pax.tar | Bin 0 -> 20480 bytes .../parsertest/umlaute_html_xml_txt_ustar.tar | Bin 0 -> 10240 bytes test/parsertest/umlaute_html_xml_txt_v7.tar | Bin 0 -> 10240 bytes test/parsertest/umlaute_linux.txt | 1 + 11 files changed, 420 insertions(+), 27 deletions(-) create mode 100644 test/java/net/yacy/document/parser/tarParserTest.java create mode 100644 test/parsertest/umlaute_html_xml_txt_gnu.tar create mode 100644 test/parsertest/umlaute_html_xml_txt_pax.tar create mode 100644 test/parsertest/umlaute_html_xml_txt_ustar.tar create mode 100644 test/parsertest/umlaute_html_xml_txt_v7.tar diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index 815497beb..d658364a4 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -29,11 +29,14 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; +import java.net.MalformedURLException; import java.util.Date; import java.util.zip.GZIPInputStream; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; + import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; @@ -43,9 +46,6 @@ import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; -import org.apache.commons.compress.archivers.tar.TarArchiveEntry; -import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; - // this is a new implementation of this parser idiom using multiple documents as result set /** * Parses the tar file and each contained file, @@ -75,6 +75,8 @@ public class tarParser extends AbstractParser implements Parser { final String filename = location.getFileName(); final String ext = MultiProtocolURL.getFileExtension(filename); + final DigestURL parentTarURL = createParentTarURL(location); + // TODO is this hack really useful ? These extensions are already handled by the gzipParser if (ext.equals("gz") || ext.equals("tgz")) { try { source = new GZIPInputStream(source); @@ -85,26 +87,8 @@ public class tarParser extends AbstractParser implements Parser { TarArchiveEntry entry; final TarArchiveInputStream tis = new TarArchiveInputStream(source); - // create maindoc for this bzip container - final Document maindoc = new Document( - location, - mimeType, - charset, - this, - null, - null, - AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title - null, - null, - null, - null, - 0.0d, 0.0d, - (Object) null, - null, - null, - null, - false, - new Date()); + // create maindoc for this tar container + final Document maindoc = createMainDocument(location, mimeType, charset, this); // loop through the elements in the tar file and parse every single file inside while (true) { try { @@ -118,8 +102,18 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp); - if (subDocs == null) continue; + /* + * Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive. + * We use the tar file name as the parent sub path. Example : http://host/archive.tar/name. + * Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the + * extension of the URL is still ".tar", thus incorrectly making the tar parser + * as a possible parser for the sub resource. + */ + final DigestURL subLocation = new DigestURL(parentTarURL, name); + final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, scraper, timezoneOffset, 999, tmp); + if (subDocs == null) { + continue; + } maindoc.addSubDocuments(subDocs); } catch (final Parser.Failure e) { AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage()); @@ -134,6 +128,147 @@ public class tarParser extends AbstractParser implements Parser { return new Document[]{maindoc}; } + @Override + public boolean isParseWithLimitsSupported() { + return true; + } + + @Override + public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, + final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks, + final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException { + + final DigestURL parentTarURL = createParentTarURL(location); + + final TarArchiveInputStream tis = new TarArchiveInputStream(source); + + // create maindoc for this tar container + final Document maindoc = createMainDocument(location, mimeType, charset, this); + + // loop through the elements in the tar file and parse every single file inside + TarArchiveEntry entry; + int totalProcessedLinks = 0; + while (true) { + try { + entry = tis.getNextTarEntry(); + if (entry == null) { + break; + } + + /* + * We are here sure at least one entry has still to be processed : let's check + * now the bytes limit as sub parsers applied on eventual previous entries may + * not support partial parsing and would have thrown a Parser.Failure instead of + * marking the document as partially parsed. + */ + if (tis.getBytesRead() >= maxBytes) { + maindoc.setPartiallyParsed(true); + break; + } + + if (entry.isDirectory() || entry.getSize() <= 0) { + continue; + } + final String name = entry.getName(); + final int idx = name.lastIndexOf('.'); + final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : ""); + try { + /* + * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on + * compressed content + */ + + /* + * Create an appropriate sub location to prevent unwanted fallback to the + * tarparser on resources included in the archive. We use the tar file name as + * the parent sub path. Example : http://host/archive.tar/name. Indeed if we + * create a sub location with a '#' separator such as + * http://host/archive.tar#name, the extension of the URL is still ".tar", thus + * incorrectly making the tar parser as a possible parser for the sub resource. + */ + final DigestURL subLocation = new DigestURL(parentTarURL, name); + final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999, + entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead()); + + /* + * If the parser(s) did not consume all bytes in the entry, these ones will be + * skipped by the next call to getNextTarEntry() + */ + if (subDocs == null) { + continue; + } + maindoc.addSubDocuments(subDocs); + for (Document subDoc : subDocs) { + if (subDoc.getAnchors() != null) { + totalProcessedLinks += subDoc.getAnchors().size(); + } + } + /* + * Check if a limit has been exceeded (we are sure to pass here when maxLinks + * has been exceeded as this limit require parser support for partial parsing to + * be detected) + */ + if (subDocs[0].isPartiallyParsed()) { + maindoc.setPartiallyParsed(true); + break; + } + } catch (final Parser.Failure e) { + AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage()); + } + } catch (final IOException e) { + AbstractParser.log.warn("tar parser:" + e.getMessage()); + break; + } + } + return new Document[] { maindoc }; + } + + /** + * Generate a parent URL to use for generating sub URLs on tar archive entries. + * + * @param tarURL + * the URL of the tar archive + * @return an URL ending with a "/" suitable as a base URL for archive entries + */ + private DigestURL createParentTarURL(final DigestURL tarURL) { + String locationStr = tarURL.toNormalform(false); + if (!locationStr.endsWith("/")) { + locationStr += "/"; + } + DigestURL parentTarURL; + try { + parentTarURL = new DigestURL(locationStr); + } catch (MalformedURLException e1) { + /* This should not happen */ + parentTarURL = tarURL; + } + return parentTarURL; + } + + /** + * Create the main resulting parsed document for a tar container + * + * @param location + * the parsed resource URL + * @param mimeType + * the media type of the resource + * @param charset + * the charset name if known + * @param an + * instance of tarParser that is registered as the parser origin of + * the document + * @return a Document instance + */ + public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, + final tarParser parser) { + final String filename = location.getFileName(); + final Document maindoc = new Document(location, mimeType, charset, parser, null, null, + AbstractParser + .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title + null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date()); + return maindoc; + } + public final static boolean isTar(File f) { if (!f.exists() || f.length() < 0x105) return false; RandomAccessFile raf = null; diff --git a/test/java/net/yacy/document/parser/tarParserTest.java b/test/java/net/yacy/document/parser/tarParserTest.java new file mode 100644 index 000000000..124ac5ffd --- /dev/null +++ b/test/java/net/yacy/document/parser/tarParserTest.java @@ -0,0 +1,253 @@ +// tarParserTest.java +// --------------------------- +// Copyright 2017 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.document.parser; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FileInputStream; +import java.util.Collection; + +import org.junit.Before; +import org.junit.Test; + +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.document.Document; +import net.yacy.document.VocabularyScraper; + +/** + * Unit tests for the {@link tarParser} class + * + * @author luccioman + * + */ +public class tarParserTest { + + /** The test resources folder */ + private final static File TEST_FOLDER = new File("test" + File.separator + "parsertest" + File.separator); + + /** + * All these test archives include two html test files in a sub folder, then a + * xml and a text test files at the root + */ + private static final String[] TAR_FILE_NAMES = { "umlaute_html_xml_txt_gnu.tar", // created with tar option + // --format=gnu + "umlaute_html_xml_txt_pax.tar", // created with tar option --format=pax + "umlaute_html_xml_txt_ustar.tar", // created with tar option --format=ustar + "umlaute_html_xml_txt_v7.tar", // created with tar option --format=v7 + }; + + /** Tar parser test instance */ + private tarParser parser; + + @Before + public void setUp() { + this.parser = new tarParser(); + } + + /** + * Unit test for the tarParser.parse() implementation with some test archives in + * various common tar formats. + * + * @throws Exception + * when an unexpected error occurred + */ + @Test + public void testParse() throws Exception { + + for (String fileName : TAR_FILE_NAMES) { + FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + Document[] documents = this.parser.parse(location, "application/tar", null, new VocabularyScraper(), 0, + inStream); + assertNotNull("Parser result must not be null for file " + fileName, documents); + + final String parsedText = documents[0].getTextString(); + assertNotNull("Parsed text must not be empty for file " + fileName, parsedText); + assertTrue("Parsed text must contain test word with umlaut char in file " + fileName, + parsedText.contains("Maßkrügen")); + assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML")); + assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML")); + assertTrue(parsedText.contains("URL reference in raw text file")); + assertTrue(parsedText.contains("UTF-8 encoded XML test file")); + + final Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals("Parsed URLs must contains all URLs from each test file included in the archive", 5, + detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt"))); + } finally { + inStream.close(); + } + } + } + + /** + * Test tarParser.parseWithLimits() with limits not reached. + * + * @throws Exception + * when an unexpected error occurred + */ + @Test + public void testParseWithLimitsNotReached() throws Exception { + for (String fileName : TAR_FILE_NAMES) { + + FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + /* Content within limits */ + try { + Document[] documents = this.parser.parseWithLimits(location, "application/tar", null, + new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE); + assertNotNull("Parser result must not be null for file " + fileName, documents); + + final String parsedText = documents[0].getTextString(); + assertNotNull("Parsed text must not be empty for file " + fileName, parsedText); + assertTrue("Parsed text must contain test word with umlaut char in file " + fileName, + parsedText.contains("Maßkrügen")); + assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML")); + assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML")); + assertTrue(parsedText.contains("UTF-8 encoded XML test file")); + assertTrue(parsedText.contains("URL reference in raw text file")); + + final Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5, + detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt"))); + } finally { + inStream.close(); + } + } + } + + /** + * Test tarParser.parseWithLimits() with links limit exceeded + * + * @throws Exception + * when an unexpected error occurred + */ + @Test + public void testParseWithLimitsLinksExceeded() throws Exception { + for (String fileName : TAR_FILE_NAMES) { + + FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + + /* Links limit exceeded from the third included file */ + try { + Document[] documents = this.parser.parseWithLimits(location, "application/tar", null, + new VocabularyScraper(), 0, inStream, 2, Long.MAX_VALUE); + assertNotNull("Parser result must not be null for file " + fileName, documents); + + final String parsedText = documents[0].getTextString(); + assertNotNull("Parsed text must not be empty for file " + fileName, parsedText); + assertTrue("Parsed text must contain test word with umlaut char in file " + fileName, + parsedText.contains("Maßkrügen")); + assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML")); + assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML")); + assertFalse(parsedText.contains("UTF-8 encoded XML test file")); + assertFalse(parsedText.contains("URL reference in raw text file")); + + final Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals("Parsed URLs must only contain URLs from test files withing links limit", 2, + detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html"))); + assertFalse(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))); + assertFalse(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/"))); + assertFalse(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt"))); + } finally { + inStream.close(); + } + } + } + + /** + * Test tarParser.parseWithLimits() with bytes limit exceeded + * + * @throws Exception + * when an unexpected error occurred + */ + @Test + public void testParseWithLimitsBytesExceeded() throws Exception { + for (String fileName : TAR_FILE_NAMES) { + + FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + + /* Bytes limit exceeded from the third included file. */ + final long maxBytes; + if ("umlaute_html_xml_txt_pax.tar".equals(fileName)) { + /* pax tar format uses more bytes for extended headers */ + maxBytes = 7000; + } else { + /* + * Limit calculation : five 512 bytes tar records = 512 bytes tar header for the + * html directory + (2 x (512 bytes tar header + html file content below 512 + * bytes, thus rounded to 512)) + */ + maxBytes = 512 * 5; + } + try { + Document[] documents = this.parser.parseWithLimits(location, "application/tar", null, + new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes); + assertNotNull("Parser result must not be null for file " + fileName, documents); + + final String parsedText = documents[0].getTextString(); + assertNotNull("Parsed text must not be empty for file " + fileName, parsedText); + assertTrue("Parsed text must contain test word with umlaut char in file " + fileName, + parsedText.contains("Maßkrügen")); + assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML")); + assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML")); + assertFalse(parsedText.contains("URL reference in raw text file")); + assertFalse(parsedText.contains("UTF-8 encoded XML test file")); + + final Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals("Parsed URLs must only contain URLs from test files withing bytes limit", 2, + detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html"))); + assertFalse(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))); + assertFalse(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/"))); + assertFalse(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt"))); + } finally { + inStream.close(); + } + } + } +} diff --git a/test/parsertest/umlaute_dc_xml_iso.xml b/test/parsertest/umlaute_dc_xml_iso.xml index 3524be737..897862eec 100644 --- a/test/parsertest/umlaute_dc_xml_iso.xml +++ b/test/parsertest/umlaute_dc_xml_iso.xml @@ -3,6 +3,7 @@ + ISO-8859-1 encoded XML test file In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen \ No newline at end of file diff --git a/test/parsertest/umlaute_dc_xml_utf8.xml b/test/parsertest/umlaute_dc_xml_utf8.xml index 71744f3f1..785d80005 100644 --- a/test/parsertest/umlaute_dc_xml_utf8.xml +++ b/test/parsertest/umlaute_dc_xml_utf8.xml @@ -3,6 +3,7 @@ + UTF-8 encoded XML test file In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen \ No newline at end of file diff --git a/test/parsertest/umlaute_html_iso.html b/test/parsertest/umlaute_html_iso.html index de56c7116..1e18fde75 100644 --- a/test/parsertest/umlaute_html_iso.html +++ b/test/parsertest/umlaute_html_iso.html @@ -6,5 +6,6 @@ In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.
+Example link in ISO-8859-1 encoded HTML diff --git a/test/parsertest/umlaute_html_utf8.html b/test/parsertest/umlaute_html_utf8.html index 8954c5c6a..9fcbf9175 100644 --- a/test/parsertest/umlaute_html_utf8.html +++ b/test/parsertest/umlaute_html_utf8.html @@ -1,10 +1,11 @@ - + In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.
+Example link in UTF-8 encoded HTML diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.tar b/test/parsertest/umlaute_html_xml_txt_gnu.tar new file mode 100644 index 0000000000000000000000000000000000000000..e454d3ea4b03789d6351d91cfccece71f4589d3e GIT binary patch literal 10240 zcmeHK%}(1u5Y9PIu{Bapwb$#|asH7?Kp>HVAgJYp9D9>kaqM)xCgEv%>01=3hdf9x zz4rw;bk@OX5NRSHv{c=dEZaM?v*T~(+Zp%yFu=Lz9Xbr#vT|@EwxuuLg-DDXDG<{% zOq1A!oKAEC>d1Otw`SIpm^%!iT#&dcw_7J)x%lM@`y&3yFmMu1FT~45!n-9+JZ;Qr z5#XX}F2+5}10m8a)4Gkn1(6r=H!a=FA$?;WtT6PS$G`Tfd9>3$Ki)Nn1`h~p8O26k`OtRt+4+|Pke_Y4^qHSmOUpL__ zfF8gmy5#@Ukza42whhrnmz2d&)$0n;bY+&>e%PxkvzojO6>Z)oFt1xVY3`xZ^V9Zj z3uU#_OFB>#-H1^wVm)j?%rP;rPE^kw(pZIERF8WaT!P-26_^g=3cSsW@OV7d#)Z@f zDVNJwH?U!-prOWBA$KNf7;h1+@!I~ z{E_Hm#2n0eZiV|ippCl){R4DFoW>k={eaf+Tx-6_Td*V=mFe};Lu&dtJ*D0t?_Prf zO+?F?ej2dpH5^mv9*ahW%UFPQ$LHA;M4_=hz-u%z{zLGRiN+^n0Lc%b%SbB50;5gKn6L-k3RKAkSn*9#l#H3YIq6 z^VG{f)Oo2d5s(N-1SA3y0f~S_Kq4R!kO)WwBmxoviGV~vA|Mfv2uK7Z0ulj{7I?VbWfVqzh(luFdvd757O7G0o+ zJV-CS_XYOQp&U7m5_b1!j4|$erj^i>q6r;Gc(o2WS zv8i%eGP#Vwr`x$rY^=@#>AuQqiu648kFn5`R* zXIh@8!8@{T+x0qau=-7_b!`or)~fAmNA7jZPqLjp@l+(~UE7wtXRclHzbd6kUUmM% z5^E9riTx_OulOGq!JWh1XP?gFdULz8vzKEm-+x^Kic#zrf*(oaQ6Ww`+ESt*p?TQ; zZ!-`B*#8~f)N%iRj#X0I|8385HP@*2@Y?=AwVKZ2{lC`sIlRX)mPMfC|BCru_i2v* z)xKYq<*sR4_SOD>&41l$k!LxR{m=LR?sxqUn}bjLTV#8%w@da9-|g;f5=Eu-Cv%h1 z{-8fKSenkrAoG(v5>cA?F{N8SDjV&tTtKpb5CV!37d{yZajx=Tis*As3DQLHM5yB8 zoGaw&uBQk-5oM&YPJ*GI<(soV<&8MxdJP`SKxi{Y*X#{habd>mFw?61bZ2 zlb#|SUZXTl13w<7xuA;)p=@kT{Ly*LNgO3-1Y9V6COiq!kcTBuUFypzOnrx`l<3q6 z@q+G4{@tFGz3g}t48hOd7R)%O%2zcQ7!C^_vZ)0q4%M6|OsK^C5@ zeeAy{2cHfGTYIF6ox|c7g5n^}xRz$8)PUDYnL%}?hQWyEDvY9rBGlj~<-O_!s*`*T z?q)&wcs$m|X4whzJdf%IH4GJ=Q1gpK_!BkB-zaTTUDFQ2VAlS;$l|hf7*HPb5hNdS z$}~n{SVGEy`#cY_=v?M8WF6Fo!J3Ff%r|ZZ^glyJMDSdYlPKn0I(ur?D4gp_CMqt! zEIg!K{#Kq+ejs5ZK>0WNb>)#@4p7C^8Z%Fe`dL;|4k%UU-S3xf2=~dwD#f0T>{B;oqQ}HBgHVAgJYp9D9>kaqM)xCgEv%>01=3hdf9x zz4rw;bk@dc5NRSHv{c=dthIM%XJ@~eZ)ZH%Ow_7)#o+CM2a-fyUUe4q|_&*-`cFgI8__&C9 zudIo;jTtHMD4OP?-?KPKP_s*UcQ#H)g>KMgO_~>#tfzJDv06U9{h6 zAEM*4w}%HiNKr9Jr%6i%lci={Q>3~Uxr-Z<;*sb!%N-|Fgy?o~5?5TN$! zEyFTufis{1iZ~r`M7;p*hrKR?DWhDA2X-83`Bum{>U&+7kI-9>G6ch@Z4047Z7766 zPoD-_y~|)HFqP3>LlFj7G4?~p_6K3ae?thRxjVH-W1k}53x)_xNNh$la6*^52~c%x zizB4A!%-4+R)pW*o)QM$J+2ppo29?t{c8RfjO_j|7J&FVu}W|aNdEtU;CMOwKR7*7 z%Vn#glI54c%~Jk7~0X=r;vzoDc#c^fj?yi8zjuX@tj zL#OAbo!vIda;F=2At^c`qgu%N*Z`knVql%9t}~*M3cIK-cQtqfy)!K^97I+4nkV7O zWTH(9i4sz&RIqMf!%#s&jjjT2Pt_pWQt~KksRGxTuOG+EPgc7Qram1(@eyI75e(ZR zlx*Chk;A;PsAI$!jJi&hd)%kZy9xaRR79Lc9Q8b(*700wzQ|qBBnp+;_0mgf_Bpww z-XQm0fdWND+n#+Ive`9UQ^_8SN`=W-fOW^_bO^lAT<_pD5*h!&dGS=^Q@%$2G1G$o ztN#BClGXoaNr!&{otVTdN&Y_u>xZuS@#X6#0$0nQog5-4rYVDJn2Oq%*%KhoG$jwJ zCVvG>i|l#m%55~{e4^6N zv3J!_oG{=t;3~hKPzBxnH55)~9E+(nVQQkQ3?yF^?5#Q61d7}zo7_+s~G#CWBbD};(sB8(rnM{@x-Ud_ks}u7ZRTl4V=)WZUR&t+hPd| z-(e{UIvK=iTPlQu_m}HMVYc*7DDf!&3r2eW7YjO^6Tp9H(6-jR6}>6%({?LHvRC@= z@c;1aSS^>Wib~cW0=H}V|8oQNe~j4eQwg6k;341t`1oIweq|e_so!-jXKC_>j}?iR ze+o)G#{Xiel;(fkgtI^=hNTn9|Cb}bSwnpnqK&R7iy*5v6r$Dp)tLVW=RXM%MwiXKE1bD0!4V3k9yToIi<~pUie0Ono|r z;v>RDBN)sglo)Q&$YI_@)G?wDdR?c=J?_)y{e=DrDk4rJj{2TY>v$=(oaC-(5{1hA zX6+$0|C*dq?~wbTK!GBnW6!^g*!%{Lsl>*rQsFWdVBPb1(FIXxZVvDUiIo2kym+SZ z8Q&oPoMFNL$L@ayN$Y>Jq{Cf6CvX?kCI6p;^;75k`u07GjjQF)Pmd53)09CqOhs+X z>?x3Ewjd9xCbxpMMfP&*<)7-jl$Qud1SA3y0f~S_Kq4R!kO)WwBmxoviGV~vA|Mfv S2uK7Z0ulj1$iSrC>0 literal 0 HcmV?d00001 diff --git a/test/parsertest/umlaute_linux.txt b/test/parsertest/umlaute_linux.txt index 62bb22c22..77033cbc9 100755 --- a/test/parsertest/umlaute_linux.txt +++ b/test/parsertest/umlaute_linux.txt @@ -1 +1,2 @@ In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen +URL reference in raw text file : http://localhost/umlaute_linux.txt