diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java
index 815497beb..d658364a4 100644
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@@ -29,11 +29,14 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.RandomAccessFile;
+import java.net.MalformedURLException;
 import java.util.Date;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+
 import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.document.AbstractParser;
@@ -43,9 +46,6 @@ import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
 
-import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
-
 // this is a new implementation of this parser idiom using multiple documents as result set
 
 /**
  * Parses the tar file and each contained file,
@@ -75,6 +75,8 @@ public class tarParser extends AbstractParser implements Parser {
         final String filename = location.getFileName();
         final String ext = MultiProtocolURL.getFileExtension(filename);
 
+        final DigestURL parentTarURL = createParentTarURL(location);
+
         // TODO is this hack really useful ? These extensions are already handled by the gzipParser
         if (ext.equals("gz") || ext.equals("tgz")) {
             try {
                 source = new GZIPInputStream(source);
@@ -85,26 +87,8 @@
         TarArchiveEntry entry;
         final TarArchiveInputStream tis = new TarArchiveInputStream(source);
 
-        // create maindoc for this bzip container
-        final Document maindoc = new Document(
-                location,
-                mimeType,
-                charset,
-                this,
-                null,
-                null,
-                AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
-                null,
-                null,
-                null,
-                null,
-                0.0d, 0.0d,
-                (Object) null,
-                null,
-                null,
-                null,
-                false,
-                new Date());
+        // create maindoc for this tar container
+        final Document maindoc = createMainDocument(location, mimeType, charset, this);
 
         // loop through the elements in the tar file and parse every single file inside
         while (true) {
            try {
@@ -118,8 +102,18 @@
                try {
                    tmp = FileUtils.createTempFile(this.getClass(), name);
                    FileUtils.copy(tis, tmp, entry.getSize());
-                    final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
-                    if (subDocs == null) continue;
+                    /*
+                     * Create an appropriate sub location to prevent unwanted fallback to the tarParser on resources included in the archive.
+                     * We use the tar file name as the parent sub path. Example: http://host/archive.tar/name.
+                     * Indeed, if we create a sub location with a '#' separator such as http://host/archive.tar#name, the
+                     * extension of the URL is still ".tar", thus incorrectly making the tar parser
+                     * a possible parser for the sub resource.
+                     */
+                    final DigestURL subLocation = new DigestURL(parentTarURL, name);
+                    final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, scraper, timezoneOffset, 999, tmp);
+                    if (subDocs == null) {
+                        continue;
+                    }
                    maindoc.addSubDocuments(subDocs);
                } catch (final Parser.Failure e) {
                    AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
@@ -134,6 +128,147 @@
         return new Document[]{maindoc};
     }
 
+    @Override
+    public boolean isParseWithLimitsSupported() {
+        return true;
+    }
+
+    @Override
+    public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
+            final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks,
+            final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException {
+
+        final DigestURL parentTarURL = createParentTarURL(location);
+
+        final TarArchiveInputStream tis = new TarArchiveInputStream(source);
+
+        // create maindoc for this tar container
+        final Document maindoc = createMainDocument(location, mimeType, charset, this);
+
+        // loop through the elements in the tar file and parse every single file inside
+        TarArchiveEntry entry;
+        int totalProcessedLinks = 0;
+        while (true) {
+            try {
+                entry = tis.getNextTarEntry();
+                if (entry == null) {
+                    break;
+                }
+
+                /*
+                 * Here we are sure at least one entry still has to be processed: let's check
+                 * the bytes limit now, as sub-parsers applied to any previous entries may not
+                 * support partial parsing and would have thrown a Parser.Failure instead of
+                 * marking the document as partially parsed.
+                 */
+                if (tis.getBytesRead() >= maxBytes) {
+                    maindoc.setPartiallyParsed(true);
+                    break;
+                }
+
+                if (entry.isDirectory() || entry.getSize() <= 0) {
+                    continue;
+                }
+                final String name = entry.getName();
+                final int idx = name.lastIndexOf('.');
+                final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : "");
+                try {
+                    /*
+                     * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
+                     * compressed content
+                     */
+
+                    /*
+                     * Create an appropriate sub location to prevent unwanted fallback to the
+                     * tarParser on resources included in the archive. We use the tar file name as
+                     * the parent sub path. Example: http://host/archive.tar/name. Indeed, if we
+                     * create a sub location with a '#' separator such as
+                     * http://host/archive.tar#name, the extension of the URL is still ".tar", thus
+                     * incorrectly making the tar parser a possible parser for the sub resource.
+                     */
+                    final DigestURL subLocation = new DigestURL(parentTarURL, name);
+                    final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999,
+                            entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead());
+
+                    /*
+                     * If the parser(s) did not consume all bytes in the entry, the remaining
+                     * ones will be skipped by the next call to getNextTarEntry()
+                     */
+                    if (subDocs == null) {
+                        continue;
+                    }
+                    maindoc.addSubDocuments(subDocs);
+                    for (Document subDoc : subDocs) {
+                        if (subDoc.getAnchors() != null) {
+                            totalProcessedLinks += subDoc.getAnchors().size();
+                        }
+                    }
+                    /*
+                     * Check if a limit has been exceeded (we are sure to reach this point when
+                     * maxLinks has been exceeded, as this limit requires parser support for
+                     * partial parsing to be detected)
+                     */
+                    if (subDocs[0].isPartiallyParsed()) {
+                        maindoc.setPartiallyParsed(true);
+                        break;
+                    }
+                } catch (final Parser.Failure e) {
+                    AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
+                }
+            } catch (final IOException e) {
+                AbstractParser.log.warn("tar parser:" + e.getMessage());
+                break;
+            }
+        }
+        return new Document[] { maindoc };
+    }
+
+    /**
+     * Generate the parent URL used to derive sub URLs for tar archive entries.
+     *
+     * @param tarURL
+     *            the URL of the tar archive
+     * @return a URL ending with "/", suitable as a base URL for archive entries
+     */
+    private DigestURL createParentTarURL(final DigestURL tarURL) {
+        String locationStr = tarURL.toNormalform(false);
+        if (!locationStr.endsWith("/")) {
+            locationStr += "/";
+        }
+        DigestURL parentTarURL;
+        try {
+            parentTarURL = new DigestURL(locationStr);
+        } catch (final MalformedURLException e) {
+            /* This should not happen */
+            parentTarURL = tarURL;
+        }
+        return parentTarURL;
+    }
+
+    /**
+     * Create the main resulting parsed document for a tar container
+     *
+     * @param location
+     *            the parsed resource URL
+     * @param mimeType
+     *            the media type of the resource
+     * @param charset
+     *            the charset name if known
+     * @param parser
+     *            an instance of tarParser that is registered as the parser origin
+     *            of the document
+     * @return a Document instance
+     */
+    public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
+            final tarParser parser) {
+        final String filename = location.getFileName();
+        final Document maindoc = new Document(location, mimeType, charset, parser, null, null,
+                AbstractParser
+                        .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
+                null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
+        return maindoc;
+    }
+
     public final static boolean isTar(File f) {
         if (!f.exists() || f.length() < 0x105) return false;
         RandomAccessFile raf = null;
diff --git a/test/java/net/yacy/document/parser/tarParserTest.java b/test/java/net/yacy/document/parser/tarParserTest.java
new file mode 100644
index 000000000..124ac5ffd
--- /dev/null
+++ b/test/java/net/yacy/document/parser/tarParserTest.java
@@ -0,0 +1,253 @@
+// tarParserTest.java
+// ---------------------------
+// Copyright 2017 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.document.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Collection;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.Document;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Unit tests for the {@link tarParser} class
+ *
+ * @author luccioman
+ */
+public class tarParserTest {
+
+    /** The test resources folder */
+    private final static File TEST_FOLDER = new File("test" + File.separator + "parsertest" + File.separator);
+
+    /**
+     * All these test archives include two HTML test files in a sub folder, plus an
+     * XML and a text test file at the root
+     */
+    private static final String[] TAR_FILE_NAMES = {
+            "umlaute_html_xml_txt_gnu.tar", // created with tar option --format=gnu
+            "umlaute_html_xml_txt_pax.tar", // created with tar option --format=pax
+            "umlaute_html_xml_txt_ustar.tar", // created with tar option --format=ustar
+            "umlaute_html_xml_txt_v7.tar", // created with tar option --format=v7
+    };
+
+    /** Tar parser test instance */
+    private tarParser parser;
+
+    @Before
+    public void setUp() {
+        this.parser = new tarParser();
+    }
+
+    /**
+     * Unit test for the tarParser.parse() implementation with some test archives in
+     * various common tar formats.
+     *
+     * @throws Exception
+     *             when an unexpected error occurs
+     */
+    @Test
+    public void testParse() throws Exception {
+
+        for (String fileName : TAR_FILE_NAMES) {
+            FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
+            DigestURL location = new DigestURL("http://localhost/" + fileName);
+            try {
+                Document[] documents = this.parser.parse(location, "application/tar", null, new VocabularyScraper(), 0,
+                        inStream);
+                assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+                final String parsedText = documents[0].getTextString();
+                assertNotNull("Parsed text must not be null for file " + fileName, parsedText);
+                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                        parsedText.contains("Maßkrügen"));
+                assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+                assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+                assertTrue(parsedText.contains("URL reference in raw text file"));
+                assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+
+                final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+                assertNotNull(detectedAnchors);
+                assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
+                        detectedAnchors.size());
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+            } finally {
+                inStream.close();
+            }
+        }
+    }
+
+    /**
+     * Test tarParser.parseWithLimits() with limits not reached.
+     *
+     * @throws Exception
+     *             when an unexpected error occurs
+     */
+    @Test
+    public void testParseWithLimitsNotReached() throws Exception {
+        for (String fileName : TAR_FILE_NAMES) {
+
+            FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
+            DigestURL location = new DigestURL("http://localhost/" + fileName);
+            /* Content within limits */
+            try {
+                Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
+                        new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE);
+                assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+                final String parsedText = documents[0].getTextString();
+                assertNotNull("Parsed text must not be null for file " + fileName, parsedText);
+                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                        parsedText.contains("Maßkrügen"));
+                assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+                assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+                assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+                assertTrue(parsedText.contains("URL reference in raw text file"));
+
+                final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+                assertNotNull(detectedAnchors);
+                assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
+                        detectedAnchors.size());
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+            } finally {
+                inStream.close();
+            }
+        }
+    }
+
+    /**
+     * Test tarParser.parseWithLimits() with links limit exceeded
+     *
+     * @throws Exception
+     *             when an unexpected error occurs
+     */
+    @Test
+    public void testParseWithLimitsLinksExceeded() throws Exception {
+        for (String fileName : TAR_FILE_NAMES) {
+
+            FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
+            DigestURL location = new DigestURL("http://localhost/" + fileName);
+
+            /* Links limit exceeded from the third included file */
+            try {
+                Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
+                        new VocabularyScraper(), 0, inStream, 2, Long.MAX_VALUE);
+                assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+                final String parsedText = documents[0].getTextString();
+                assertNotNull("Parsed text must not be null for file " + fileName, parsedText);
+                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                        parsedText.contains("Maßkrügen"));
+                assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+                assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+                assertFalse(parsedText.contains("UTF-8 encoded XML test file"));
+                assertFalse(parsedText.contains("URL reference in raw text file"));
+
+                final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+                assertNotNull(detectedAnchors);
+                assertEquals("Parsed URLs must only contain URLs from test files within the links limit", 2,
+                        detectedAnchors.size());
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+                assertFalse(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+                assertFalse(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+                assertFalse(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+            } finally {
+                inStream.close();
+            }
+        }
+    }
+
+    /**
+     * Test tarParser.parseWithLimits() with bytes limit exceeded
+     *
+     * @throws Exception
+     *             when an unexpected error occurs
+     */
+    @Test
+    public void testParseWithLimitsBytesExceeded() throws Exception {
+        for (String fileName : TAR_FILE_NAMES) {
+
+            FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));
+            DigestURL location = new DigestURL("http://localhost/" + fileName);
+
+            /* Bytes limit exceeded from the third included file. */
+            final long maxBytes;
+            if ("umlaute_html_xml_txt_pax.tar".equals(fileName)) {
+                /* pax tar format uses more bytes for extended headers */
+                maxBytes = 7000;
+            } else {
+                /*
+                 * Limit calculation: five 512-byte tar records = one 512-byte tar header for
+                 * the html directory + (2 x (512-byte tar header + html file content below 512
+                 * bytes, thus rounded up to 512))
+                 */
+                maxBytes = 512 * 5;
+            }
+            try {
+                Document[] documents = this.parser.parseWithLimits(location, "application/tar", null,
+                        new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+                assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+                final String parsedText = documents[0].getTextString();
+                assertNotNull("Parsed text must not be null for file " + fileName, parsedText);
+                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                        parsedText.contains("Maßkrügen"));
+                assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+                assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+                assertFalse(parsedText.contains("URL reference in raw text file"));
+                assertFalse(parsedText.contains("UTF-8 encoded XML test file"));
+
+                final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+                assertNotNull(detectedAnchors);
+                assertEquals("Parsed URLs must only contain URLs from test files within the bytes limit", 2,
+                        detectedAnchors.size());
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+                assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+                assertFalse(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+                assertFalse(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+                assertFalse(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+            } finally {
+                inStream.close();
+            }
+        }
+    }
+}
diff --git a/test/parsertest/umlaute_dc_xml_iso.xml b/test/parsertest/umlaute_dc_xml_iso.xml
index 3524be737..897862eec 100644
--- a/test/parsertest/umlaute_dc_xml_iso.xml
+++ b/test/parsertest/umlaute_dc_xml_iso.xml
@@ -3,6 +3,7 @@
+	<dc:title>ISO-8859-1 encoded XML test file</dc:title>
 	In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen
\ No newline at end of file
diff --git a/test/parsertest/umlaute_dc_xml_utf8.xml b/test/parsertest/umlaute_dc_xml_utf8.xml
index 71744f3f1..785d80005 100644
--- a/test/parsertest/umlaute_dc_xml_utf8.xml
+++ b/test/parsertest/umlaute_dc_xml_utf8.xml
@@ -3,6 +3,7 @@
+	<dc:title>UTF-8 encoded XML test file</dc:title>
 	In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen
\ No newline at end of file
diff --git a/test/parsertest/umlaute_html_iso.html b/test/parsertest/umlaute_html_iso.html
index de56c7116..1e18fde75 100644
--- a/test/parsertest/umlaute_html_iso.html
+++ b/test/parsertest/umlaute_html_iso.html
@@ -6,5 +6,6 @@
 In München steht ein Hofbräuhaus.
 Dort gibt es Bier aus Maßkrügen.<br>
+<a href="http://localhost/umlaute_html_iso.html">Example link in ISO-8859-1 encoded HTML</a>
 </body>
 </html>
diff --git a/test/parsertest/umlaute_html_utf8.html b/test/parsertest/umlaute_html_utf8.html
index 8954c5c6a..9fcbf9175 100644
--- a/test/parsertest/umlaute_html_utf8.html
+++ b/test/parsertest/umlaute_html_utf8.html
@@ -1,10 +1,11 @@
-
+
 In München steht ein Hofbräuhaus.
 Dort gibt es Bier aus Maßkrügen.<br>
+<a href="http://localhost/umlaute_html_utf8.html">Example link in UTF-8 encoded HTML</a>
 </body>
 </html>
diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.tar b/test/parsertest/umlaute_html_xml_txt_gnu.tar
new file mode 100644
index 000000000..e454d3ea4
Binary files /dev/null and b/test/parsertest/umlaute_html_xml_txt_gnu.tar differ
diff --git a/test/parsertest/umlaute_html_xml_txt_pax.tar b/test/parsertest/umlaute_html_xml_txt_pax.tar
new file mode 100644
index 000000000..3293be4ce
Binary files /dev/null and b/test/parsertest/umlaute_html_xml_txt_pax.tar differ
diff --git a/test/parsertest/umlaute_html_xml_txt_ustar.tar b/test/parsertest/umlaute_html_xml_txt_ustar.tar
new file mode 100644
index 000000000..fb7c004e7
Binary files /dev/null and b/test/parsertest/umlaute_html_xml_txt_ustar.tar differ
diff --git a/test/parsertest/umlaute_html_xml_txt_v7.tar b/test/parsertest/umlaute_html_xml_txt_v7.tar
new file mode 100644
index 000000000..6009dc8b5
Binary files /dev/null and b/test/parsertest/umlaute_html_xml_txt_v7.tar differ
diff --git a/test/parsertest/umlaute_linux.txt b/test/parsertest/umlaute_linux.txt
index 62bb22c22..77033cbc9 100755
--- a/test/parsertest/umlaute_linux.txt
+++ b/test/parsertest/umlaute_linux.txt
@@ -1 +1,2 @@
 In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen
+URL reference in raw text file : http://localhost/umlaute_linux.txt
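
Note on the sub-location scheme used throughout this patch: the hunks above replace the old "#"-separator sub URLs with a parent path that ends in "/". The following minimal sketch, which is not part of the patch, illustrates why that matters; it uses only java.net.URI, and the class name and entry name below are illustrative only:

    import java.net.URI;

    public class SubLocationSketch {
        public static void main(String[] args) {
            // Parent URL for the archive, forced to end with "/" as createParentTarURL() does
            final URI parentTarURL = URI.create("http://host/archive.tar/");

            // Resolving an entry name yields http://host/archive.tar/doc.html: the file
            // extension is now ".html", so extension-based parser selection no longer
            // treats the tar parser as a candidate for the entry
            System.out.println(parentTarURL.resolve("doc.html"));

            // The old scheme kept the ".tar" extension in the URL path, so the entry
            // could incorrectly fall back to the tar parser
            System.out.println(URI.create("http://host/archive.tar#doc.html"));
        }
    }

The same reasoning explains the DigestURL(parentTarURL, name) calls in parse() and parseWithLimits(): each archive entry gets a URL whose extension reflects the entry itself, not the enclosing archive.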