diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index f1a43e0a2..938bfd58a 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -234,6 +234,9 @@ public class gzipParser extends AbstractParser implements Parser { Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes); if (docs != null) { maindoc.addSubDocuments(docs); + if(docs.length > 0 && docs[0].isPartiallyParsed()) { + maindoc.setPartiallyParsed(true); + } } } catch (final Exception e) { throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location); diff --git a/test/java/net/yacy/document/parser/gzipParserTest.java b/test/java/net/yacy/document/parser/gzipParserTest.java index 4cfa990a4..ffca1c195 100644 --- a/test/java/net/yacy/document/parser/gzipParserTest.java +++ b/test/java/net/yacy/document/parser/gzipParserTest.java @@ -22,14 +22,20 @@ package net.yacy.document.parser; -import static org.junit.Assert.*; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertEquals; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Collection; import org.junit.Test; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; @@ -45,32 +51,220 @@ public class gzipParserTest { /** * Unit test for the gzipParser.parse() function with some small gz test files. - * @throws Failure when a file could not be parsed - * @throws InterruptedException when the test was interrupted before its termination - * @throws IOException when a read/write error occurred + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred */ @Test public void testParse() throws Failure, InterruptedException, IOException { - final String[] fileNames = { - "umlaute_html_utf8.html.gz", - "umlaute_linux.txt.gz" - }; + final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" }; final File folder = new File("test" + File.separator + "parsertest" + File.separator); gzipParser parser = new gzipParser(); - + + for (String fileName : fileNames) { + FileInputStream inStream = new FileInputStream(new File(folder, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + Document[] documents = parser.parse(location, "application/gzip", StandardCharsets.UTF_8.name(), + new VocabularyScraper(), 0, inStream); + assertNotNull("Parser result must not be null for file " + fileName, documents); + assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertNotNull("Detected URLS must not be null for file " + fileName, anchors); + assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size()); + assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_")); + } finally { + inStream.close(); + } + } + } + + /** + * Testing parse integration with the tar parser on a test tgz archive. + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred + */ + @Test + public void testParseTgz() throws Failure, InterruptedException, IOException { + final String fileName = "umlaute_html_xml_txt_gnu.tgz"; + final File folder = new File("test" + File.separator + "parsertest" + File.separator); + gzipParser parser = new gzipParser(); + + FileInputStream inStream = new FileInputStream(new File(folder, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + Document[] documents = parser.parse(location, "application/gzip", StandardCharsets.UTF_8.name(), + new VocabularyScraper(), 0, inStream); + + assertNotNull("Parser result must not be null for file " + fileName, documents); + + final String parsedText = documents[0].getTextString(); + assertNotNull("Parsed text must not be empty for file " + fileName, parsedText); + assertTrue("Parsed text must contain test word with umlaut char in file " + fileName, + parsedText.contains("Maßkrügen")); + assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML")); + assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML")); + assertTrue(parsedText.contains("URL reference in raw text file")); + assertTrue(parsedText.contains("UTF-8 encoded XML test file")); + + final Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals("Parsed URLs must contains all URLs from each test file included in the archive", 5, + detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt"))); + } finally { + inStream.close(); + } + } + + /** + * Unit test for the gzipParser.parseWithLimits() function with some small gz + * test files which content is within limits. + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred + */ + @Test + public void testParseWithLimits() throws Failure, InterruptedException, IOException { + final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" }; + final File folder = new File("test" + File.separator + "parsertest" + File.separator); + gzipParser parser = new gzipParser(); + for (String fileName : fileNames) { FileInputStream inStream = new FileInputStream(new File(folder, fileName)); DigestURL location = new DigestURL("http://localhost/" + fileName); try { - Document[] documents = parser.parse(location, "application/gzip", null, new VocabularyScraper(), 0, - inStream); + Document[] documents = parser.parseWithLimits(location, "application/gzip", + StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000, + 10000); assertNotNull("Parser result must not be null for file " + fileName, documents); assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); - assertTrue("Parsed text must contain test word with umlaut char" + fileName, documents[0].getTextString().contains("Maßkrügen")); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertNotNull("Detected URLs must not be null for file " + fileName, anchors); + assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size()); + assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_")); + assertFalse("Parse document must not be marked as partially parsed for file " + fileName, + documents[0].isPartiallyParsed()); } finally { inStream.close(); } } + + } + + /** + * Unit test for the gzipParser.parseWithLimits() when maxLinks limit is exceeded + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred + */ + @Test + public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException { + final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" }; + final File folder = new File("test" + File.separator + "parsertest" + File.separator); + gzipParser parser = new gzipParser(); + + /* maxLinks limit exceeded */ + for (String fileName : fileNames) { + FileInputStream inStream = new FileInputStream(new File(folder, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + Document[] documents = parser.parseWithLimits(location, "application/gzip", + StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE); + assertNotNull("Parser result must not be null for file " + fileName, documents); + assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty()); + assertTrue("Parsed document must be marked as partially parsed for file " + fileName, + documents[0].isPartiallyParsed()); + } finally { + inStream.close(); + } + } + } + + /** + * Unit test for the gzipParser.parseWithLimits() when maxBytes limit is exceeded + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred + */ + @Test + public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException { + final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" }; + final File folder = new File("test" + File.separator + "parsertest" + File.separator); + gzipParser parser = new gzipParser(); + + String fileName = fileNames[0]; + FileInputStream inStream = new FileInputStream(new File(folder, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + /* The bytes limit is set to let parsing the beginning text part, but stop before reaching the tag */ + final long maxBytes = 258; + Document[] documents = parser.parseWithLimits(location, "application/gzip", StandardCharsets.UTF_8.name(), + new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes); + assertNotNull("Parser result must not be null for file " + fileName, documents); + assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty()); + assertTrue("Parsed document must be marked as partially parsed for file " + fileName, + documents[0].isPartiallyParsed()); + } finally { + inStream.close(); + } + + fileName = fileNames[1]; + inStream = new FileInputStream(new File(folder, fileName)); + location = new DigestURL("http://localhost/" + fileName); + try { + /* The bytes limit is set to let parsing the beginning of the text, but stop before reaching the URL */ + final long maxBytes = 65; + Document[] documents = parser.parseWithLimits(location, "application/gzip", StandardCharsets.UTF_8.name(), + new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes); + assertNotNull("Parser result must not be null for file " + fileName, documents); + assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty()); + assertTrue("Parsed document must be marked as partially parsed for file " + fileName, + documents[0].isPartiallyParsed()); + } finally { + inStream.close(); + } } } diff --git a/test/parsertest/umlaute_html_utf8.html.gz b/test/parsertest/umlaute_html_utf8.html.gz index b5de39ff7..37d814a14 100644 Binary files a/test/parsertest/umlaute_html_utf8.html.gz and b/test/parsertest/umlaute_html_utf8.html.gz differ diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.tgz b/test/parsertest/umlaute_html_xml_txt_gnu.tgz new file mode 100644 index 000000000..b10791612 Binary files /dev/null and b/test/parsertest/umlaute_html_xml_txt_gnu.tgz differ diff --git a/test/parsertest/umlaute_linux.txt.gz b/test/parsertest/umlaute_linux.txt.gz index f576a9b1a..a86b3c5c8 100644 Binary files a/test/parsertest/umlaute_linux.txt.gz and b/test/parsertest/umlaute_linux.txt.gz differ