From c6ae87168a6eef116e96fb7907568a2f2435bea7 Mon Sep 17 00:00:00 2001 From: luccioman Date: Tue, 22 Aug 2017 14:13:00 +0200 Subject: [PATCH] Added unit tests on the gzip parser. --- .../net/yacy/document/parser/gzipParser.java | 3 + .../yacy/document/parser/gzipParserTest.java | 218 +++++++++++++++++- test/parsertest/umlaute_html_utf8.html.gz | Bin 257 -> 309 bytes test/parsertest/umlaute_html_xml_txt_gnu.tgz | Bin 0 -> 868 bytes test/parsertest/umlaute_linux.txt.gz | Bin 109 -> 165 bytes 5 files changed, 209 insertions(+), 12 deletions(-) create mode 100644 test/parsertest/umlaute_html_xml_txt_gnu.tgz diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index f1a43e0a2..938bfd58a 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -234,6 +234,9 @@ public class gzipParser extends AbstractParser implements Parser { Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes); if (docs != null) { maindoc.addSubDocuments(docs); + if(docs.length > 0 && docs[0].isPartiallyParsed()) { + maindoc.setPartiallyParsed(true); + } } } catch (final Exception e) { throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location); diff --git a/test/java/net/yacy/document/parser/gzipParserTest.java b/test/java/net/yacy/document/parser/gzipParserTest.java index 4cfa990a4..ffca1c195 100644 --- a/test/java/net/yacy/document/parser/gzipParserTest.java +++ b/test/java/net/yacy/document/parser/gzipParserTest.java @@ -22,14 +22,20 @@ package net.yacy.document.parser; -import static org.junit.Assert.*; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertEquals; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Collection; import org.junit.Test; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; @@ -45,32 +51,220 @@ public class gzipParserTest { /** * Unit test for the gzipParser.parse() function with some small gz test files. - * @throws Failure when a file could not be parsed - * @throws InterruptedException when the test was interrupted before its termination - * @throws IOException when a read/write error occurred + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred */ @Test public void testParse() throws Failure, InterruptedException, IOException { - final String[] fileNames = { - "umlaute_html_utf8.html.gz", - "umlaute_linux.txt.gz" - }; + final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" }; final File folder = new File("test" + File.separator + "parsertest" + File.separator); gzipParser parser = new gzipParser(); - + + for (String fileName : fileNames) { + FileInputStream inStream = new FileInputStream(new File(folder, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + Document[] documents = parser.parse(location, "application/gzip", StandardCharsets.UTF_8.name(), + new VocabularyScraper(), 0, inStream); + assertNotNull("Parser result must not be null for file " + fileName, documents); + assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertNotNull("Detected URLS must not be null for file " + fileName, anchors); + assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size()); + assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_")); + } finally { + inStream.close(); + } + } + } + + /** + * Testing parse integration with the tar parser on a test tgz archive. + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred + */ + @Test + public void testParseTgz() throws Failure, InterruptedException, IOException { + final String fileName = "umlaute_html_xml_txt_gnu.tgz"; + final File folder = new File("test" + File.separator + "parsertest" + File.separator); + gzipParser parser = new gzipParser(); + + FileInputStream inStream = new FileInputStream(new File(folder, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + Document[] documents = parser.parse(location, "application/gzip", StandardCharsets.UTF_8.name(), + new VocabularyScraper(), 0, inStream); + + assertNotNull("Parser result must not be null for file " + fileName, documents); + + final String parsedText = documents[0].getTextString(); + assertNotNull("Parsed text must not be empty for file " + fileName, parsedText); + assertTrue("Parsed text must contain test word with umlaut char in file " + fileName, + parsedText.contains("Maßkrügen")); + assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML")); + assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML")); + assertTrue(parsedText.contains("URL reference in raw text file")); + assertTrue(parsedText.contains("UTF-8 encoded XML test file")); + + final Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals("Parsed URLs must contains all URLs from each test file included in the archive", 5, + detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt"))); + } finally { + inStream.close(); + } + } + + /** + * Unit test for the gzipParser.parseWithLimits() function with some small gz + * test files which content is within limits. + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred + */ + @Test + public void testParseWithLimits() throws Failure, InterruptedException, IOException { + final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" }; + final File folder = new File("test" + File.separator + "parsertest" + File.separator); + gzipParser parser = new gzipParser(); + for (String fileName : fileNames) { FileInputStream inStream = new FileInputStream(new File(folder, fileName)); DigestURL location = new DigestURL("http://localhost/" + fileName); try { - Document[] documents = parser.parse(location, "application/gzip", null, new VocabularyScraper(), 0, - inStream); + Document[] documents = parser.parseWithLimits(location, "application/gzip", + StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000, + 10000); assertNotNull("Parser result must not be null for file " + fileName, documents); assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); - assertTrue("Parsed text must contain test word with umlaut char" + fileName, documents[0].getTextString().contains("Maßkrügen")); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertNotNull("Detected URLs must not be null for file " + fileName, anchors); + assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size()); + assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_")); + assertFalse("Parse document must not be marked as partially parsed for file " + fileName, + documents[0].isPartiallyParsed()); } finally { inStream.close(); } } + + } + + /** + * Unit test for the gzipParser.parseWithLimits() when maxLinks limit is exceeded + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred + */ + @Test + public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException { + final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" }; + final File folder = new File("test" + File.separator + "parsertest" + File.separator); + gzipParser parser = new gzipParser(); + + /* maxLinks limit exceeded */ + for (String fileName : fileNames) { + FileInputStream inStream = new FileInputStream(new File(folder, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + Document[] documents = parser.parseWithLimits(location, "application/gzip", + StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE); + assertNotNull("Parser result must not be null for file " + fileName, documents); + assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty()); + assertTrue("Parsed document must be marked as partially parsed for file " + fileName, + documents[0].isPartiallyParsed()); + } finally { + inStream.close(); + } + } + } + + /** + * Unit test for the gzipParser.parseWithLimits() when maxBytes limit is exceeded + * + * @throws Failure + * when a file could not be parsed + * @throws InterruptedException + * when the test was interrupted before its termination + * @throws IOException + * when a read/write error occurred + */ + @Test + public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException { + final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" }; + final File folder = new File("test" + File.separator + "parsertest" + File.separator); + gzipParser parser = new gzipParser(); + + String fileName = fileNames[0]; + FileInputStream inStream = new FileInputStream(new File(folder, fileName)); + DigestURL location = new DigestURL("http://localhost/" + fileName); + try { + /* The bytes limit is set to let parsing the beginning text part, but stop before reaching the tag */ + final long maxBytes = 258; + Document[] documents = parser.parseWithLimits(location, "application/gzip", StandardCharsets.UTF_8.name(), + new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes); + assertNotNull("Parser result must not be null for file " + fileName, documents); + assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty()); + assertTrue("Parsed document must be marked as partially parsed for file " + fileName, + documents[0].isPartiallyParsed()); + } finally { + inStream.close(); + } + + fileName = fileNames[1]; + inStream = new FileInputStream(new File(folder, fileName)); + location = new DigestURL("http://localhost/" + fileName); + try { + /* The bytes limit is set to let parsing the beginning of the text, but stop before reaching the URL */ + final long maxBytes = 65; + Document[] documents = parser.parseWithLimits(location, "application/gzip", StandardCharsets.UTF_8.name(), + new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes); + assertNotNull("Parser result must not be null for file " + fileName, documents); + assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); + assertTrue("Parsed text must contain test word with umlaut char" + fileName, + documents[0].getTextString().contains("Maßkrügen")); + Collection anchors = documents[0].getAnchors(); + assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty()); + assertTrue("Parsed document must be marked as partially parsed for file " + fileName, + documents[0].isPartiallyParsed()); + } finally { + inStream.close(); + } } } diff --git a/test/parsertest/umlaute_html_utf8.html.gz b/test/parsertest/umlaute_html_utf8.html.gz index b5de39ff755047d2c5d60afe11faadbbfbf3d938..37d814a146fbdaf32188b8f042adb04a73bc7fd8 100644 GIT binary patch literal 309 zcmV-50m}X#iwFn>Qju8z19fd|VRdw6Uubk~Y+rSBW;iZrbZu+^Es#NP!Y~Yl@B9ic zlXh#f!6u=#a zZ`KQ75$E5N1?PF055jBw`~syDrc

5t{SmC!38@Nv#eAMdM40BtR0KBmC{vU&d-{ z2n|JaJtKpAmodT%@i3Y|ErsilvR(NRO@kx9dsVi%&PHnk#na>XZJd}|qJfT35}-1$ zvWLofX*^AODf%uR<<=1#Rps>#<_aD7YbZo`G)Ycw@HwK1)5`e`9@XI?V*&V`b8Ty( zrR|9Ktrk7uJ`nDEI!xnmVzcE%v}cW=l{tY&xG@8o+BVqSf=amvUl`a^KBV&xLd#PH HWdQ&H@k5Rx literal 257 zcmV+c0sj6UiwFo|BtBUH19fd|VRdw6Uubk~Y+rSBW;iZrbZu+^Esw!!!!Qs;@BWId z3cbl{oCaKM=MXy(L)--7QhJn~Z6umVskAQ5-`B!94ZCl5hRGkQUDbywv2K`mva!5$5Adw5){#ucftw>Ee~of~~k4cJ_E%Mzw+F=`mR(nduc8@Pvv0 zwSkTO>fDHuG#iv0d>mEQ5u9{4at|ww4#qV!G91mbZ#Vgz(8PJ?LednGi>b*!nU!bc H5di=IhBSL; diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.tgz b/test/parsertest/umlaute_html_xml_txt_gnu.tgz new file mode 100644 index 0000000000000000000000000000000000000000..b10791612712befc870791412ef322a249b033dc GIT binary patch literal 868 zcmV-q1DpIGiwFo1Qju8z19fd|VRdw6Uubk~Y+rb7Y+rPEbYEv~buM&aascg@%}(1u z5XW=QQ%sH2Q|ynqyvX3re&?ZQHVKTC0>yYEqmI z&LjJklcmDr0HrANADlOD{Dm@*@y~{lmkD+xJ4c!5*A4k^VSC5`S8e;=_&c_96@LdK zFXL}Jrd>j+`-dY1qC**x)jgWS?Jsq$OlR}q! zaTQbT{<6v`DUO^^)MMRo;rB^Loj0td-awuozOo1_D?Z!)DU&aU8PhE7`!y zQls2XxParZhvO-{4H*Z#3>{BSs;r#wshwjM8*Y!|4(KXp{iY@zZV(bBz84LWRNNy7 zt+hV!hNFl<6viijF62G~i~S^ELH?jz;>lZ>`tFkH)A`pyet_Hh57T6^@}IB&idE$Q zY6XRl`nNF+DE&VH&qx1T`#XBQ?kwx{?j&$|r~bdsk^Z+6e)*?@FBEWF@4tL}(W1Xb zapQr1G%0f3WRc=8&DVdmRx9eiX=5(HcYqo-mHwXscNL=zr;Mkl>ZV2wQ=1;`Fg|W- z)0(`BhkBMKc+hX`wKrh@aKF3WfkH|_)#akBNn0CDbA&)dPRXQXO~E0}&Yb zlkzUC;N>`NU~i_$@px>EE4dN6yu3_Ii&&P9GxYQ<7T!dU(h zS&}*G;{47eb@4gRQg0x*m8Eb>$GiA+!Y|G-P33z8H{El|EuW{C@cFg=I{tfC{u~%xUD$bIWF@blz}Tmum-;H+mb}3= ziDC@>Y8ksgg0*zEK5tT&k+?aZju0A1h#mb=>;$D*F@#8H;Z+~4F)=wCeadReJ=&sl TiGj