From c6ae87168a6eef116e96fb7907568a2f2435bea7 Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Tue, 22 Aug 2017 14:13:00 +0200
Subject: [PATCH] Added unit tests on the gzip parser.

---
 .../net/yacy/document/parser/gzipParser.java  |   3 +
 .../yacy/document/parser/gzipParserTest.java  | 218 +++++++++++++++++-
 test/parsertest/umlaute_html_utf8.html.gz     | Bin 257 -> 309 bytes
 test/parsertest/umlaute_html_xml_txt_gnu.tgz  | Bin 0 -> 868 bytes
 test/parsertest/umlaute_linux.txt.gz          | Bin 109 -> 165 bytes
 5 files changed, 209 insertions(+), 12 deletions(-)
 create mode 100644 test/parsertest/umlaute_html_xml_txt_gnu.tgz

diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java
index f1a43e0a2..938bfd58a 100644
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@@ -234,6 +234,9 @@ public class gzipParser extends AbstractParser implements Parser {
             Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
             if (docs != null) {
             	maindoc.addSubDocuments(docs);
+            	if(docs.length > 0 && docs[0].isPartiallyParsed()) {
+            		maindoc.setPartiallyParsed(true);
+            	}
             }
         } catch (final Exception e) {
             throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
diff --git a/test/java/net/yacy/document/parser/gzipParserTest.java b/test/java/net/yacy/document/parser/gzipParserTest.java
index 4cfa990a4..ffca1c195 100644
--- a/test/java/net/yacy/document/parser/gzipParserTest.java
+++ b/test/java/net/yacy/document/parser/gzipParserTest.java
@@ -22,14 +22,20 @@
 
 package net.yacy.document.parser;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertEquals;
 
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collection;
 
 import org.junit.Test;
 
+import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
@@ -45,32 +51,220 @@ public class gzipParserTest {
 
 	/**
 	 * Unit test for the gzipParser.parse() function with some small gz test files.
-	 * @throws Failure when a file could not be parsed
-	 * @throws InterruptedException when the test was interrupted before its termination
-	 * @throws IOException when a read/write error occurred
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
 	 */
 	@Test
 	public void testParse() throws Failure, InterruptedException, IOException {
-		final String[] fileNames = {
-				"umlaute_html_utf8.html.gz",
-				"umlaute_linux.txt.gz"
-		};
+		final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" };
 		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
 		gzipParser parser = new gzipParser();
-		
+
+		for (String fileName : fileNames) {
+			FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try {
+				Document[] documents = parser.parse(location, "application/gzip", StandardCharsets.UTF_8.name(),
+						new VocabularyScraper(), 0, inStream);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertNotNull("Detected URLS must not be null for file " + fileName, anchors);
+				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+				assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+
+	/**
+	 * Testing parse integration with the tar parser on a test tgz archive.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseTgz() throws Failure, InterruptedException, IOException {
+		final String fileName = "umlaute_html_xml_txt_gnu.tgz";
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+		gzipParser parser = new gzipParser();
+
+		FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+		DigestURL location = new DigestURL("http://localhost/" + fileName);
+		try {
+			Document[] documents = parser.parse(location, "application/gzip", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream);
+			
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+			final String parsedText = documents[0].getTextString();
+			assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+			assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+					parsedText.contains("Maßkrügen"));
+			assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+			assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+			assertTrue(parsedText.contains("URL reference in raw text file"));
+			assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+
+			final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+			assertNotNull(detectedAnchors);
+			assertEquals("Parsed URLs must contains all URLs from each test file included in the archive", 5,
+					detectedAnchors.size());
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+		} finally {
+			inStream.close();
+		}
+	}
+
+	/**
+	 * Unit test for the gzipParser.parseWithLimits() function with some small gz
+	 * test files whose content is within limits.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimits() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" };
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+		gzipParser parser = new gzipParser();
+
 		for (String fileName : fileNames) {
 			FileInputStream inStream = new FileInputStream(new File(folder, fileName));
 			DigestURL location = new DigestURL("http://localhost/" + fileName);
 			try {
-				Document[] documents = parser.parse(location, "application/gzip", null, new VocabularyScraper(), 0,
-						inStream);
+				Document[] documents = parser.parseWithLimits(location, "application/gzip",
+						StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000,
+						10000);
 				assertNotNull("Parser result must not be null for file " + fileName, documents);
 				assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
-				assertTrue("Parsed text must contain test word with umlaut char" + fileName, documents[0].getTextString().contains("Maßkrügen"));
+				assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
+				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+				assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+				assertFalse("Parse document must not be marked as partially parsed for file " + fileName,
+						documents[0].isPartiallyParsed());
 			} finally {
 				inStream.close();
 			}
 		}
+
+	}
+	
+	/**
+	 * Unit test for gzipParser.parseWithLimits() when the maxLinks limit is exceeded.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" };
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+		gzipParser parser = new gzipParser();
+
+		/* maxLinks limit exceeded */
+		for (String fileName : fileNames) {
+			FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try {
+				Document[] documents = parser.parseWithLimits(location, "application/gzip",
+						StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+				assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+						documents[0].isPartiallyParsed());
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+	
+	/**
+	 * Unit test for gzipParser.parseWithLimits() when the maxBytes limit is exceeded.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" };
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+		gzipParser parser = new gzipParser();
+
+		String fileName = fileNames[0];
+		FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+		DigestURL location = new DigestURL("http://localhost/" + fileName);
+		try {
+			/* The bytes limit is set so that the beginning text part is parsed, but parsing stops before reaching the <a> tag */
+			final long maxBytes = 258;
+			Document[] documents = parser.parseWithLimits(location, "application/gzip", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+			assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+			assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+					documents[0].getTextString().contains("Maßkrügen"));
+			Collection<AnchorURL> anchors = documents[0].getAnchors();
+			assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+			assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+					documents[0].isPartiallyParsed());
+		} finally {
+			inStream.close();
+		}
+
+		fileName = fileNames[1];
+		inStream = new FileInputStream(new File(folder, fileName));
+		location = new DigestURL("http://localhost/" + fileName);
+		try {
+			/* The bytes limit is set so that the beginning of the text is parsed, but parsing stops before reaching the URL */
+			final long maxBytes = 65;
+			Document[] documents = parser.parseWithLimits(location, "application/gzip", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+			assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+			assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+					documents[0].getTextString().contains("Maßkrügen"));
+			Collection<AnchorURL> anchors = documents[0].getAnchors();
+			assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+			assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+					documents[0].isPartiallyParsed());
+		} finally {
+			inStream.close();
+		}
 	}
 
 }
diff --git a/test/parsertest/umlaute_html_utf8.html.gz b/test/parsertest/umlaute_html_utf8.html.gz
index b5de39ff755047d2c5d60afe11faadbbfbf3d938..37d814a146fbdaf32188b8f042adb04a73bc7fd8 100644
GIT binary patch
literal 309
zcmV-50m}X#iwFn>Qju8z19fd|VRdw6Uubk~Y+rSBW;iZrbZu+^Es#NP!Y~Yl@B9ic
zlXh#f!6u=#<p5;@iEbQ<rkzmQ8WNEvEOjUN_e&@z$9eJdmzO@|UyJf*yM$F)Y+$>a
zZ`KQ75$E5N1?PF055jBw`~syDrc<P>5t{SmC!38@Nv#eAMdM40BtR0KBmC{vU&d-{
z2n|JaJtKpAmodT%@i3Y|ErsilvR(NRO@kx9dsVi%&PHnk#na>XZJd}|qJfT35}-1$
zvWLofX*^AODf%uR<<=1#Rps>#<_aD7YbZo`G)Ycw@HwK1)5`e`9@XI?V*&V`b8Ty(
zrR|9Ktrk7uJ`nDEI!xnmVzcE%v}cW=l{tY&xG@8o+BVqSf=amvUl`a^KBV&xLd#PH
HWdQ&H@k5Rx

literal 257
zcmV+c0sj6UiwFo|BtBUH19fd|VRdw6Uubk~Y+rSBW;iZrbZu+^Esw!!!!Qs;@BWId
z3cbl{oCaKM=MXy(L)--7QhJn~Z6umVskAQ5-`B!94ZCl5hRGkQUD<x_*RW}uE$rV{
z+qwkCh4`44LR4)vftUDs2Cb9EYtq)pvk>bywv2K`mva!5$5Adw5){#u<IkXf3)Wjh
zXo%D8iVW^k!3h6Igx3suC7nkq>cftw>Ee~of~~k4cJ_E%Mzw+F=`mR(nduc8@Pvv0
zwSkTO>fDHuG#iv0d>mEQ5u9{4at|ww4#qV!G91mbZ#Vgz(8PJ?LednGi>b*!nU!bc
H5di=IhBSL;

diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.tgz b/test/parsertest/umlaute_html_xml_txt_gnu.tgz
new file mode 100644
index 0000000000000000000000000000000000000000..b10791612712befc870791412ef322a249b033dc
GIT binary patch
literal 868
zcmV-q1DpIGiwFo1Qju8z19fd|VRdw6Uubk~Y+rb7Y+rPEbYEv~buM&aascg@%}(1u
z5XW=QQ%sH2Q|<LSb`pXUq!JKFq#y`tIU&d15DUjn*J~1<rkB1&k$T`kdg;9{z@f7a
zPN956RRaZee^Rtr@9yl_v;P@;AchesJuW6va~#>ynqyvX3re&?ZQHVKTC0>yYEqmI
z&LjJklcmDr0HrANADlOD{Dm@*@y~{lmkD+xJ4c!5*A4k^VSC5`S8e;=_&c_96@LdK
zFXL}Jrd<MaVIDlde;$ALReNWxd$_v}o88Vf>>j+`-dY1qC**x)jgWS?Jsq$OlR}q!
zaT<y+iM@!B^><pU?8=iO8|DSr44Lp?AjC*#AG7eZsrgAPSS)mLHewoF^_rSs6G7zC
z74QchPnl>QbT{<6v`DUO^^)MMRo;rB^Loj0td-awuozOo1_D?Z!)DU&aU8PhE7`!y
zQls2XxParZhvO-{4H*Z#3>{BSs;r#wshwjM8*Y!|4(KXp{iY@zZV(bBz84LWRNNy7
zt+hV!hNFl<6viijF62G~i~S^ELH?jz;>lZ>`tFkH)A`pyet_Hh57T6^@}IB&idE$Q
zY6XRl`nNF+DE&VH&qx1T`#XBQ?kwx{?j&$|r~bdsk^Z+6e)*?@FBEWF@4tL}(W1Xb
zapQr1G%0f3WRc=8&DVdmRx9eiX=5(HcYqo-mHwXscNL=zr;Mkl>ZV2wQ=1;`Fg|W-
z)0(`BhkBMKc+hX`wKrh@aKF3WfkH|_)<co_31>#akBNn0CDbA&)dPRXQXO~E0}&Yb
zlkzUC;N>`NU~i_$@px>EE4dN6yu3_Ii&&P9GxYQ<7T!dU(<QA81ycioKU+V_c$BXW
ze8M6&#B`GqYEXjP0^}RFS?cp}B=cA+x8w$XLxdt?t(yh?113JfQUU!iVlJ7@nk@>h
zS&}*G;{47eb@4gRQg0x*m8Eb>$GiA+!Y|G-P33z8H{El|EuW{C@cFg=I{t<JF?iX;
z5EHS0Gxa<3zW*#*y#L!Z6TbyaYE|u;(*INN<IC4Y99*M(u(u6ZTNy|3FgeP-F~~BX
uiw7ETZ_S+@y%$n46$*typ-?Ck3WY+UP$(1%g+ift_WT6kFvMyAC;$M99i}?~

literal 0
HcmV?d00001

diff --git a/test/parsertest/umlaute_linux.txt.gz b/test/parsertest/umlaute_linux.txt.gz
index f576a9b1a667dde487f58523b68a3a410259f748..a86b3c5c870da8c089dd7f8f516539933b2903a7 100644
GIT binary patch
literal 165
zcmV;W09yYaiwFpjPLWvv19fd|VRdw6Uu<b^b$Bjxcys_2%CQZ?Fc1aMK2!Vw5bje_
zfkYZ4YUG$pY+=smd{(jz4O<Y2!UM4ap?>fC{u~%xUD$bIWF@blz}Tmum-;H+mb}3=
ziDC@>Y8ksgg0*zEK5tT&k+?aZju0A1h#mb=>;$D*F@#8H;Z+~4F)=wCeadReJ=&sl
TiGj<!)mfV#)ULG@i~s-tiz-OS

literal 109
zcmV-z0FwV7iwFo|BtBUH19fd|VRdw6Uu<b^b$Bjxcys`Jzk9!Do`Ub;J$cC)sd);;
zC8-%D3aOcS3Lg1sNkxa3lx8HB7V9Xa<QJ7Fq-Q1p<%$)YGE<8bfJ%H556{moI=m-6
PHIEAbnn<=)NB{r;8#pPD