From 73977ec0fe40cd92a741f38b95e78292b9fdf10b Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Mon, 6 Nov 2017 09:14:03 +0100
Subject: [PATCH] Added a html parser charset detection unit test

---
 .../yacy/document/parser/htmlParserTest.java  | 39 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)
diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java
index 8d0f1a4f9..e6b67bd35 100644
--- a/test/java/net/yacy/document/parser/htmlParserTest.java
+++ b/test/java/net/yacy/document/parser/htmlParserTest.java
@@ -98,11 +98,46 @@ public class htmlParserTest extends TestCase {
             		inStream.close();
             	}
             }
-
-
         }
     }
     
+	/**
+	 * Test the htmlParser.parse() method with no charset information, provided
+	 * neither by HTTP header nor by meta tags or attributes. The same HTML is
+	 * encoded in UTF-8 and in ISO-8859-1; each parse must report that charset.
+	 * 
+	 * @throws Exception when an unexpected error occurs
+	 */
+	@Test
+	public void testParseHtmlWithoutCharset() throws Exception {
+		final AnchorURL url = new AnchorURL("http://localhost/test.html");
+		final String mimetype = "text/html";
+		final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body><p>");
+		/*
+		 * Include some non-ASCII characters: their encoded bytes differ between the
+		 * tested charsets, which should let the charset detector identify the encoding
+		 */
+		testHtml.append("In München steht ein Hofbräuhaus.\n" + "Dort gibt es Bier aus Maßkrügen.<br>");
+		testHtml.append("<a href=\"http://localhost/doc1.html\">First link</a>");
+		testHtml.append("<a href=\"http://localhost/doc2.html\">Second link</a>");
+		testHtml.append("<a href=\"http://localhost/doc3.html\">Third link</a>");
+		testHtml.append("</p></body></html>");
+
+		final htmlParser parser = new htmlParser();
+
+		final Charset[] charsets = new Charset[] { StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1 };
+
+		for (final Charset charset : charsets) {
+			try (InputStream sourceStream = new ByteArrayInputStream(testHtml.toString().getBytes(charset));) {
+				final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
+				final Document doc = docs[0];
+				assertEquals(3, doc.getAnchors().size());
+				assertTrue(doc.getTextString().contains("Maßkrügen"));
+				assertEquals(charset.toString(), doc.getCharset()); // must match the charset used for encoding
+			}
+		}
+	}
+    
     /**
      * Test the htmlParser.parseWithLimits() method with test content within bounds.
      * @throws Exception when an unexpected error occurred