From 73977ec0fe40cd92a741f38b95e78292b9fdf10b Mon Sep 17 00:00:00 2001 From: luccioman Date: Mon, 6 Nov 2017 09:14:03 +0100 Subject: [PATCH] Added a html parser charset detection unit test --- .../yacy/document/parser/htmlParserTest.java | 39 ++++++++++++++++++- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java index 8d0f1a4f9..e6b67bd35 100644 --- a/test/java/net/yacy/document/parser/htmlParserTest.java +++ b/test/java/net/yacy/document/parser/htmlParserTest.java @@ -98,11 +98,46 @@ public class htmlParserTest extends TestCase { inStream.close(); } } - - } } + /** + * Test the htmlParser.parse() method on content carrying no charset information, neither + * provided by an HTTP header nor by meta tags or attributes. The same HTML is parsed once per tested encoding (UTF-8, then ISO-8859-1); each run must yield three anchors, text containing the word Maßkrügen, and a document charset equal to the encoding used. + * + * @throws Exception + * when an unexpected error occurs + */ + @Test + public void testParseHtmlWithoutCharset() throws Exception { + final AnchorURL url = new AnchorURL("http://localhost/test.html"); + final String mimetype = "text/html"; + final StringBuilder testHtml = new StringBuilder("

"); + /* + * Include some non-ASCII characters: once encoded, they should let the charset + * detector identify the exact encoding + */ + testHtml.append("In München steht ein Hofbräuhaus.\n" + "Dort gibt es Bier aus Maßkrügen.
"); + testHtml.append("First link"); + testHtml.append("Second link"); + testHtml.append("Third link"); + testHtml.append("

"); + + final htmlParser parser = new htmlParser(); + + final Charset[] charsets = new Charset[] { StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1 }; + + for (final Charset charset : charsets) { + try (InputStream sourceStream = new ByteArrayInputStream(testHtml.toString().getBytes(charset));) { + final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream); + final Document doc = docs[0]; + assertEquals(3, doc.getAnchors().size()); + assertTrue(doc.getTextString().contains("Maßkrügen")); + assertEquals(charset.toString(), doc.getCharset()); + } + } + } + /** * Test the htmlParser.parseWithLimits() method with test content within bounds. * @throws Exception when an unexpected error occurred