Added an HTML parser charset detection unit test

pull/144/head
luccioman 7 years ago
parent d14c47d4d3
commit 73977ec0fe

@ -98,11 +98,46 @@ public class htmlParserTest extends TestCase {
inStream.close();
}
}
}
}
/**
 * Exercises htmlParser.parse() on a document that carries no charset
 * information at all : none provided by a HTTP header, and none declared
 * in meta tags or attributes.
 *
 * @throws Exception
 *             when an unexpected error occurred
 */
@Test
public void testParseHtmlWithoutCharset() throws Exception {
    final AnchorURL location = new AnchorURL("http://localhost/test.html");
    final String contentType = "text/html";
    /*
     * The paragraph holds some non ASCII characters : once encoded, they
     * should let the charset detector recognize the exact encoding.
     */
    final String html = "<!DOCTYPE html><html><body><p>"
            + "In München steht ein Hofbräuhaus.\n"
            + "Dort gibt es Bier aus Maßkrügen.<br>"
            + "<a href=\"http://localhost/doc1.html\">First link</a>"
            + "<a href=\"http://localhost/doc2.html\">Second link</a>"
            + "<a href=\"http://localhost/doc3.html\">Third link</a>"
            + "</p></body></html>";
    final htmlParser htmlParserUnderTest = new htmlParser();
    final Charset[] encodings = { StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1 };
    /* The same markup is parsed once per encoding */
    for (int index = 0; index < encodings.length; index++) {
        final Charset encoding = encodings[index];
        final byte[] encodedHtml = html.getBytes(encoding);
        try (InputStream encodedStream = new ByteArrayInputStream(encodedHtml)) {
            final Document[] parsedDocs = htmlParserUnderTest.parse(location, contentType, null,
                    new VocabularyScraper(), 0, encodedStream);
            final Document firstDoc = parsedDocs[0];
            /* The three links of the paragraph must be reported as anchors */
            assertEquals(3, firstDoc.getAnchors().size());
            /* Non ASCII characters must be decoded unaltered */
            assertTrue(firstDoc.getTextString().contains("Maßkrügen"));
            /* The charset reported on the document must be the one used to encode the stream */
            assertEquals(encoding.toString(), firstDoc.getCharset());
        }
    }
}
/**
* Test the htmlParser.parseWithLimits() method with test content within bounds.
* @throws Exception when an unexpected error occurred

Loading…
Cancel
Save