Added an HTML parser charset detection unit test

luccioman 7 years ago
parent d14c47d4d3
commit 73977ec0fe

@ -98,11 +98,46 @@ public class htmlParserTest extends TestCase {
* Test the htmlParser.parse() method, with no charset information, neither
* provided by HTTP header nor by meta tags or attributes.
* @throws Exception
* when an unexpected error occurred
public void testParseHtmlWithoutCharset() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body><p>");
* Include some non ASCII characters : once encoded they should make the charset
* detector to detect the exact encoding
testHtml.append("In München steht ein Hofbräuhaus.\n" + "Dort gibt es Bier aus Maßkrügen.<br>");
testHtml.append("<a href=\"http://localhost/doc1.html\">First link</a>");
testHtml.append("<a href=\"http://localhost/doc2.html\">Second link</a>");
testHtml.append("<a href=\"http://localhost/doc3.html\">Third link</a>");
final htmlParser parser = new htmlParser();
final Charset[] charsets = new Charset[] { StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1 };
for (final Charset charset : charsets) {
try (InputStream sourceStream = new ByteArrayInputStream(testHtml.toString().getBytes(charset));) {
final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
final Document doc = docs[0];
assertEquals(3, doc.getAnchors().size());
assertEquals(charset.toString(), doc.getCharset());
* Test the htmlParser.parseWithLimits() method with test content within bounds.
* @throws Exception when an unexpected error occurred
