Added an HTML parser charset detection unit test

7 years ago · 73977ec0fe
parent d14c47d4d3
commit 73977ec0fe
1 changed files with 37 additions and 2 deletions
--- a/test/java/net/yacy/document/parser/htmlParserTest.java
+++ b/test/java/net/yacy/document/parser/htmlParserTest.java
@ -98,11 +98,46 @@ public class htmlParserTest extends TestCase {
            		inStream.close();
            	}
            }
-
-
        }
    }
    
+	/**
+	 * Test the htmlParser.parse() method, with no charset information, neither
+	 * provided by HTTP header nor by meta tags or attributes. The same content is
+	 * parsed encoded as UTF-8, then as ISO-8859-1 : the document charset must match.
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseHtmlWithoutCharset() throws Exception {
+		final AnchorURL url = new AnchorURL("http://localhost/test.html");
+		final String mimetype = "text/html";
+		final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body><p>");
+		/*
+		 * Include some non ASCII characters : once encoded, they should allow the
+		 * charset detector to detect the exact encoding
+		 */
+		testHtml.append("In München steht ein Hofbräuhaus.\n" + "Dort gibt es Bier aus Maßkrügen.<br>");
+		testHtml.append("<a href=\"http://localhost/doc1.html\">First link</a>");
+		testHtml.append("<a href=\"http://localhost/doc2.html\">Second link</a>");
+		testHtml.append("<a href=\"http://localhost/doc3.html\">Third link</a>");
+		testHtml.append("</p></body></html>");
+
+		final htmlParser parser = new htmlParser();
+
+		final Charset[] charsets = new Charset[] { StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1 };
+
+		for (final Charset charset : charsets) {
+			try (InputStream sourceStream = new ByteArrayInputStream(testHtml.toString().getBytes(charset));) {
+				final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
+				final Document doc = docs[0];
+				assertEquals(3, doc.getAnchors().size()); // the three <a> links appended above
+				assertTrue(doc.getTextString().contains("Maßkrügen")); // non ASCII text must be decoded intact
+				assertEquals(charset.toString(), doc.getCharset());
+			}
+		}
+	}
+    
    /**
     * Test the htmlParser.parseWithLimits() method with test content within bounds.
     * @throws Exception when an unexpected error occurred