package net.yacy.document.parser;

import static net.yacy.document.parser.htmlParser.parseToScraper;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;

import org.junit.Test;

import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;

public class htmlParserTest extends TestCase {

    @Test
    public void testGetRealCharsetEncoding() {
        String[][] testStrings = new String[][] {
            new String[]{null, null},
            new String[]{"windows1250", "windows-1250"},
            new String[]{"windows_1250", "windows-1250"},
            new String[]{"ISO-8859-1", StandardCharsets.ISO_8859_1.name()},
            new String[]{"ISO8859-1", StandardCharsets.ISO_8859_1.name()},
            new String[]{"ISO-88591", StandardCharsets.ISO_8859_1.name()},
            new String[]{"ISO88591", StandardCharsets.ISO_8859_1.name()},
            new String[]{"iso_8859_1", StandardCharsets.ISO_8859_1.name()},
            new String[]{"cp-1252", "windows-1252"},
            new String[]{"gb_2312", "gb2312"},    // was: x-EUC-CN
            new String[]{"gb_2312-80", "gb2312"}, // was: x-EUC-CN
            new String[]{"UTF-8;", StandardCharsets.UTF_8.name()}
        };

        for (int i = 0; i < testStrings.length; i++) {
            // desired conversion result
            String shouldBe = testStrings[i][1];
            shouldBe = shouldBe != null ? shouldBe.toLowerCase(Locale.ROOT) : null;

            // conversion result
            String charset = htmlParser.patchCharsetEncoding(testStrings[i][0]);

            // test if equal
            assertEquals(shouldBe, charset != null ? charset.toLowerCase(Locale.ROOT) : null);
            System.out.println("testGetRealCharsetEncoding: " + (testStrings[i][0] != null ? testStrings[i][0] : "null")
                    + " -> " + (charset != null ? charset : "null")
                    + " | Supported: " + (charset != null ? Charset.isSupported(charset) : false));
        }
    }

    /**
     * Test of parse method, of class htmlParser.
     * - test getCharset
     * @throws IOException
     */
    @Test
    public void testParse() throws Parser.Failure, InterruptedException, IOException {
        System.out.println("htmlParser.parse");

        String[] testFiles = {
            "umlaute_html_iso.html",
            "umlaute_html_utf8.html",
            "umlaute_html_namedentities.html"};
        final String mimetype = "text/html";
        //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";

        for (String testfile : testFiles) {
            final String filename = "test/parsertest/" + testfile;
            final File file = new File(filename);
            final AnchorURL url = new AnchorURL("http://localhost/" + filename);
            System.out.println("parse file: " + filename);

            htmlParser p = new htmlParser();
            FileInputStream inStream = null;
            try {
                inStream = new FileInputStream(file);
                final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, inStream);
                Document doc = docs[0];
                String txt = doc.getCharset();
                assertTrue("get Charset", txt != null);
                System.out.println("detected charset = " + txt);
            } finally {
                if (inStream != null) {
                    inStream.close();
                }
            }
        }
    }
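
    /**
     * Companion check to the two tests above (a minimal sketch, not part of the
     * original assertions) : the canonical names produced by patchCharsetEncoding()
     * for a few common aliases should be resolvable by the running JVM. The alias
     * selection and the method name are assumptions of this sketch; rarer encodings
     * such as gb2312 are deliberately left out because their availability depends
     * on the JRE distribution.
     */
    @Test
    public void testPatchedCharsetNamesAreSupported() {
        final String[] aliases = {"windows1250", "cp-1252", "ISO8859-1", "UTF-8;"};
        for (final String alias : aliases) {
            final String patched = htmlParser.patchCharsetEncoding(alias);
            assertNotNull("patchCharsetEncoding should return a name for " + alias, patched);
            assertTrue("Patched charset should be supported by the JVM : " + patched,
                    Charset.isSupported(patched));
        }
    }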

    /**
     * Test the htmlParser.parseWithLimits() method with test content within bounds.
     * @throws Exception when an unexpected error occurred
     */
    @Test
    public void testParseWithLimitsUnreached() throws Exception {
        System.out.println("htmlParser.parseWithLimits");

        String[] testFiles = {
            "umlaute_html_iso.html",
            "umlaute_html_utf8.html",
            "umlaute_html_namedentities.html"};
        final String mimetype = "text/html";
        //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";

        htmlParser parser = new htmlParser();
        for (final String testfile : testFiles) {
            final String fileName = "test" + File.separator + "parsertest" + File.separator + testfile;
            final File file = new File(fileName);
            final AnchorURL url = new AnchorURL("http://localhost/" + fileName);

            try (final FileInputStream inStream = new FileInputStream(file)) {
                final Document[] docs = parser.parseWithLimits(url, mimetype, null, new VocabularyScraper(), 0,
                        inStream, 1000, 10000);
                assertNotNull("Parser result must not be null for file " + fileName, docs);
                final Document doc = docs[0];
                final String parsedText = doc.getTextString();
                assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                        parsedText.contains("Maßkrügen"));
                assertEquals("Test anchor must have been parsed for file " + fileName, 1, doc.getAnchors().size());
                assertFalse("Parsed document should not be marked as partially parsed for file " + fileName,
                        doc.isPartiallyParsed());
            }
        }
    }

    /**
     * Test the htmlParser.parseWithLimits() method, with various maxLinks values
     * ranging from zero to the exact number of anchors contained in the test content.
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseWithLimitsOnAnchors() throws Exception {
        final AnchorURL url = new AnchorURL("http://localhost/test.html");
        final String mimetype = "text/html";
        final String charset = StandardCharsets.UTF_8.name();

        /* Test content holding exactly three anchors; the link URLs are arbitrary
         * placeholders, the assertions only count the scraped anchors. */
        final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body><p>");
        testHtml.append("<a href=\"http://localhost/doc1.html\">First link</a>");
        testHtml.append("<a href=\"http://localhost/doc2.html\">Second link</a>");
        testHtml.append("<a href=\"http://localhost/doc3.html\">Third link</a>");
        testHtml.append("</p></body></html>");

        final htmlParser parser = new htmlParser();
        for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
            try (InputStream sourceStream = new ByteArrayInputStream(
                    testHtml.toString().getBytes(StandardCharsets.UTF_8))) {
                final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
                        sourceStream, maxLinks, Long.MAX_VALUE);
                final Document doc = docs[0];
                assertEquals(maxLinks, doc.getAnchors().size());
                assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
                        maxLinks < 3, doc.isPartiallyParsed());
            }
        }
    }
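    /**
     * Additional sketch, not part of the original suite : exercise the last
     * parseWithLimits() parameter, assumed here to be the maximum number of content
     * bytes to process (the tests above pass 10000 or Long.MAX_VALUE for it). The
     * generated page is far larger than the 10000 byte budget; because the exact
     * truncation behaviour is an assumption, the sketch only checks that parsing
     * terminates and yields a document rather than asserting isPartiallyParsed().
     * @throws Exception when an unexpected error occurred
     */
    @Test
    public void testParseWithLimitsOnContentSize() throws Exception {
        final AnchorURL url = new AnchorURL("http://localhost/big.html");
        final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body><p>");
        for (int i = 0; i < 20000; i++) {
            // roughly 200 KB of plain text content
            testHtml.append("test word ");
        }
        testHtml.append("</p></body></html>");

        final htmlParser parser = new htmlParser();
        try (InputStream sourceStream = new ByteArrayInputStream(
                testHtml.toString().getBytes(StandardCharsets.UTF_8))) {
            final Document[] docs = parser.parseWithLimits(url, "text/html", StandardCharsets.UTF_8.name(),
                    new VocabularyScraper(), 0, sourceStream, Integer.MAX_VALUE, 10000);
            assertNotNull("Parser result must not be null when the content size limit is exceeded", docs);
            assertNotNull("A document should be produced even when the content size limit is exceeded", docs[0]);
        }
    }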
    /**
     * Test the htmlParser.parseWithLimits() method, with various maxLinks values
     * ranging from zero to the exact number of RSS feed links contained in the
     * test content.
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseWithLimitsOnRSSFeeds() throws Exception {
        final AnchorURL url = new AnchorURL("http://localhost/test.html");
        final String mimetype = "text/html";
        final String charset = StandardCharsets.UTF_8.name();

        /* Test content holding exactly three RSS feed links; the feed titles and
         * URLs are arbitrary placeholders, the assertions only count the links. */
        final StringBuilder testHtml = new StringBuilder("<html>");
        testHtml.append("<head>");
        testHtml.append(
                "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed 1\" href=\"http://localhost/feed1.xml\" />");
        testHtml.append(
                "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed 2\" href=\"http://localhost/feed2.xml\" />");
        testHtml.append(
                "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed 3\" href=\"http://localhost/feed3.xml\" />");
        testHtml.append("</head>");
        testHtml.append("<body>HTML test content</body></html>");

        final htmlParser parser = new htmlParser();
        for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
            try (InputStream sourceStream = new ByteArrayInputStream(
                    testHtml.toString().getBytes(StandardCharsets.UTF_8))) {
                final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
                        sourceStream, maxLinks, Long.MAX_VALUE);
                final Document doc = docs[0];
                assertEquals(maxLinks, doc.getRSS().size());
                assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
                        maxLinks < 3, doc.isPartiallyParsed());
            }
        }
    }

    /**
     * Test of parseToScraper method, of class htmlParser.
     */
    @Test
    public void testParseToScraper_4args() throws Exception {
        // test link with inline html in text
        // expectation to deliver pure text as it is possibly indexed in outboundlinks_anchortext_txt/inboundlinks_anchortext_txt
        final AnchorURL url = new AnchorURL("http://localhost/");
        final String charset = StandardCharsets.UTF_8.name();
        /* The anchor and image URLs below are arbitrary placeholders; the assertions
         * only check the scraped anchor texts and image attributes. */
        final String testhtml = "<html><body>"
                + "<a href=\"http://localhost/doc1.html\">testtext</a>" // "testtext"
                + " <a href=\"http://localhost/doc2.html\"><span>Start</span></a>" // "Start"
                + "<a href=\"http://localhost/doc3.html\"><img src=\"img1.png\"></a>" // "" + image
                + "<figure><img width=\"550\" src=\"img2.png\" alt=\"image\"></figure>" // + img width 550 (+html5 figure)
                + "</body></html>";
        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
        List<AnchorURL> anchorlist = scraper.getAnchors();

        String linktxt = anchorlist.get(0).getTextProperty();
        assertEquals("testtext", linktxt);

        linktxt = anchorlist.get(1).getTextProperty();
        assertEquals("Start", linktxt);

        linktxt = anchorlist.get(2).getTextProperty();
        assertEquals("", linktxt);

        int cnt = scraper.getImages().size();
        assertEquals(2, cnt);
        ImageEntry img = scraper.getImages().get(1);
        assertEquals(550, img.width());
    }
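
    /**
     * Additional sketch, not part of the original suite : the two trailing int
     * parameters of parseToScraper() are assumed here to act as upper bounds on the
     * number of processed anchors and links, as suggested by the Integer.MAX_VALUE
     * values used in the nested anchors test below. With five anchors in the content
     * and a bound of 2, no more than two anchors should be collected. The URLs are
     * arbitrary placeholders.
     * @throws IOException when an unexpected error occurred
     */
    @Test
    public void testParseToScraperAnchorLimit() throws IOException {
        final AnchorURL url = new AnchorURL("http://localhost/");
        final StringBuilder testHtml = new StringBuilder("<html><body><p>");
        for (int count = 0; count < 5; count++) {
            testHtml.append("<a href=\"http://localhost/doc" + count + ".html\">link " + count + "</a> ");
        }
        testHtml.append("</p></body></html>");
        final ContentScraper scraper = parseToScraper(url, StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0,
                testHtml.toString(), 2, 2);
        assertTrue("No more than two anchors should be collected when the limit is 2",
                scraper.getAnchors().size() <= 2);
    }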

    /**
     * Test parser resistance against the nested anchors pattern
     * (<a> tags embedding other <a> tags : invalid HTML, but occasionally encountered
     * in some real-world Internet resources.
     * See the case reported at http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005).
     * The parser must be able to terminate in a finite time.
     * @throws IOException when an unexpected error occurred
     */
    @Test
    public void testParseToScraperNestedAnchors() throws IOException {
        final AnchorURL url = new AnchorURL("http://localhost/");
        final String charset = StandardCharsets.UTF_8.name();
        /* Nested anchors around a single image; the URLs are arbitrary placeholders,
         * the assertions only count the scraped anchors and images. */
        final StringBuilder testHtml = new StringBuilder("<html><body><p>");
        /* With the prior recursive processing implementation and an average 2017 desktop computer,
         * computing time started to be problematic over a nesting depth of 21 */
        final int nestingDepth = 30;
        for (int count = 0; count < nestingDepth; count++) {
            testHtml.append("<a href=\"http://localhost/doc" + count + ".html\">");
        }
        testHtml.append("<img src=\"image.png\">");
        for (int count = 0; count < nestingDepth; count++) {
            testHtml.append("</a>");
        }
        testHtml.append("</p></body></html>");

        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(),
                Integer.MAX_VALUE, Integer.MAX_VALUE);
        assertEquals(nestingDepth, scraper.getAnchors().size());
        assertEquals(1, scraper.getImages().size());
    }

    /**
     * Test of parseToScraper method, of class htmlParser,
     * for scraping tag content from text (special test to verify that the content
     * of a <style> element is not included in the scraped text).
     * @throws IOException when an unexpected error occurred
     */
    @Test
    public void testParseToScraperStyleTag() throws IOException {
        final AnchorURL url = new AnchorURL("http://localhost/");
        final String charset = StandardCharsets.UTF_8.name();
        final String textSource = "test text";
        /* The style rule is an arbitrary placeholder; the assertion only requires
         * that it does not leak into the scraped text. */
        final String testhtml = "<html><head>"
                + "<style type=\"text/css\">h1 { color : blue; }</style>"
                + "</head><body>"
                + "<p>" + textSource + "</p>"
                + "</body></html>";
        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
        String txt = scraper.getText();
        System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
        assertEquals(textSource, txt);
    }

    /**
     * Test for parseToScraper of class htmlParser for scraping html with a
     * <script> element : the script content must not appear in the scraped text.
     * @throws IOException when an unexpected error occurred
     */
    @Test
    public void testParseToScraperScriptTag() throws IOException {
        final AnchorURL url = new AnchorURL("http://localhost/");
        final String charset = StandardCharsets.UTF_8.name();
        final String textSource = "test text";
        /* The script body is an arbitrary placeholder; the assertion only requires
         * that it does not leak into the scraped text. */
        final String testhtml = "<html><head>"
                + "<script>var a = 1 + 2;</script>\n"
                + "</head>\n"
                + "<body>\n"
                + "<p>" + textSource + "</p>\n"
                + "</body></html>";
        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
        String txt = scraper.getText();
        System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]");
        assertEquals(textSource, txt);
    }
}