fix html parser taking <style> content as text.

Noticed that some result descriptions contain CSS content from the style tag. Added <style> to the tag list so that its content is scraped as tag content rather than as text; test case included.
10 years ago · d2cc11ea8f
parent 5f706797cb
commit d2cc11ea8f
2 changed files with 26 additions and 1 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -115,7 +115,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        script(TagType.pair),
        span(TagType.pair),
        div(TagType.pair),
-        article(TagType.pair);
+        article(TagType.pair),
+        // tags used to capture tag content
+        // TODO: consider using </head> or <body> as the trigger to scrape for text content
+        style(TagType.pair); // embedded css (if not declared as tag content is parsed as text)

        public TagType type;
        private TagName(final TagType type) {
--- a/test/net/yacy/document/parser/htmlParserTest.java
+++ b/test/net/yacy/document/parser/htmlParserTest.java
@ -118,4 +118,26 @@ public class htmlParserTest extends TestCase {
        ImageEntry img = scraper.getImages().get(1);
        assertEquals(550,img.width());
    }
+
+    /**
+     * Test of parseToScraper method, of class htmlParser: verifies that the
+     * content of a {@code <style>} element is scraped as tag content, not as text.
+     */
+    @Test
+    public void testParseToScraper_TagTest() throws Exception {
+        final AnchorURL url = new AnchorURL("http://localhost/");
+        final String mimetype = "text/html";
+        final String textSource = "test text";
+        final String testhtml = "<html>"
+                + "<head><style type=\"text/css\"> h1 { color: #ffffff; }</style></head>"
+                + "<body>"
+                + "<p>" + textSource + "</p>"
+                + "</body></html>";
+
+        ContentScraper scraper = parseToScraper(url, mimetype, new VocabularyScraper(), 0, testhtml, 10);
+
+        String txt = scraper.getText();
+        System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
+        assertEquals(textSource, txt); // expected value first, so a failure reports expected/actual correctly
+    }
 }