diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 17f9362c7..77b59ad79 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -115,7 +115,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { script(TagType.pair), span(TagType.pair), div(TagType.pair), - article(TagType.pair); + article(TagType.pair), + // tags used to capture tag content + // TODO: consider to use or as trigger to scrape for text content + style(TagType.pair); // embedded css (if not declared as tag content is parsed as text) public TagType type; private TagName(final TagType type) { diff --git a/test/net/yacy/document/parser/htmlParserTest.java b/test/net/yacy/document/parser/htmlParserTest.java index 1ab31f9bf..2d7ad5b73 100644 --- a/test/net/yacy/document/parser/htmlParserTest.java +++ b/test/net/yacy/document/parser/htmlParserTest.java @@ -118,4 +118,26 @@ public class htmlParserTest extends TestCase { ImageEntry img = scraper.getImages().get(1); assertEquals(550,img.width()); } + + /** + * Test of parseToScraper method, of class htmlParser + * for scraping tag content from text (special test to verify " + + "" + + "

" + textSource + "

" + + ""; + + ContentScraper scraper = parseToScraper(url, mimetype, new VocabularyScraper(), 0, testhtml, 10); + + String txt = scraper.getText(); + System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]"); + assertEquals(txt, textSource); + } }