fix html parser taking <style> content as text.

Noticed that some result descriptions contain CSS content from the style tag.
Added <style> to the tag list so that its content is scraped as tag content rather than as text
+ test case included
pull/14/head
reger 9 years ago
parent 5f706797cb
commit d2cc11ea8f

@ -115,7 +115,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
script(TagType.pair),
span(TagType.pair),
div(TagType.pair),
article(TagType.pair);
article(TagType.pair),
// tags used to capture tag content
// TODO: consider using </head> or <body> as the trigger to scrape for text content
style(TagType.pair); // embedded css (if not declared as tag content is parsed as text)
public TagType type;
private TagName(final TagType type) {

@ -118,4 +118,26 @@ public class htmlParserTest extends TestCase {
ImageEntry img = scraper.getImages().get(1);
assertEquals(550,img.width());
}
/**
 * Test of parseToScraper method, of class htmlParser,
 * for scraping tag content from text. Special test to verify that the
 * content of an embedded {@code <style>} element is not counted as text.
 *
 * @throws Exception if constructing the URL or parsing the test document fails
 */
@Test
public void testParseToScraper_TagTest() throws Exception {
    final AnchorURL url = new AnchorURL("http://localhost/");
    final String mimetype = "text/html";
    final String textSource = "test text";
    // Minimal document: the only visible text is textSource; the <style>
    // element in <head> carries CSS that must not show up in the scraped text.
    final String testhtml = "<html>"
            + "<head><style type=\"text/css\"> h1 { color: #ffffff; }</style></head>"
            + "<body>"
            + "<p>" + textSource + "</p>"
            + "</body></html>";
    ContentScraper scraper = parseToScraper(url, mimetype, new VocabularyScraper(), 0, testhtml, 10);
    String txt = scraper.getText();
    System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
    // JUnit's signature is assertEquals(expected, actual); passing the expected
    // value first keeps the failure message ("expected:<..> but was:<..>") truthful.
    assertEquals(textSource, txt);
}
}

Loading…
Cancel
Save