diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index b3b8bdac2..29314a27c 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -116,7 +116,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { li(TagType.pair), script(TagType.pair), span(TagType.pair), - div(TagType.pair); + div(TagType.pair), + article(TagType.pair); public TagType type; private TagName(final TagType type) { @@ -177,6 +178,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final Map metas; private final Map hreflang, navigation; private LinkedHashSet titles; + private final List articles; //private String headline; private List[] headlines; private final ClusteredScoreMap bold, italic, underline; @@ -233,6 +235,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.navigation = new SizeLimitedMap(maxLinks); this.script = new SizeLimitedSet(maxLinks); this.titles = new LinkedHashSet(); + this.articles = new ArrayList(); this.headlines = (List[]) Array.newInstance(ArrayList.class, 6); for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList(); this.bold = new ClusteredScoreMap(false); @@ -596,6 +599,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else { this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" ")); } + } else if (tag.name.equalsIgnoreCase("article")) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); + if (h.length() > 0) this.articles.add(h); } // fire event @@ -754,6 +760,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public String getText() { + if (this.articles.size() > 0) { + StringBuilder sb = new StringBuilder(); + for (String al: this.articles) { + sb.append(al).append(' '); + } + if (sb.length() > this.articles.size()) return sb.toString().trim(); + } this.content.trim(); try { return this.content.toString();