<experimental> added parsing of <article> html element.

Whenever at least one such element occurs, the combined text of all <article>
elements replaces the parsed <content> part of the document.
pull/1/head
Michael Peter Christen 10 years ago
parent 1df6492019
commit 4d00175157

@ -116,7 +116,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
li(TagType.pair), li(TagType.pair),
script(TagType.pair), script(TagType.pair),
span(TagType.pair), span(TagType.pair),
div(TagType.pair); div(TagType.pair),
article(TagType.pair);
public TagType type; public TagType type;
private TagName(final TagType type) { private TagName(final TagType type) {
@ -177,6 +178,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map<String, String> metas; private final Map<String, String> metas;
private final Map<String, DigestURL> hreflang, navigation; private final Map<String, DigestURL> hreflang, navigation;
private LinkedHashSet<String> titles; private LinkedHashSet<String> titles;
private final List<String> articles;
//private String headline; //private String headline;
private List<String>[] headlines; private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic, underline; private final ClusteredScoreMap<String> bold, italic, underline;
@ -233,6 +235,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.navigation = new SizeLimitedMap<String, DigestURL>(maxLinks); this.navigation = new SizeLimitedMap<String, DigestURL>(maxLinks);
this.script = new SizeLimitedSet<AnchorURL>(maxLinks); this.script = new SizeLimitedSet<AnchorURL>(maxLinks);
this.titles = new LinkedHashSet<String>(); this.titles = new LinkedHashSet<String>();
this.articles = new ArrayList<String>();
this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6); this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6);
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>(); for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>(false); this.bold = new ClusteredScoreMap<String>(false);
@ -596,6 +599,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else { } else {
this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" ")); this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" "));
} }
} else if (tag.name.equalsIgnoreCase("article")) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.articles.add(h);
} }
// fire event // fire event
@ -754,6 +760,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
public String getText() { public String getText() {
if (this.articles.size() > 0) {
StringBuilder sb = new StringBuilder();
for (String al: this.articles) {
sb.append(al).append(' ');
}
if (sb.length() > this.articles.size()) return sb.toString().trim();
}
this.content.trim(); this.content.trim();
try { try {
return this.content.toString(); return this.content.toString();

Loading…
Cancel
Save