Added parsing of dd, dt and article HTML elements. The parsed result is written to special Solr fields which are deactivated by default.
pull/1/head
Michael Peter Christen 10 years ago
parent 1395f10e95
commit 4cb4f67f38

@ -314,6 +314,24 @@ images_width_val
## number of <li> tags, int ## number of <li> tags, int
#licount_i #licount_i
## all texts in <dt> tags
#dt_txt
## number of <dt> tags, int
#dtcount_i
## all texts in <dd> tags
#dd_txt
## number of <dd> tags, int
#ddcount_i
## all texts in <article> tags
#article_txt
## number of <article> tags, int
#articlecount_i
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order ## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order
bold_txt bold_txt

@ -4,10 +4,6 @@
// first published on http://www.anomic.de // first published on http://www.anomic.de
// Frankfurt, Germany, 2004 // Frankfurt, Germany, 2004
// //
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or // the Free Software Foundation; either version 2 of the License, or
@ -114,6 +110,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
u(TagType.pair), u(TagType.pair),
i(TagType.pair), i(TagType.pair),
li(TagType.pair), li(TagType.pair),
dt(TagType.pair),
dd(TagType.pair),
script(TagType.pair), script(TagType.pair),
span(TagType.pair), span(TagType.pair),
div(TagType.pair), div(TagType.pair),
@ -182,7 +180,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
//private String headline; //private String headline;
private List<String>[] headlines; private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic, underline; private final ClusteredScoreMap<String> bold, italic, underline;
private final List<String> li; private final List<String> li, dt, dd;
private final CharBuffer content; private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners; private final EventListenerList htmlFilterEventListeners;
private double lon, lat; private double lon, lat;
@ -242,6 +240,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.italic = new ClusteredScoreMap<String>(false); this.italic = new ClusteredScoreMap<String>(false);
this.underline = new ClusteredScoreMap<String>(false); this.underline = new ClusteredScoreMap<String>(false);
this.li = new ArrayList<String>(); this.li = new ArrayList<String>();
this.dt = new ArrayList<String>();
this.dd = new ArrayList<String>();
this.content = new CharBuffer(MAX_DOCSIZE, 1024); this.content = new CharBuffer(MAX_DOCSIZE, 1024);
this.htmlFilterEventListeners = new EventListenerList(); this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0d; this.lon = 0.0d;
@ -591,6 +591,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) { } else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.li.add(h); if (h.length() > 0) this.li.add(h);
} else if ((tag.name.equalsIgnoreCase("dt")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.dt.add(h);
} else if ((tag.name.equalsIgnoreCase("dd")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.dd.add(h);
} else if (tag.name.equalsIgnoreCase("script")) { } else if (tag.name.equalsIgnoreCase("script")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING); final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) { if (src.length() > 0) {
@ -734,6 +740,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.li.toArray(new String[this.li.size()]); return this.li.toArray(new String[this.li.size()]);
} }
/**
 * Returns the texts collected in the {@code dt} list as an array.
 *
 * @return a newly allocated array holding every entry of the {@code dt} list, in list order;
 *         empty when the list is empty
 */
public String[] getDt() {
    final String[] terms = new String[this.dt.size()];
    return this.dt.toArray(terms);
}
/**
 * Returns the texts collected in the {@code dd} list as an array.
 *
 * @return a newly allocated array holding every entry of the {@code dd} list, in list order;
 *         empty when the list is empty
 */
public String[] getDd() {
    final String[] descriptions = new String[this.dd.size()];
    return this.dd.toArray(descriptions);
}
public DigestURL[] getFlash() { public DigestURL[] getFlash() {
String ext; String ext;
ArrayList<DigestURL> f = new ArrayList<DigestURL>(); ArrayList<DigestURL> f = new ArrayList<DigestURL>();
@ -760,22 +774,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
public String getText() { public String getText() {
if (this.articles.size() > 0) {
StringBuilder sb = new StringBuilder();
for (String al: this.articles) {
sb.append(al).append(' ');
}
if (sb.length() > this.articles.size()) return sb.toString().trim();
}
this.content.trim();
try { try {
return this.content.toString(); return this.content.trim().toString();
} catch (final OutOfMemoryError e) { } catch (final OutOfMemoryError e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
return ""; return "";
} }
} }
/**
 * Returns the article texts held by this scraper.
 * The {@code articles} field itself is returned, not a copy, so changes made
 * through the returned reference affect the scraper's own list.
 *
 * @return the {@code articles} list of this scraper
 */
public List<String> getArticles() {
return this.articles;
}
public List<AnchorURL> getAnchors() { public List<AnchorURL> getAnchors() {
// returns a url (String) / name (String) relation // returns a url (String) / name (String) relation
return this.anchors; return this.anchors;

@ -647,6 +647,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final String[] li = html.getLi(); final String[] li = html.getLi();
add(doc, CollectionSchema.licount_i, li.length); add(doc, CollectionSchema.licount_i, li.length);
if (li.length > 0) add(doc, CollectionSchema.li_txt, li); if (li.length > 0) add(doc, CollectionSchema.li_txt, li);
// <dt> texts: always store the count, store the texts only when there are any.
// The text field must receive the dt array itself (previously the li array was written here).
final String[] dt = html.getDt();
add(doc, CollectionSchema.dtcount_i, dt.length);
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt);
// <dd> texts: read from getDd() (previously getLi() was called, so both the count
// and the text field reflected <li> content instead of <dd> content).
final String[] dd = html.getDd();
add(doc, CollectionSchema.ddcount_i, dd.length);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd);
final List<String> articles = html.getArticles();
add(doc, CollectionSchema.articlecount_i, articles.size());
if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles);
// images // images
final ArrayList<String> imgprots = new ArrayList<String>(images.size()); final ArrayList<String> imgprots = new ArrayList<String>(images.size());

@ -151,6 +151,12 @@ public enum CollectionSchema implements SchemaDeclaration {
refresh_s(SolrType.string, true, true, false, false, false, "link from the url property inside the refresh link element"), refresh_s(SolrType.string, true, true, false, false, false, "link from the url property inside the refresh link element"),
li_txt(SolrType.text_general, true, true, true, false, true, "all texts in <li> tags"), li_txt(SolrType.text_general, true, true, true, false, true, "all texts in <li> tags"),
licount_i(SolrType.num_integer, true, true, false, false, false, "number of <li> tags"), licount_i(SolrType.num_integer, true, true, false, false, false, "number of <li> tags"),
dt_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dt> tags"),
dtcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dt> tags"),
dd_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dd> tags"),
ddcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dd> tags"),
article_txt(SolrType.text_general, true, true, true, false, true, "all texts in <article> tags"),
articlecount_i(SolrType.num_integer, true, true, false, false, false, "number of <article> tags"),
bold_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"), bold_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
boldcount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <b> or <strong>"), boldcount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"), italic_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),

Loading…
Cancel
Save