Added an underline text field to Solr to record all underlined texts

pull/1/head
Michael Peter Christen 13 years ago
parent be4c96f3b1
commit 411d0e839b

@ -282,6 +282,15 @@ italic_txt
## total number of occurrences of <i>, int
#italiccount_i
## all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order
underline_txt
## number of occurrences of texts in underline_txt
#underline_val
## total number of occurrences of <u>, int
#underlinecount_i
## flag that shows if a swf file is linked, boolean
#flash_b

@ -127,6 +127,8 @@ public enum YaCySchema implements Schema {
boldcount_i(SolrType.integer, true, true, false, "total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general, true, true, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
italiccount_i(SolrType.integer, true, true, false, "total number of occurrences of <i>"),
underline_txt(SolrType.text_general, true, true, true, "all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
underlinecount_i(SolrType.integer, true, true, false, "total number of occurrences of <u>"),
flash_b(SolrType.bool, true, true, false, "flag that shows if a swf file is linked"),
frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"),
framesscount_i(SolrType.integer, true, true, false, "number of frames_txt"),
@ -165,6 +167,7 @@ public enum YaCySchema implements Schema {
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
underline_val(SolrType.integer, true, true, true, "number of occurrences of texts in underline_txt"),
ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"),

@ -100,6 +100,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
title(TagType.pair),
b(TagType.pair),
strong(TagType.pair),
u(TagType.pair),
i(TagType.pair),
li(TagType.pair),
script(TagType.pair),
@ -130,7 +131,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private Collection<String> titles;
//private String headline;
private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic;
private final ClusteredScoreMap<String> bold, italic, underline;
private final List<String> li;
private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
@ -177,6 +178,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>();
this.italic = new ClusteredScoreMap<String>();
this.underline = new ClusteredScoreMap<String>();
this.li = new ArrayList<String>();
this.content = new CharBuffer(MAX_DOCSIZE, 1024);
this.htmlFilterEventListeners = new EventListenerList();
@ -494,6 +496,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.italic.inc(h);
} else if ((tagname.equalsIgnoreCase("u")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.underline.inc(h);
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.li.add(h);
@ -609,6 +614,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return counter;
}
/**
 * Collects every key of the underline score map into a newly allocated array.
 * The array order is whatever order {@code this.underline.keys(false)} yields
 * (presumably decreasing score, as the underline_txt schema description says;
 * confirm against ClusteredScoreMap.keys).
 *
 * @return the collected keys; an empty array if the iterator yields nothing
 */
public String[] getUnderline() {
    final List<String> texts = new ArrayList<String>();
    for (final Iterator<String> it = this.underline.keys(false); it.hasNext();) {
        texts.add(it.next());
    }
    final String[] result = new String[texts.size()];
    return texts.toArray(result);
}
/**
 * Looks up the underline score map value for each given text and returns
 * the values as decimal strings.
 *
 * @param a the texts to look up, typically the array returned by getUnderline()
 * @return an array of the same length as {@code a}; element i holds
 *         {@code this.underline.get(a[i])} formatted with Integer.toString
 */
public String[] getUnderlineCount(final String[] a) {
    final String[] counts = new String[a.length];
    int slot = 0;
    for (final String text : a) {
        // keep result positions aligned with the input positions
        counts[slot] = Integer.toString(this.underline.get(text));
        slot++;
    }
    return counts;
}
/**
 * Copies the collected list-item texts into a new array, keeping list order.
 *
 * @return an array holding all elements of {@code this.li}
 */
public String[] getLi() {
    final String[] items = new String[this.li.size()];
    return this.li.toArray(items);
}

@ -506,6 +506,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
add(doc, YaCySchema.italic_val, html.getItalicCount(italic));
}
}
final String[] underline = html.getUnderline();
add(doc, YaCySchema.underlinecount_i, underline.length);
if (underline.length > 0) {
add(doc, YaCySchema.underline_txt, underline);
if (allAttr || contains(YaCySchema.underline_val)) {
add(doc, YaCySchema.underline_val, html.getUnderlineCount(underline));
}
}
final String[] li = html.getLi();
add(doc, YaCySchema.licount_i, li.length);
if (li.length > 0) add(doc, YaCySchema.li_txt, li);

Loading…
Cancel
Save