diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index 3c77c5c05..a347f6107 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -314,6 +314,24 @@ images_width_val
## number of <li> tags, int
#licount_i
+## all texts in <dt> tags
+#dt_txt
+
+## number of <dt> tags, int
+#dtcount_i
+
+## all texts in <dd> tags
+#dd_txt
+
+## number of <dd> tags, int
+#ddcount_i
+
+## all texts in <article> tags
+#article_txt
+
+## number of <article> tags, int
+#articlecount_i
+
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order
bold_txt
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 29314a27c..5fa26e440 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -4,10 +4,6 @@
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@@ -114,6 +110,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
u(TagType.pair),
i(TagType.pair),
li(TagType.pair),
+ dt(TagType.pair),
+ dd(TagType.pair),
script(TagType.pair),
span(TagType.pair),
div(TagType.pair),
@@ -182,7 +180,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
//private String headline;
private List[] headlines;
private final ClusteredScoreMap bold, italic, underline;
- private final List<String> li;
+ private final List<String> li, dt, dd; // cleaned texts collected from <li>, <dt> and <dd> elements
private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private double lon, lat;
@@ -242,6 +240,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.italic = new ClusteredScoreMap(false);
this.underline = new ClusteredScoreMap(false);
this.li = new ArrayList();
+ this.dt = new ArrayList();
+ this.dd = new ArrayList();
this.content = new CharBuffer(MAX_DOCSIZE, 1024);
this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0d;
@@ -591,6 +591,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.li.add(h);
+ } else if ((tag.name.equalsIgnoreCase("dt")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
+ if (h.length() > 0) this.dt.add(h);
+ } else if ((tag.name.equalsIgnoreCase("dd")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
+ if (h.length() > 0) this.dd.add(h);
} else if (tag.name.equalsIgnoreCase("script")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) {
@@ -734,6 +740,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.li.toArray(new String[this.li.size()]);
}
+ public String[] getDt() { // returns a new array holding the cleaned <dt> texts collected so far
+ return this.dt.toArray(new String[this.dt.size()]);
+ }
+
+ public String[] getDd() { // returns a new array holding the cleaned <dd> texts collected so far
+ return this.dd.toArray(new String[this.dd.size()]);
+ }
+
public DigestURL[] getFlash() {
String ext;
ArrayList f = new ArrayList();
@@ -760,22 +774,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public String getText() {
- if (this.articles.size() > 0) {
- StringBuilder sb = new StringBuilder();
- for (String al: this.articles) {
- sb.append(al).append(' ');
- }
- if (sb.length() > this.articles.size()) return sb.toString().trim();
- }
- this.content.trim();
try {
- return this.content.toString();
+ return this.content.trim().toString(); // always the trimmed content buffer; article texts are no longer substituted here (see getArticles()). NOTE(review): relies on trim() returning the buffer for chaining - confirm
} catch (final OutOfMemoryError e) {
ConcurrentLog.logException(e);
return "";
}
}
+ public List<String> getArticles() { // returns the internal article list itself (no copy is made)
+ return this.articles;
+ }
+
public List getAnchors() {
// returns a url (String) / name (String) relation
return this.anchors;
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index e82b61112..c3a8462a0 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -647,6 +647,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final String[] li = html.getLi();
add(doc, CollectionSchema.licount_i, li.length);
if (li.length > 0) add(doc, CollectionSchema.li_txt, li);
+
+ final String[] dt = html.getDt();
+ add(doc, CollectionSchema.dtcount_i, dt.length);
+ if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt); // index the <dt> texts themselves, not the <li> array
+
+ final String[] dd = html.getDd(); // <dd> texts come from their own accessor, so ddcount_i counts <dd> and not <li>
+ add(doc, CollectionSchema.ddcount_i, dd.length);
+ if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd);
+
+ final List<String> articles = html.getArticles(); // texts of the <article> elements found by the scraper
+ add(doc, CollectionSchema.articlecount_i, articles.size());
+ if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles);
// images
final ArrayList imgprots = new ArrayList(images.size());
diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java
index ab73f9ea0..c9cf43a01 100644
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@@ -151,6 +151,12 @@ public enum CollectionSchema implements SchemaDeclaration {
refresh_s(SolrType.string, true, true, false, false, false, "link from the url property inside the refresh link element"),
li_txt(SolrType.text_general, true, true, true, false, true, "all texts in <li> tags"),
licount_i(SolrType.num_integer, true, true, false, false, false, "number of <li> tags"),
+ dt_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dt> tags"),
+ dtcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dt> tags"),
+ dd_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dd> tags"),
+ ddcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dd> tags"),
+ article_txt(SolrType.text_general, true, true, true, false, true, "all texts in <article> tags"),
+ articlecount_i(SolrType.num_integer, true, true, false, false, false, "number of <article> tags"),
bold_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
boldcount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),