diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 3c77c5c05..a347f6107 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -314,6 +314,24 @@ images_width_val ## number of
<li> tags, int #licount_i +## all texts in <dt>
tags +#dt_txt + +## number of <dt>
tags, int +#dtcount_i + +## all texts in <dd>
tags +#dd_txt + +## number of <dd>
tags, int +#ddcount_i + +## all texts in <article>
tags +#article_txt + +## number of <article>
    tags, int +#articlecount_i + ## all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order bold_txt diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 29314a27c..5fa26e440 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -4,10 +4,6 @@ // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -114,6 +110,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { u(TagType.pair), i(TagType.pair), li(TagType.pair), + dt(TagType.pair), + dd(TagType.pair), script(TagType.pair), span(TagType.pair), div(TagType.pair), @@ -182,7 +180,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { //private String headline; private List[] headlines; private final ClusteredScoreMap bold, italic, underline; - private final List li; + private final List li, dt, dd; private final CharBuffer content; private final EventListenerList htmlFilterEventListeners; private double lon, lat; @@ -242,6 +240,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.italic = new ClusteredScoreMap(false); this.underline = new ClusteredScoreMap(false); this.li = new ArrayList(); + this.dt = new ArrayList(); + this.dd = new ArrayList(); this.content = new CharBuffer(MAX_DOCSIZE, 1024); this.htmlFilterEventListeners = new EventListenerList(); this.lon = 0.0d; @@ -591,6 +591,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 
1024)) { h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.li.add(h); + } else if ((tag.name.equalsIgnoreCase("dt")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); + if (h.length() > 0) this.dt.add(h); + } else if ((tag.name.equalsIgnoreCase("dd")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); + if (h.length() > 0) this.dd.add(h); } else if (tag.name.equalsIgnoreCase("script")) { final String src = tag.opts.getProperty("src", EMPTY_STRING); if (src.length() > 0) { @@ -734,6 +740,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { return this.li.toArray(new String[this.li.size()]); } + public String[] getDt() { + return this.dt.toArray(new String[this.dt.size()]); + } + + public String[] getDd() { + return this.dd.toArray(new String[this.dd.size()]); + } + public DigestURL[] getFlash() { String ext; ArrayList f = new ArrayList(); @@ -760,22 +774,18 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public String getText() { - if (this.articles.size() > 0) { - StringBuilder sb = new StringBuilder(); - for (String al: this.articles) { - sb.append(al).append(' '); - } - if (sb.length() > this.articles.size()) return sb.toString().trim(); - } - this.content.trim(); try { - return this.content.toString(); + return this.content.trim().toString(); } catch (final OutOfMemoryError e) { ConcurrentLog.logException(e); return ""; } } + public List getArticles() { + return this.articles; + } + public List getAnchors() { // returns a url (String) / name (String) relation return this.anchors; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index e82b61112..c3a8462a0 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ 
b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -647,6 +647,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final String[] li = html.getLi(); add(doc, CollectionSchema.licount_i, li.length); if (li.length > 0) add(doc, CollectionSchema.li_txt, li); + + final String[] dt = html.getDt(); + add(doc, CollectionSchema.dtcount_i, dt.length); + if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt); + + final String[] dd = html.getDd(); + add(doc, CollectionSchema.ddcount_i, dd.length); + if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd); + + final List articles = html.getArticles(); + add(doc, CollectionSchema.articlecount_i, articles.size()); + if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles); // images final ArrayList imgprots = new ArrayList(images.size()); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index ab73f9ea0..c9cf43a01 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -151,6 +151,12 @@ public enum CollectionSchema implements SchemaDeclaration { refresh_s(SolrType.string, true, true, false, false, false, "link from the url property inside the refresh link element"), li_txt(SolrType.text_general, true, true, true, false, true, "all texts in
<li> tags"), licount_i(SolrType.num_integer, true, true, false, false, false, "number of <li>
tags"), + dt_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dt>
tags"), + dtcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dt>
tags"), + dd_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dd>
tags"), + ddcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dd>
tags"), + article_txt(SolrType.text_general, true, true, true, false, true, "all texts in <article>
tags"), + articlecount_i(SolrType.num_integer, true, true, false, false, false, "number of <article>
tags"), bold_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"), boldcount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <b> or <strong>"), italic_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),