diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 51b50334f..b16188bb5 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -123,8 +123,8 @@ coordinate_p ## content of author-tag, texgen author -## content of description-tag, text -description +## content of description-tag(s), text +description_txt ## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b #description_exact_signature_l diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 2e8ab5887..f81d76a9e 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -324,7 +324,7 @@ public class Load_RSS_p { if (author == null || author.isEmpty()) author = channel == null ? "" : channel.getCopyright(); Date pubDate = channel == null ? null : channel.getPubDate(); prop.putHTML("showitems_author", author == null ? "" : author); - prop.putHTML("showitems_description", channel == null ? "" : channel.getDescription()); + prop.putHTML("showitems_description", channel == null ? "" : channel.getDescriptions().toString()); prop.putHTML("showitems_language", channel == null ? "" : channel.getLanguage()); prop.putHTML("showitems_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL()); @@ -355,7 +355,7 @@ public class Load_RSS_p { prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true)); - prop.putHTML("showitems_item_" + i + "_description", item.getDescription()); + prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString()); prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); i++; diff --git a/htroot/api/feed.java b/htroot/api/feed.java index 43441e46f..bbd473ab2 100644 --- a/htroot/api/feed.java +++ b/htroot/api/feed.java @@ -60,9 +60,10 @@ public class feed { if (feed == null || feed.isEmpty()) continue channelIteration; RSSMessage message = feed.getChannel(); + String description = message.getDescriptions().size() > 0 ? message.getDescriptions().get(0) : ""; if (message != null) { prop.putXML("channel_title", message.getTitle()); - prop.putXML("channel_description", message.getDescription()); + prop.putXML("channel_description", description); prop.put("channel_pubDate", message.getPubDate()); } while (messageMaxCount > 0 && !feed.isEmpty()) { @@ -71,7 +72,7 @@ public class feed { // create RSS entry prop.putXML("item_" + messageCount + "_title", channelName + ": " + message.getTitle()); - prop.putXML("item_" + messageCount + "_description", message.getDescription()); + prop.putXML("item_" + messageCount + "_description", description); prop.putXML("item_" + messageCount + "_link", message.getLink()); prop.put("item_" + messageCount + "_pubDate", message.getPubDate()); prop.putXML("item_" + messageCount + "_guid", message.getGuid()); diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index 8a1e79ecc..e7a2c597a 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -121,7 +121,7 @@ public class getpageinfo { } prop.put("tags", count); // put description - prop.putXML("desc", removelinebreaks(scraper.dc_description())); + prop.putXML("desc", removelinebreaks(scraper.dc_description().length > 0 ? scraper.dc_description()[0] : "")); // put language final Set languages = scraper.getContentLanguages(); prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next()); diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index 73c2a4443..73ca9cbf2 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -121,7 +121,7 @@ public class getpageinfo_p { } prop.put("tags", count); // put description - prop.putXML("desc", scraper.dc_description()); + prop.putXML("desc", scraper.dc_description().length > 0 ? scraper.dc_description()[0] : ""); // put language final Set languages = scraper.getContentLanguages(); prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next()); diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index a3b1fb45c..7f3d3c6c4 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -155,14 +155,14 @@ public class searchresult { CollectionSchema.id.getSolrFieldName() + ',' + CollectionSchema.sku.getSolrFieldName() + ',' + CollectionSchema.title.getSolrFieldName() + ',' + - CollectionSchema.description.getSolrFieldName() + ',' + + CollectionSchema.description_txt.getSolrFieldName() + ',' + CollectionSchema.load_date_dt.getSolrFieldName() + ',' + CollectionSchema.last_modified.getSolrFieldName() + ',' + CollectionSchema.size_i.getSolrFieldName()); post.put("hl", "true"); post.put("hl.q", originalQuery); post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()); - post.put("hl.alternateField", CollectionSchema.description.getSolrFieldName()); + post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName()); post.put("hl.simple.pre", ""); post.put("hl.simple.post", ""); post.put("hl.fragsize", Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH)); diff --git a/htroot/solr/select.java b/htroot/solr/select.java index 735a99a07..d9a3ae61d 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -202,7 +202,7 @@ public class select { // add options for snippet generation if (!post.containsKey("hl.q")) post.put("hl.q", q); if (!post.containsKey("hl.fl")) post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()); - if (!post.containsKey("hl.alternateField")) post.put("hl.alternateField", CollectionSchema.description.getSolrFieldName()); + if (!post.containsKey("hl.alternateField")) post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName()); if (!post.containsKey("hl.simple.pre")) post.put("hl.simple.pre", ""); if (!post.containsKey("hl.simple.post")) post.put("hl.simple.post", ""); if (!post.containsKey("hl.fragsize")) post.put("hl.fragsize", Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH)); diff --git a/htroot/yacysearch_location.java b/htroot/yacysearch_location.java index cb101934c..3112a7aaf 100644 --- a/htroot/yacysearch_location.java +++ b/htroot/yacysearch_location.java @@ -104,7 +104,7 @@ public class yacysearch_location { prop.put("kml_placemark_" + placemarkCounter + "_author", message.getAuthor()); prop.put("kml_placemark_" + placemarkCounter + "_copyright", message.getCopyright()); prop.put("kml_placemark_" + placemarkCounter + "_subject", message.getSubject()); - prop.put("kml_placemark_" + placemarkCounter + "_description", message.getDescription()); + prop.put("kml_placemark_" + placemarkCounter + "_description", message.getDescriptions().size() > 0 ? message.getDescriptions().get(0) : ""); prop.put("kml_placemark_" + placemarkCounter + "_date", message.getPubDate()); prop.putXML("kml_placemark_" + placemarkCounter + "_url", message.getLink()); prop.put("kml_placemark_" + placemarkCounter + "_pointname", message.getTitle()); diff --git a/source/net/yacy/cora/document/Hit.java b/source/net/yacy/cora/document/Hit.java index 2a2d78a82..a8b5bc2b0 100644 --- a/source/net/yacy/cora/document/Hit.java +++ b/source/net/yacy/cora/document/Hit.java @@ -27,6 +27,7 @@ package net.yacy.cora.document; import java.util.Date; +import java.util.List; public interface Hit { @@ -70,7 +71,7 @@ public interface Hit { public String getLanguage(); - public String getDescription(); + public List getDescriptions(); public Date getPubDate(); diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java index 4f4ed6d94..1e6c52342 100644 --- a/source/net/yacy/cora/document/RSSMessage.java +++ b/source/net/yacy/cora/document/RSSMessage.java @@ -25,11 +25,13 @@ package net.yacy.cora.document; import java.text.ParseException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; @@ -158,8 +160,11 @@ public class RSSMessage implements Hit, Comparable, Comparator getDescriptions() { + List ds = new ArrayList(); + String d = Token.description.valueFrom(this.map, ""); + if (d.length() > 0) ds.add(d); + return ds; } @Override @@ -216,7 +221,7 @@ public class RSSMessage implements Hit, Comparable, Comparator 0 ? " MIME=\"" + mime + "\"" : "") + ">"); writer.write(lb); //List texts = new ArrayList(); - String description = ""; + List descriptions = new ArrayList(); int size = 0; boolean title_written = false; // the solr index may contain several; we take only the first which should be the visible tag in for (IndexableField value: fields) { @@ -264,9 +265,9 @@ public class GSAResponseWriter implements QueryResponseWriter { title_written = true; continue; } - if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) { - description = value.stringValue(); - //texts.add(description); + if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) { + descriptions.add(value.stringValue()); + //texts.adds(description); continue; } if (CollectionSchema.last_modified.getSolrFieldName().equals(fieldName)) { @@ -290,8 +291,8 @@ public class GSAResponseWriter implements QueryResponseWriter { } // compute snippet from texts List snippet = urlhash == null ? null : snippets.get(urlhash); - OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? description : snippet.get(0)); - OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), description); + OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : snippet.get(0)) : snippet.get(0)); + OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), descriptions.size() > 0 ? descriptions.get(0) : ""); writer.write(""); if (YaCyVer == null) YaCyVer = yacyVersion.thisVersion().getName() + "/" + Switchboard.getSwitchboard().peers.mySeed().hash; OpensearchResponseWriter.solitaireTag(writer, GSAToken.ENT_SOURCE.name(), YaCyVer); diff --git a/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java index d04deb1f6..3a266d43b 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java @@ -59,7 +59,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter { // pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching private static final CollectionSchema[] extrafields = new CollectionSchema[]{ - CollectionSchema.id, CollectionSchema.title, CollectionSchema.description, CollectionSchema.text_t, + CollectionSchema.id, CollectionSchema.title, CollectionSchema.description_txt, CollectionSchema.text_t, CollectionSchema.h1_txt, CollectionSchema.h2_txt, CollectionSchema.h3_txt, CollectionSchema.h4_txt, CollectionSchema.h5_txt, CollectionSchema.h6_txt, }; static final Set SOLR_FIELDS = new HashSet(); @@ -163,7 +163,8 @@ public class OpensearchResponseWriter implements QueryResponseWriter { List fields = doc.getFields(); int fieldc = fields.size(); List texts = new ArrayList(); - String description = "", title = ""; + List descriptions = new ArrayList(); + String title = ""; for (int j = 0; j < fieldc; j++) { IndexableField value = fields.get(j); String fieldName = value.name(); @@ -204,8 +205,9 @@ public class OpensearchResponseWriter implements QueryResponseWriter { solitaireTag(writer, RSSMessage.Token.pubDate.name(), HeaderFramework.formatRFC1123(d)); continue; } - if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) { - description = value.stringValue(); + if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) { + String description = value.stringValue(); + descriptions.add(description); solitaireTag(writer, DublinCore.Description.getURIref(), description); texts.add(description); continue; @@ -233,10 +235,17 @@ public class OpensearchResponseWriter implements QueryResponseWriter { solitaireTag(writer, RSSMessage.Token.title.name(), title.length() == 0 ? (texts.size() == 0 ? "" : texts.get(0)) : title); List snippet = urlhash == null ? null : snippets.get(urlhash); String tagname = RSSMessage.Token.description.name(); - writer.write("<"); writer.write(tagname); writer.write('>'); - XML.escapeCharData(snippet == null || snippet.size() == 0 ? description : snippet.get(0), writer); - writer.write("\n"); - + if (snippet == null || snippet.size() == 0) { + for (String d: descriptions) { + writer.write("<"); writer.write(tagname); writer.write('>'); + XML.escapeCharData(snippet == null || snippet.size() == 0 ? d : snippet.get(0), writer); + writer.write("\n"); + } + } else { + writer.write("<"); writer.write(tagname); writer.write('>'); + XML.escapeCharData(snippet.get(0), writer); + writer.write("\n"); + } // open: where do we get the subject? //solitaireTag(writer, DublinCore.Subject.getURIref(), ""); // TODO: fill with actual data diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index 7dbc37b04..edd1f2a66 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -136,7 +136,8 @@ public class YJsonResponseWriter implements QueryResponseWriter { List texts = new ArrayList(); MultiProtocolURI url = null; String urlhash = null; - String description = "", title = ""; + List descriptions = new ArrayList(); + String title = ""; StringBuilder path = new StringBuilder(80); for (int j = 0; j < fieldc; j++) { IndexableField value = fields.get(j); @@ -166,8 +167,9 @@ public class YJsonResponseWriter implements QueryResponseWriter { texts.add(title); continue; } - if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) { - description = value.stringValue(); + if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) { + String description = value.stringValue(); + descriptions.add(description); texts.add(description); continue; } @@ -212,7 +214,7 @@ public class YJsonResponseWriter implements QueryResponseWriter { solitaireTag(writer, "path", path.toString()); solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title); List snippet = urlhash == null ? null : snippets.get(urlhash); - writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? description : snippet.get(0))); writer.write("\"\n}\n"); + writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0))); writer.write("\"\n}\n"); if (i < responseCount - 1) { writer.write(",\n".toCharArray()); } diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 3b9460838..82df795dd 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -527,7 +527,7 @@ public class CrawlQueues { ASCII.getBytes(hash), url, (referrer == null) ? null : referrer.hash(), - item.getDescription(), + item.getDescriptions().size() > 0 ? item.getDescriptions().get(0) : "", loaddate, this.sb.crawler.defaultRemoteProfile.handle(), 0, diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index 85a0c7c06..73d2f504b 100644 --- a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -90,11 +90,11 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle final Map words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words(); // generate potential tags from document title, description and subject - final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32; + final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; final StringBuilder buffer = new StringBuilder(bufferSize); final StringBuilder pwords = new StringBuilder(1000); buffer.append(document.dc_title().toLowerCase()); - buffer.append(document.dc_description().toLowerCase()); + for (String s:document.dc_description()) buffer.append(s.toLowerCase()); buffer.append(document.dc_subject(' ').toLowerCase()); final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib); try { diff --git a/source/net/yacy/data/ymark/YMarkEntry.java b/source/net/yacy/data/ymark/YMarkEntry.java index d29802a7f..0ed54adc8 100644 --- a/source/net/yacy/data/ymark/YMarkEntry.java +++ b/source/net/yacy/data/ymark/YMarkEntry.java @@ -166,7 +166,7 @@ public class YMarkEntry extends TreeMap { public YMarkEntry(final DCEntry dc) { super(); for (BOOKMARK b : BOOKMARK.values()) { - if(dc.containsKey(b.dc_attrb)) { + if (dc.getMap().containsKey(b.dc_attrb)) { this.put(b.key(), dc.get(b.dc_attrb)); } } @@ -218,7 +218,7 @@ public class YMarkEntry extends TreeMap { final DCEntry dc = new DCEntry(); for (BOOKMARK b : BOOKMARK.values()) { if(!b.dc_attrb.isEmpty() && this.containsKey(b.key())) { - dc.put(b.dc_attrb, this.get(b.key())); + dc.getMap().put(b.dc_attrb, new String[]{this.get(b.key())}); } } return dc; diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java index e14812d4f..7c0ba3d19 100644 --- a/source/net/yacy/data/ymark/YMarkMetadata.java +++ b/source/net/yacy/data/ymark/YMarkMetadata.java @@ -132,7 +132,7 @@ public class YMarkMetadata { metadata.put(METADATA.CREATOR, this.document.dc_creator()); metadata.put(METADATA.KEYWORDS, this.document.dc_subject(' ')); metadata.put(METADATA.PUBLISHER, this.document.dc_publisher()); - metadata.put(METADATA.DESCRIPTION, this.document.dc_description()); + metadata.put(METADATA.DESCRIPTION, this.document.dc_description().length > 0 ? this.document.dc_description()[0] : ""); metadata.put(METADATA.MIMETYPE, this.document.dc_format()); metadata.put(METADATA.LANGUAGE, this.document.dc_language()); metadata.put(METADATA.CHARSET, this.document.getCharset()); diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index c8b74914f..86dde3561 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -129,7 +129,9 @@ public final class Condenser { // phrase 99 is taken from the media Link url and anchor description // phrase 100 and above are lines from the text insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); + for (String description: document.dc_description()) { + insertTextToWords(new SentenceReader(description), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); + } insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 08fce3550..9d4437a46 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -75,8 +75,8 @@ public class Document { private List titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result private final StringBuilder creator; // author or copyright private final String publisher; // publisher - private final List sections; // if present: more titles/headlines appearing in the document - private final StringBuilder description; // an abstract, if present: short content description + private final List sections; // if present: more titles/headlines appearing in the document + private final List descriptions; // an abstract, if present: short content description private Object text; // the clear text, all that is visible private final Map anchors; // all links embedded as clickeable entities (anchor tags) private final Map rss; // all embedded rss feeds @@ -101,7 +101,7 @@ public class Document { final String[] keywords, final List titles, final String author, final String publisher, - final String[] sections, final String abstrct, + final String[] sections, final List abstrcts, final double lon, final double lat, final Object text, final Map anchors, @@ -118,7 +118,7 @@ public class Document { this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.sections = new LinkedList() ; if (sections != null) this.sections.addAll(Arrays.asList(sections)); - this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); + this.descriptions = (abstrcts == null) ? new ArrayList() : abstrcts; if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) { this.lon = lon; this.lat = lat; @@ -288,10 +288,9 @@ dc_rights return sb.substring(0, sb.length() - 1); } - public String dc_description() { - if (this.description == null) - return dc_title(); - return this.description.toString(); + public String[] dc_description() { + if (descriptions == null) return new String[0]; + return this.descriptions.toArray(new String[this.descriptions.size()]); } public String dc_publisher() { @@ -646,9 +645,7 @@ dc_rights this.sections.addAll(doc.sections); this.titles.addAll(doc.titles()); this.keywords.addAll(doc.getKeywords()); - - if (this.description.length() > 0) this.description.append('\n'); - this.description.append(doc.dc_description()); + for (String d: doc.dc_description()) this.descriptions.add(d); if (!(this.text instanceof ByteArrayOutputStream)) { this.text = new ByteArrayOutputStream(); @@ -779,7 +776,7 @@ dc_rights final StringBuilder authors = new StringBuilder(80); final StringBuilder publishers = new StringBuilder(80); final StringBuilder subjects = new StringBuilder(80); - final StringBuilder description = new StringBuilder(80); + final List descriptions = new ArrayList(); final Collection titles = new LinkedHashSet(); final Collection sectionTitles = new LinkedHashSet(); final Map anchors = new HashMap(); @@ -810,9 +807,7 @@ dc_rights titles.addAll(doc.titles()); sectionTitles.addAll(Arrays.asList(doc.getSectionTitles())); - - if (description.length() > 0) description.append("\n"); - description.append(doc.dc_description()); + for (String d: doc.dc_description()) descriptions.add(d); if (doc.getTextLength() > 0) { if (docTextLength > 0) content.write('\n'); @@ -851,7 +846,7 @@ dc_rights authors.toString(), publishers.toString(), sectionTitles.toArray(new String[sectionTitles.size()]), - description.toString(), + descriptions, lon, lat, content.getBytes(), anchors, diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index e7529d329..b7253eac4 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -37,12 +37,14 @@ import java.util.List; import java.util.Locale; import java.util.TreeMap; +import org.apache.solr.common.params.MultiMapSolrParams; + import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; -public class DCEntry extends TreeMap { +public class DCEntry extends MultiMapSolrParams { private static final long serialVersionUID = -2050291583515701559L; @@ -55,7 +57,7 @@ public class DCEntry extends TreeMap { public static final DCEntry poison = new DCEntry(); public DCEntry() { - super((Collator) insensitiveCollator.clone()); + super(new TreeMap((Collator) insensitiveCollator.clone())); } public DCEntry( @@ -67,14 +69,14 @@ public class DCEntry extends TreeMap { double lat, double lon ) { - super((Collator) insensitiveCollator.clone()); - this.put("dc:identifier", url.toNormalform(true)); - this.put("dc:date", ISO8601Formatter.FORMATTER.format(date)); - this.put("dc:title", title); - this.put("dc:creator", author); - this.put("dc:description", body); - this.put("geo:lat", Double.toString(lat)); - this.put("geo:long", Double.toString(lon)); + super(new TreeMap((Collator) insensitiveCollator.clone())); + this.getMap().put("dc:identifier", new String[]{url.toNormalform(true)}); + this.getMap().put("dc:date", new String[]{ISO8601Formatter.FORMATTER.format(date)}); + this.getMap().put("dc:title", new String[]{title}); + this.getMap().put("dc:creator", new String[]{author}); + this.getMap().put("dc:description", new String[]{body}); + this.getMap().put("geo:lat", new String[]{Double.toString(lat)}); + this.getMap().put("geo:long", new String[]{Double.toString(lon)}); } /* @@ -222,14 +224,12 @@ public class DCEntry extends TreeMap { return t; } - public String getDescription() { - String t = this.get("body"); - if (t == null) t = this.get("dc:description"); - if (t == null) t = this.get("dc:subject"); - if (t == null) t = this.get("categories"); - t = stripCDATA(t); - if (t == null) return ""; - return t; + public List getDescriptions() { + String[] t = this.getParams("dc:description"); + List descriptions = new ArrayList(); + if (t == null) return descriptions; + for (String s: t) descriptions.add(stripCDATA(s)); + return descriptions; } public String[] getSubject() { @@ -280,9 +280,9 @@ public class DCEntry extends TreeMap { getCreator(), getPublisher(), null, - "", + getDescriptions(), getLon(), getLat(), - getDescription(), + "", null, null, null, diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index d33c62ae9..3e0199d25 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -169,7 +169,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { //System.out.println("BUFFER-SIZE=" + buffer.length()); final String value = buffer.toString().trim(); if (this.elementName != null) { - this.surrogate.put(this.elementName, value); + this.surrogate.getMap().put(this.elementName, new String[]{value}); } this.buffer.setLength(0); this.parsingValue = false; @@ -179,9 +179,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable { value.replaceAll(";", ","); String oldcontent = this.surrogate.get(this.elementName); if (oldcontent == null) { - this.surrogate.put(this.elementName, value); + this.surrogate.getMap().put(this.elementName, new String[]{value}); } else { - this.surrogate.put(this.elementName, oldcontent + ";" + value); + this.surrogate.getMap().put(this.elementName, new String[]{oldcontent + ";" + value}); } } this.buffer.setLength(0); @@ -222,7 +222,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { System.out.println("Publisher: " + s.getPublisher()); System.out.println("URL: " + s.getIdentifier(true)); System.out.println("Language: " + s.getLanguage()); - System.out.println("Body: " + s.getDescription()); + System.out.println("Body: " + s.getDescriptions().toString()); } } catch (final IOException e) { ConcurrentLog.logException(e); diff --git a/source/net/yacy/document/parser/audioTagParser.java b/source/net/yacy/document/parser/audioTagParser.java index 4395d3d6a..aa5f3131d 100644 --- a/source/net/yacy/document/parser/audioTagParser.java +++ b/source/net/yacy/document/parser/audioTagParser.java @@ -124,44 +124,36 @@ public class audioTagParser extends AbstractParser implements Parser { titles.add(filename); // text + final List descriptions = new ArrayList(7); final StringBuilder text = new StringBuilder(500); final char space = ' '; - text.append(tag.getFirst(FieldKey.ARTIST)); - text.append(space); - text.append(tag.getFirst(FieldKey.ALBUM)); - text.append(space); - text.append(tag.getFirst(FieldKey.TITLE)); - text.append(space); - text.append(tag.getFirst(FieldKey.COMMENT)); - text.append(space); - text.append(tag.getFirst(FieldKey.LYRICS)); - text.append(space); - text.append(tag.getFirst(FieldKey.TAGS)); - text.append(space); - text.append(tag.getFirst(FieldKey.GENRE)); - text.append(space); + String field = tag.getFirst(FieldKey.ARTIST); + descriptions.add(FieldKey.ARTIST.name() + ": " + field); + text.append(field); text.append(space); + field = tag.getFirst(FieldKey.ALBUM); + descriptions.add(FieldKey.ALBUM.name() + ": " + field); + text.append(field); text.append(space); + field = tag.getFirst(FieldKey.TITLE); + descriptions.add(FieldKey.TITLE.name() + ": " + field); + text.append(field); text.append(space); + field = tag.getFirst(FieldKey.COMMENT); + descriptions.add(FieldKey.COMMENT.name() + ": " + field); + text.append(field); text.append(space); + field = tag.getFirst(FieldKey.LYRICS); + descriptions.add(FieldKey.LYRICS.name() + ": " + field); + text.append(field); text.append(space); + field = tag.getFirst(FieldKey.TAGS); + descriptions.add(FieldKey.TAGS.name() + ": " + field); + text.append(field); text.append(space); + field = tag.getFirst(FieldKey.GENRE); + descriptions.add(FieldKey.GENRE.name() + ": " + field); + text.append(field); text.append(space); text.append(location.toTokens()); // dc:subject final String[] subject = new String[1]; subject[0] = tag.getFirst(FieldKey.GENRE); - // description - final StringBuilder desc = new StringBuilder(500); - final String sep = " - "; - int count = desc.length(); - desc.append(tag.getFirst(FieldKey.ARTIST)); - if(desc.length() > count) { - desc.append(sep); - count = desc.length(); - } - desc.append(tag.getFirst(FieldKey.ALBUM)); - if(desc.length() > count) { - desc.append(sep); - count = desc.length(); - } - desc.append(tag.getFirst(FieldKey.TITLE)); - docs = new Document[]{new Document( location, mime, @@ -173,7 +165,7 @@ public class audioTagParser extends AbstractParser implements Parser { tag.getFirst(FieldKey.ARTIST), // author location.getHost(), // publisher null, // sections - desc.toString(), // abstrct + descriptions, // abstrct 0.0f, 0.0f, // lon, lat text.toString(), // text null, diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index cf25f216c..8cea42843 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -590,12 +590,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } - if (this.titles.size() == 0) { - // take description tag - s = getDescription(); - if (!s.isEmpty()) this.titles.add(s); - } - // extract headline from file name ArrayList t = new ArrayList(); t.addAll(this.titles); @@ -768,11 +762,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { return false; } - public String getDescription() { + public List getDescriptions() { String s = this.metas.get("description"); if (s == null) s = this.metas.get("dc.description"); - if (s == null) return EMPTY_STRING; - return s; + List descriptions = new ArrayList(); + if (s == null) return descriptions; + descriptions.add(s); + return descriptions; } public String getContentType() { diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index a8d84a39b..99b063402 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -135,7 +135,7 @@ public class htmlParser extends AbstractParser implements Parser { scraper.getAuthor(), scraper.getPublisher(), sections, - scraper.getDescription(), + scraper.getDescriptions(), scraper.getLon(), scraper.getLat(), scraper.getText(), scraper.getAnchors(), diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 05473f5c5..f8915e9a8 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -34,9 +34,11 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.List; import java.util.Properties; import java.util.Set; @@ -100,7 +102,7 @@ public class genericImageParser extends AbstractParser implements Parser { String title = null; String author = null; String keywords = null; - String description = null; + List descriptions = new ArrayList(); String filename = location.getFileName(); String ext = MultiProtocolURI.getFileExtension(filename); double gpslat = 0; @@ -179,10 +181,11 @@ public class genericImageParser extends AbstractParser implements Parser { if (keywords == null || keywords.isEmpty()) keywords = props.get("Category"); if (keywords == null || keywords.isEmpty()) keywords = props.get("Supplemental Category(s)"); - description = props.get("Caption/Abstract"); - if (description == null || description.isEmpty()) description = props.get("Country/Primary Location"); - if (description == null || description.isEmpty()) description = props.get("Province/State"); - if (description == null || description.isEmpty()) description = props.get("Copyright Notice"); + String description; + description = props.get("Caption/Abstract"); if (description != null && description.length() > 0) descriptions.add("Abstract: " + description); + description = props.get("Country/Primary Location"); if (description != null && description.length() > 0) descriptions.add("Location: " + description); + description = props.get("Province/State"); if (description != null && description.length() > 0) descriptions.add("State: " + description); + description = props.get("Copyright Notice"); if (description != null && description.length() > 0) descriptions.add("Copyright: " + description); } catch (final JpegProcessingException e) { //Log.logException(e); @@ -212,7 +215,7 @@ public class genericImageParser extends AbstractParser implements Parser { author == null ? "" : author, // author location.getHost(), // Publisher new String[]{}, // sections - description == null ? "" : description, // description + descriptions, // description gpslon, gpslat, // location infoString, // content text anchors, // anchors diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index ed44c490d..ccf5a3e44 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -29,8 +29,10 @@ package net.yacy.document.parser; import java.io.File; import java.io.InputStream; +import java.util.ArrayList; import java.util.Enumeration; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; @@ -176,6 +178,8 @@ public class odtParser extends AbstractParser implements Parser { // create the parser document Document[] docs = null; final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString()); + List descriptions = new ArrayList(); + if (docDescription != null && docDescription.length() > 0) descriptions.add(docDescription); docs = new Document[]{new Document( location, mimeType, @@ -187,7 +191,7 @@ public class odtParser extends AbstractParser implements Parser { docAuthor, "", null, - docDescription, + descriptions, 0.0f, 0.0f, contentBytes, null, diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 96688d225..5f7ca2a21 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -29,8 +29,10 @@ package net.yacy.document.parser; import java.io.File; import java.io.InputStream; +import java.util.ArrayList; import java.util.Enumeration; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; @@ -161,6 +163,8 @@ public class ooxmlParser extends AbstractParser implements Parser { // create the parser document Document[] docs = null; final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString()); + List descriptions = new ArrayList(); + if (docDescription != null && docDescription.length() > 0) descriptions.add(docDescription); docs = new Document[]{new Document( location, mimeType, @@ -172,7 +176,7 @@ public class ooxmlParser extends AbstractParser implements Parser { docAuthor, "", null, - docDescription, + descriptions, 0.0f, 0.0f, contentBytes, null, diff --git a/source/net/yacy/document/parser/rdfParser.java b/source/net/yacy/document/parser/rdfParser.java index 4d495ca08..69e0ab344 100644 --- a/source/net/yacy/document/parser/rdfParser.java +++ b/source/net/yacy/document/parser/rdfParser.java @@ -59,7 +59,7 @@ public class rdfParser extends AbstractParser implements Parser { String all = "rdfdatasource"; doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", - "", null, "", 0, 0, all, null, null, null, false); + "", null, new ArrayList(0), 0, 0, all, null, null, null, false); docs.add(doc); diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java index 45978122d..4ef55ba76 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java @@ -12,6 +12,7 @@ import java.io.InputStreamReader; import java.io.Reader; import java.net.MalformedURLException; import java.net.URL; +import java.util.ArrayList; import java.util.HashSet; import java.util.Set; @@ -78,7 +79,7 @@ public class RDFaParser extends AbstractParser implements Parser { } Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", - "", null, "", 0, 0, null, null, null, null, false); + "", null, new ArrayList(0), 0, 0, null, null, null, null, false); try { if (allTriples.length > 0) @@ -137,7 +138,7 @@ public class RDFaParser extends AbstractParser implements Parser { } Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", - "", null, "", 0, 0, all, null, null, null, false); + "", null, new ArrayList(0), 0, 0, all, null, null, null, false); return doc; } diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index e2d555ce7..809b2d2e8 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -95,7 +95,7 @@ public class rssParser extends AbstractParser implements Parser { item.getAuthor(), item.getCopyright(), new String[0], - item.getDescription(), + item.getDescriptions(), item.getLon(), item.getLat(), null, diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index 77b486fbd..ef734595e 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -90,7 +90,7 @@ public class sitemapParser extends AbstractParser implements Parser { "", "", new String[0], - "", + new ArrayList(), 0.0f, 0.0f, null, null, diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 2f974aa2d..b42a43b89 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -29,7 +29,9 @@ package net.yacy.document.parser; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Properties; @@ -76,7 +78,7 @@ public class swfParser extends AbstractParser implements Parser { String urlnr = null; final String linebreak = System.getProperty("line.separator"); final String[] sections = null; - final String abstrct = null; + final List abstrct = new ArrayList(); //TreeSet images = null; final Map anchors = new HashMap(); int urls = 0; diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index ea751c478..867c6ea25 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -32,9 +32,11 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; +import java.util.List; import java.util.Properties; import net.yacy.cora.document.UTF8; @@ -207,6 +209,7 @@ public class vcfParser extends AbstractParser implements Parser { final String[] sections = parsedNames.toArray(new String[parsedNames.size()]); final byte[] text = UTF8.getBytes(parsedDataText.toString()); + final List descriptions = new ArrayList(1); descriptions.add("vCard"); return new Document[]{new Document( url, // url of the source document mimeType, // the documents mime type @@ -218,7 +221,7 @@ public class vcfParser extends AbstractParser implements Parser { "", // TODO: AUTHOR "", // the publisher sections, // an array of section headlines - "vCard", // an abstract + descriptions, // an abstract 0.0f, 0.0f, text, // the parsed document text anchors, // a map of extracted anchors diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index 4cd1f747c..9eb458456 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -28,6 +28,8 @@ package net.yacy.document.parser; import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; @@ -90,16 +92,10 @@ public class vsdParser extends AbstractParser implements Parser { title = summary.getTitle(); } - String abstrct = null; - abstrct = ((contents.length() > 80)? contents.substring(0, 80) : contents.trim()). - replaceAll("\r\n"," "). - replaceAll("\n"," "). - replaceAll("\r"," "). - replaceAll("\t"," "); + List abstrct = new ArrayList(); + if (contents.length() > 0) abstrct.add(((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).replaceAll("\r\n"," ").replaceAll("\n"," ").replaceAll("\r"," ").replaceAll("\t"," ")); - if (title == null) { - title = abstrct; - } + if (title == null) title = location.toNormalform(true); // As the result of parsing this function must return a plasmaParserDocument object return new Document[]{new Document( diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 1ecce3742..c3125360e 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -312,8 +312,8 @@ public class URIMetadataNode { return getString(CollectionSchema.text_t); } - public String getDescription() { - return getString(CollectionSchema.description); + public ArrayList getDescription() { + return getStringList(CollectionSchema.description_txt); } public boolean isOlder(URIMetadataRow other) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 927f60efb..98b1a14b9 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2916,7 +2916,7 @@ public final class Switchboard extends serverSwitch { } final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title(); - final String description = scraper.dc_description(); + final String description = scraper.dc_description().length > 0 ? scraper.dc_description()[0] : ""; // add the url to the crawl stack this.crawler.removePassive(handle); // if there is an old entry, delete it diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 2d220acdc..d1499dbd7 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -806,10 +806,11 @@ public final class Fulltext { } else { BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), - CollectionSchema.author.getSolrFieldName(), CollectionSchema.description.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); + CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); SolrDocument doc; ArrayList title; - String url, author, description, hash; + String url, author, hash; + String[] descriptions; Integer size; Date date; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { @@ -817,7 +818,7 @@ public final class Fulltext { url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); title = (ArrayList) doc.getFieldValue(CollectionSchema.title.getSolrFieldName()); author = (String) doc.getFieldValue(CollectionSchema.author.getSolrFieldName()); - description = (String) doc.getFieldValue(CollectionSchema.description.getSolrFieldName()); + descriptions = (String[]) doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()); size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; @@ -832,7 +833,9 @@ public final class Fulltext { if (title != null) pw.println("" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + ""); pw.println("" + MultiProtocolURI.escape(url) + ""); if (author != null && !author.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(author, true) + ""); - if (description != null && !description.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(description, true) + ""); + if (descriptions != null && descriptions.length > 0) { + for (String d: descriptions) pw.println("" + CharacterCoding.unicode2xml(d, true) + ""); + } if (date != null) pw.println("" + HeaderFramework.formatRFC1123(date) + ""); if (size != null) pw.println("" + size.intValue() + ""); pw.println("" + hash + ""); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 193aff404..604a4a769 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -625,7 +625,7 @@ public class Segment { if (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.host_id_s)) { uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ {CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b}, - {CollectionSchema.description, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) { + {CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) { CollectionSchema checkfield = checkfields[0]; CollectionSchema signaturefield = checkfields[1]; CollectionSchema uniquefield = checkfields[2]; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 6549201a4..b90989094 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -245,16 +245,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.title_words_val, cv); } - String description = md.snippet(); if (description == null) description = ""; - if (allAttr || contains(CollectionSchema.description)) add(doc, CollectionSchema.description, description); - if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, 1); + String description = md.snippet(); + boolean description_exist = description != null; + if (description == null) description = ""; + if (allAttr || contains(CollectionSchema.description_txt)) add(doc, CollectionSchema.description_txt, description_exist ? new String[]{description} : new String[0]); + if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, description_exist ? 1 : 0); if (allAttr || contains(CollectionSchema.description_chars_val)) { - Integer[] cv = new Integer[]{new Integer(description.length())}; - add(doc, CollectionSchema.description_chars_val, cv); + add(doc, CollectionSchema.description_chars_val, description_exist ? new Integer[]{new Integer(description.length())} : new Integer[0]); } if (allAttr || contains(CollectionSchema.description_words_val)) { - Integer[] cv = new Integer[]{new Integer(CommonPattern.SPACE.split(description).length)}; - add(doc, CollectionSchema.description_words_val, cv); + add(doc, CollectionSchema.description_words_val, description_exist ? new Integer[]{new Integer(description.length() == 0 ? 0 : CommonPattern.SPACE.split(description).length)} : new Integer[0]); } String filename = digestURI.getFileName(); @@ -424,23 +424,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.title_words_val, cv); } - String description = document.dc_description(); - List descriptions = new ArrayList(); - for (String s: CommonPattern.NEWLINE.split(description)) descriptions.add(s); - if (allAttr || contains(CollectionSchema.description)) { - add(doc, CollectionSchema.description, description); - if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && description != null && description.length() > 0) { - add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(description)); + String[] descriptions = document.dc_description(); + if (allAttr || contains(CollectionSchema.description_txt)) { + add(doc, CollectionSchema.description_txt, descriptions); + if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && descriptions != null && descriptions.length > 0) { + add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(descriptions)); } } - if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.size()); + if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.length); if (allAttr || contains(CollectionSchema.description_chars_val)) { - ArrayList cv = new ArrayList(descriptions.size()); + ArrayList cv = new ArrayList(descriptions.length); for (String s: descriptions) cv.add(new Integer(s.length())); add(doc, CollectionSchema.description_chars_val, cv); } if (allAttr || contains(CollectionSchema.description_words_val)) { - ArrayList cv = new ArrayList(descriptions.size()); + ArrayList cv = new ArrayList(descriptions.length); for (String s: descriptions) cv.add(new Integer(CommonPattern.SPACE.split(s).length)); add(doc, CollectionSchema.description_words_val, cv); } diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index a65c714e7..8eb31fe6a 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -75,7 +75,7 @@ public enum CollectionSchema implements SchemaDeclaration { ip_s(SolrType.string, true, true, false, false, false, "ip of host of url (after DNS lookup)"), author(SolrType.text_general, true, true, false, false, true, "content of author-tag"), author_sxt(SolrType.string, true, true, true, false, false, "content of author-tag as copy-field from author. This is used for facet generation"), - description(SolrType.text_general, true, true, false, false, true, "content of description-tag"), + description_txt(SolrType.text_general, true, true, true, false, true, "content of description-tag(s)"), description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"), description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"), keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"), diff --git a/source/net/yacy/server/http/HTTPDFileHandler.java b/source/net/yacy/server/http/HTTPDFileHandler.java index 5e0d05165..08a82d617 100644 --- a/source/net/yacy/server/http/HTTPDFileHandler.java +++ b/source/net/yacy/server/http/HTTPDFileHandler.java @@ -489,7 +489,8 @@ public final class HTTPDFileHandler { File f; String size; long sz; - String headline, author, description, publisher; + String headline, author, publisher; + List descriptions; int images, links; ContentScraper scraper; for (final String element : list) { @@ -503,14 +504,14 @@ public final class HTTPDFileHandler { headline = t.size() > 0 ? t.iterator().next() : ""; author = scraper.getAuthor(); publisher = scraper.getPublisher(); - description = scraper.getDescription(); + descriptions = scraper.getDescriptions(); images = scraper.getImages().size(); links = scraper.getAnchors().size(); } else { headline = null; author = null; publisher = null; - description = null; + descriptions = null; images = 0; links = 0; } @@ -527,7 +528,11 @@ public final class HTTPDFileHandler { aBuffer.append("" + element + "
"); if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "
"); if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "
"); - if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "
"); + if (descriptions != null && descriptions.size() > 0) { + for (String d: descriptions) { + aBuffer.append("Description: " + d + "
"); + } + } aBuffer.append(GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "
\n"); } }