From 52a9040ae6e14a247942716ab3b728231cdf3362 Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 13 Nov 2015 01:48:28 +0100 Subject: [PATCH] Sort out duplicate keywords (dc_subject) early in parsed documents - by directly using a Set instead of a List - remove the unneeded String[] getter --- htroot/api/getpageinfo.java | 7 ++- htroot/api/getpageinfo_p.java | 7 ++- source/net/yacy/document/Document.java | 59 ++++++++++++------------- source/net/yacy/search/Switchboard.java | 2 +- 4 files changed, 36 insertions(+), 39 deletions(-) diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index bd5c9e7e7..4d999ec2a 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -114,12 +114,11 @@ public class getpageinfo { prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString()); // put keywords - final String list[] = scraper.dc_subject(); + final Set list = scraper.dc_subject(); int count = 0; for (final String element: list) { - final String tag = element; - if (!tag.equals("")) { - prop.putXML("tags_"+count+"_tag", tag); + if (!element.equals("")) { + prop.putXML("tags_"+count+"_tag", element); count++; } } diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index 1b7418ff3..f5269ad32 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -114,12 +114,11 @@ public class getpageinfo_p { prop.put("favicon", (scraper.getFavicon()==null) ? 
"" : scraper.getFavicon().toString()); // put keywords - final String list[] = scraper.dc_subject(); + final Set list = scraper.dc_subject(); int count = 0; for (final String element: list) { - final String tag = element; - if (!tag.equals("")) { - prop.putXML("tags_"+count+"_tag", tag); + if (!element.equals("")) { + prop.putXML("tags_"+count+"_tag", element); count++; } } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index cca71e75c..54dcf21c3 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -46,7 +46,6 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -71,7 +70,7 @@ public class Document { private DigestURL source; // the source url private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document - private final List keywords; // most resources provide a keyword field + private final Set keywords; // most resources provide a keyword field private List titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result private final StringBuilder creator; // author or copyright private final String publisher; // publisher @@ -115,7 +114,7 @@ public class Document { this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.charset = charset; this.parserObject = parserObject; - this.keywords = new LinkedList(); + this.keywords = new LinkedHashSet(); if (keywords != null) this.keywords.addAll(Arrays.asList(keywords)); this.titles = (titles == null) ? new ArrayList(1) : titles; this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); @@ -214,6 +213,10 @@ dc_coverage dc_rights */ + /** + * Get the main document title. This is the 1st in the list of titles. 
+ * @return title_string (may return null or empty string) + */ public String dc_title() { return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next(); } @@ -222,6 +225,10 @@ dc_rights return this.titles; } + /** + * Sets the title of the document, replacing any existing titles. + * @param title + */ public void setTitle(final String title) { this.titles = new ArrayList(); if (title != null) this.titles.add(title); @@ -239,11 +246,8 @@ dc_rights * @param tags */ public void addTags(Set tags) { - for (String s: this.keywords) { - tags.remove(s); - } for (String s: tags) { - this.keywords.add(s); + if (s != null && !s.isEmpty()) this.keywords.add(s); } } @@ -274,28 +278,27 @@ dc_rights } return gf; } - - public String[] dc_subject() { - // sort out doubles and empty words - final TreeSet hs = new TreeSet(); - String s; - for (int i = 0; i < this.keywords.size(); i++) { - if (this.keywords.get(i) == null) continue; - s = (this.keywords.get(i)).trim(); - if (!s.isEmpty()) hs.add(s); - } - final String[] t = new String[hs.size()]; - int i = 0; - for (final String u: hs) t[i++] = u; - return t; + + /** + * Get the set of keywords associated with the document + * @return set of unique keywords + */ + public Set dc_subject() { + return this.keywords; } + /** + * Get the keywords associated with the document joined into a single string, + * with each keyword separated by the separator character + * + * @param separator character + * @return string of keywords or empty string + */ public String dc_subject(final char separator) { - final String[] t = dc_subject(); - if (t.length == 0) return ""; + if (this.keywords.size() == 0) return ""; // generate a new list - final StringBuilder sb = new StringBuilder(t.length * 8); - for (final String s: t) sb.append(s).append(separator); + final StringBuilder sb = new StringBuilder(this.keywords.size() * 8); + for (final String s: this.keywords) sb.append(s).append(separator); return sb.substring(0, sb.length() - 1); } 
@@ -427,10 +430,6 @@ dc_rights return sentences; } - public List getKeywords() { - return this.keywords; - } - public Collection getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map @@ -688,7 +687,7 @@ dc_rights for (final Document doc: docs) { this.sections.addAll(doc.sections); this.titles.addAll(doc.titles()); - this.keywords.addAll(doc.getKeywords()); + this.keywords.addAll(doc.dc_subject()); for (String d: doc.dc_description()) this.descriptions.add(d); if (!(this.text instanceof ByteArrayOutputStream)) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 9597b9438..25deaf73a 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3205,7 +3205,7 @@ public final class Switchboard extends serverSwitch { //final Set tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); final Set tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart")); tags.add("crawlStart"); - final String[] keywords = scraper.dc_subject(); + final Set keywords = scraper.dc_subject(); if (keywords != null) { for (final String k: keywords) { final String kk = BookmarkHelper.cleanTagsString(k);