From b7417ac329d1327770c41bb3c9e46266f9dd536a Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 5 Apr 2017 00:08:25 +0200 Subject: [PATCH] Introduce a Keyword search navigator using the index field keywords. The keywords field string is split into words, which become the navigator entries. A keyword navigator facet is essential for search appliance usage, where documents and metadata often use specialized keyword vocabularies to filter search results. This navigator can be used without a custom index schema. As we have not yet defined a search query command to filter on "keywords", filtering is limited to adding the keyword to the search query. --- .../search/navigator/NavigatorPlugins.java | 5 ++ .../navigator/TokenizedStringNavigator.java | 77 +++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 source/net/yacy/search/navigator/TokenizedStringNavigator.java diff --git a/source/net/yacy/search/navigator/NavigatorPlugins.java b/source/net/yacy/search/navigator/NavigatorPlugins.java index 278ff1941..843ea1f59 100644 --- a/source/net/yacy/search/navigator/NavigatorPlugins.java +++ b/source/net/yacy/search/navigator/NavigatorPlugins.java @@ -48,6 +48,7 @@ public class NavigatorPlugins { defaultnavplugins.put("namespace", "Wiki Name Space"); defaultnavplugins.put("year", "Year"); // defaultnavplugins.put("year:dates_in_content_dts:Event","Event"); + defaultnavplugins.put("keywords", "Keywords"); return defaultnavplugins; } @@ -118,6 +119,10 @@ public class NavigatorPlugins { navigatorPlugins.put("year", new YearNavigator("Year", CollectionSchema.last_modified)); } } + + if (navname.contains("keywords")) { + navigatorPlugins.put("keywords", new TokenizedStringNavigator("Keywords", CollectionSchema.keywords)); + } } return navigatorPlugins; } diff --git a/source/net/yacy/search/navigator/TokenizedStringNavigator.java b/source/net/yacy/search/navigator/TokenizedStringNavigator.java new file mode 100644 index 000000000..2298ea37b --- /dev/null +++ 
b/source/net/yacy/search/navigator/TokenizedStringNavigator.java
@@ -0,0 +1,95 @@
+/**
+ * TokenizedStringNavigator.java
+ * (C) 2017 by reger24; https://github.com/reger24
+ *
+ * This is a part of YaCy, a peer-to-peer based web search engine
+ *
+ * LICENSE
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+package net.yacy.search.navigator;
+
+import java.util.Collection;
+import java.util.Locale;
+import java.util.StringTokenizer;
+import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.search.Switchboard;
+import net.yacy.search.schema.CollectionSchema;
+
+/**
+ * Search navigator for string entries based on ScoreMap to count and
+ * order the result list by counted occurrence. The string values are
+ * tokenized at the delimiters " ,;" and each word is added (lowercased)
+ * to the score map through inc(String).
+ */
+public class TokenizedStringNavigator extends StringNavigator implements Navigator {
+
+    /** Characters that separate two words inside a field value. */
+    private static final String DELIMITERS = " ,;";
+
+    /**
+     * @param title navigator title, handed to the StringNavigator constructor
+     * @param field index field to read, handed to the StringNavigator constructor
+     */
+    public TokenizedStringNavigator(String title, CollectionSchema field) {
+        super(title, field);
+    }
+
+    /**
+     * Increase the score for every word contained in the defined field of
+     * the doc. The field value is tokenized using the delimiters " ,;". A
+     * multi-valued field (Collection) is processed entry by entry. Null
+     * values and entries that are not strings are skipped.
+     *
+     * @param doc Solrdocument with field for the key content
+     */
+    @Override
+    public void incDoc(URIMetadataNode doc) {
+        if (field == null) {
+            return;
+        }
+        final Object val = doc.getFieldValue(field.getSolrFieldName());
+        if (val instanceof Collection) {
+            for (final Object entry : (Collection<?>) val) {
+                // instanceof is false for null, so null entries are skipped too
+                if (entry instanceof String) {
+                    countWords((String) entry);
+                }
+            }
+        } else if (val instanceof String) {
+            countWords((String) val);
+        }
+    }
+
+    /**
+     * Lowercase the text, split it at the delimiters and increase the score
+     * of every word that is longer than one character and not a stopword.
+     *
+     * @param text a single field value
+     */
+    private void countWords(final String text) {
+        // Locale.ROOT keeps the lowercasing independent of the default locale
+        // (a Turkish default locale would map 'I' to a dotless i).
+        // StringTokenizer was chosen by the author as faster than a regex pattern.
+        final StringTokenizer tokens = new StringTokenizer(text.toLowerCase(Locale.ROOT), DELIMITERS);
+        while (tokens.hasMoreTokens()) {
+            final String word = tokens.nextToken();
+            if (word.length() > 1 && !Switchboard.stopwords.contains(word)) {
+                this.inc(word);
+            }
+        }
+    }
+}