diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index e0b505459..ebefb1060 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -103,12 +103,12 @@ To see a list of all APIs, please visit the -->
Vocabulary Production - It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub. - This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term. - This works best with wikis. Try to use a wiki url as objectspace path. +
-
Vocabulary Name
(this will become the name of a search facet)
+
Vocabulary Name
+
+
this shall be a search facet (disable this for large vocabularies!)

Empty Vocabulary 
@@ -119,23 +119,31 @@ To see a list of all APIs, please visit the
+
+
It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub. + This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term. + This works best with wikis. Try to use a wiki URL as objectspace path.
Objectspace
-
Import from a csv file 
+
Import from a csv file 
File Path
Column for Literals
(first has index 0)
+
Synonyms
+
No Synonyms
+ Auto-Enrich with Synonyms from Stemming Library
+ Read Column
+ (first has index 0) +
Column for Object Link (optional)
(first has index 0, if unused set -1)
Charset of Import File
-
Auto-Enrich with Synonyms from Stemming Library
-
@@ -155,6 +163,7 @@ To see a list of all APIs, please visit the

if set, uses the predicate
#[objectspacepredicate]# for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)# +
Is Facet?
(If checked, this vocabulary is used for search facets. Not feasible for large vocabularies!)
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 8d56eb999..038504c0b 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -26,6 +26,7 @@ import java.io.InputStreamReader; import java.net.MalformedURLException; import java.nio.charset.Charset; import java.util.Collection; +import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; @@ -72,18 +73,21 @@ public class Vocabulary_p { if (discoveruri == null) discoverobjectspace = ""; Map table = new LinkedHashMap(); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); + final boolean isFacet = post.getBoolean("isFacet"); final boolean discoverNot = post.get("discovermethod", "").equals("none"); final boolean discoverFromPath = post.get("discovermethod", "").equals("path"); final boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv"); - final String discoverFromCSVPath = post.get("discoverpath", ""); + final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " "); final String discoverFromCSVCharset = post.get("charset", "UTF-8"); final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0); + final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1); final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1); final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? 
new File(discoverFromCSVPath) : null; - final boolean discoverenrichsynonyms = post.getBoolean("discoverenrichsynonyms"); + final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms"); + final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn"); Segment segment = sb.index; String t; if (!discoverNot) { @@ -91,23 +95,40 @@ public class Vocabulary_p { BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset)); String line = null; Pattern semicolon = Pattern.compile(";"); + Map synonym2literal = new HashMap<>(); // helper map to check if there are double synonyms while ((line = r.readLine()) != null) { if (line.length() == 0) continue; String[] l = semicolon.split(line); if (l.length == 0) l = new String[]{line}; String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral].trim(); if (literal == null) continue; - if (literal.length() > 0 && (literal.charAt(0) == '"' || literal.charAt(0) == '\'')) literal = literal.substring(1); - if (literal.length() > 0 && (literal.charAt(literal.length() - 1) == '"' || literal.charAt(literal.length() - 1) == '\'')) literal = literal.substring(0, literal.length() - 1); + literal = normalizeLiteral(literal); String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink].trim(); if (literal.length() > 0) { - String synonyms = Tagging.normalizeTerm(literal); + String synonyms = ""; if (discoverenrichsynonyms) { Set sy = SynonymLibrary.getSynonyms(literal); if (sy != null) { for (String s: sy) synonyms += "," + s; } + } else if (discoverreadcolumn) { + synonyms = discovercolumnsynonyms < 0 || l.length <= discovercolumnsynonyms ? 
null : l[discovercolumnsynonyms].trim(); + synonyms = normalizeLiteral(synonyms); + } else { + synonyms = Tagging.normalizeTerm(literal); } + // check double synonyms + if (synonyms.length() > 0) { + String oldliteral = synonym2literal.get(synonyms); + if (oldliteral != null) { + // replace old entry with combined new + table.remove(oldliteral); + String newliteral = oldliteral + "," + literal; + literal = newliteral; + } + synonym2literal.put(synonyms, literal); + } + // store term table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink)); } } @@ -160,6 +181,7 @@ public class Vocabulary_p { } } Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); + newvoc.setFacet(isFacet); LibraryProvider.autotagging.addVocabulary(newvoc); vocabularyName = discovername; vocabulary = newvoc; @@ -205,6 +227,11 @@ public class Vocabulary_p { vocabulary = null; vocabularyName = null; } + + // check the isFacet property + if (vocabulary != null && post.containsKey("isFacet")) { + vocabulary.setFacet(post.getBoolean("isFacet")); + } } } catch (final IOException e) { ConcurrentLog.logException(e); @@ -231,6 +258,7 @@ public class Vocabulary_p { prop.putHTML("edit_name", vocabulary.getName()); prop.putXML("edit_namexml", vocabulary.getName()); prop.putHTML("edit_namespace", vocabulary.getNamespace()); + prop.put("edit_isFacet", vocabulary.isFacet() ? 
1 : 0); prop.put("edit_size", vocabulary.size()); prop.putHTML("edit_predicate", vocabulary.getPredicate()); prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX); @@ -279,4 +307,11 @@ public class Vocabulary_p { // return rewrite properties return prop; } + + private static String normalizeLiteral(String literal) { + if (literal == null) return ""; + if (literal.length() > 0 && (literal.charAt(0) == '"' || literal.charAt(0) == '\'')) literal = literal.substring(1); + if (literal.length() > 0 && (literal.charAt(literal.length() - 1) == '"' || literal.charAt(literal.length() - 1) == '\'')) literal = literal.substring(0, literal.length() - 1); + return literal; + } } diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java index add14db10..be1fb87d6 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -53,6 +53,7 @@ public class Tagging { private final Map term2objectlink; private final Map> synonym2synonyms; private File propFile; + private boolean isFacet; // true if the vocabulary shall generate a navigation facet private String predicate, namespace, objectspace; @@ -99,6 +100,7 @@ public class Tagging { this.predicate = this.namespace + name; this.objectspace = null; this.propFile = null; + this.isFacet = true; } public Tagging(String name, File propFile) throws IOException { @@ -263,6 +265,14 @@ public class Tagging { } } + public boolean isFacet() { + return this.isFacet; + } + + public void setFacet(boolean isFacet) { + this.isFacet = isFacet; + } + public int size() { return this.term2objectlink.size(); } @@ -346,7 +356,7 @@ public class Tagging { public void setObjectspace(String os) throws IOException { if (this.propFile == null) return; - if (os == null || (this.objectspace != null && this.objectspace.equals(os))) return; + if (os == null || os.length() == 0 || (this.objectspace != null && this.objectspace.equals(os))) return; 
this.objectspace = os; File tmp = tmpFile(); BufferedWriter w = new BufferedWriter(new FileWriter(tmp)); diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 06fe6ef9e..0f5799daf 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -230,7 +230,11 @@ public final class QueryParams { // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield)) if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName()); } - for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); + for (Tagging v: LibraryProvider.autotagging.getVocabularies()) { + if (v.isFacet()) { + this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); + } + } this.maxfacets = defaultmaxfacets; this.cachedQuery = null; }