added new options to vocabulary editor:

- new switch 'isFacet' which enables or disables the use of the vocabulary for search facets. This should be disabled for large vocabularies, since searches in Solr are extremely slow if facets are generated for a large set of alternative terms
- new option to disable auto-enrichment from synonyms
- new option to add synonyms from another column when importing from csv
- automatically recognize synonyms that occur more than once and bundle the terms that share such synonyms
10 years ago · 7bfc5b80cb
parent 87b53b3572
commit 7bfc5b80cb
4 changed files with 72 additions and 14 deletions
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@ -103,12 +103,12 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
    <!--<form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" onkeyup="xmlhttpPost(); return false;">-->
    <form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" >
      <fieldset><legend>Vocabulary Production</legend>
-      It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub.
-      This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
-      This works best with wikis. Try to use a wiki url as objectspace path.
+      
      <dl>
-        <dt>Vocabulary Name<br><i>(this will become the name of a search facet)</i></dt>
+        <dt>Vocabulary Name</dt>
        <dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd>
+        <dt></dt>
+        <dd><input type="checkbox" name="isFacet" checked="checked"/> this shall be a search facet (disable this for large vocabularies!)</dd>
        <hr>
        <dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt>
        <dd></dd>
@ -119,23 +119,31 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
            from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt>
        <dd>
            <dl>
+            <dt></dt>
+            <dd>It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub.
+                This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
+                This works best with wikis. Try to use a wiki url as objectspace path.</dd>
            <dt><i>Objectspace</i></dt>
            <dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
            </dl>
        </dd>
-        <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
+        <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnsynonyms').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
        <dd>
            <dl>
            <dt><i>File Path</i></dt>
            <dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
            <dt><i>Column for Literals</i></dt>
            <dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
+            <dt><i>Synonyms</i></dt>
+            <dd><input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="none" checked="checked"/> <i>no Synonyms</i><br/>
+                <input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="enrichsynonyms"/> <i>Auto-Enrich with Synonyms from Stemming Library</i><br/>
+                <input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="readcolumn" onclick="document.getElementById('discovercolumnsynonyms').value=parseInt(document.getElementById('discovercolumnliteral').value)+1;"/> <i>Read Column</i><br/>
+                <input type="number" id="discovercolumnsynonyms" name="discovercolumnsynonyms" min="-1" max="99" step="1" size="2" value="-1" style="width:50px;"> (first has index 0)
+            </dd>
            <dt><i>Column for Object Link (optional)</i></dt>
            <dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
            <dt><i>Charset of Import File</i></dt>
            <dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
-            <dt><i>Auto-Enrich with Synonyms from Stemming Library</i></dt>
-            <dd><input type="checkbox" name="discoverenrichsynonyms" id="discoverenrichsynonyms" checked="checked" /></dd>
            </dl>
        </dd>
        <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
@ -155,6 +163,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
        <dt>Predicate</dt><dd>#[predicate]#</dd>
        <dt>Prefix</dt><dd>#[prefix]#</dd>
        <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
+        <dt>Is Facet?</dt><dd><input type="checkbox" name="isFacet"#(isFacet)#:: checked="checked"#(/isFacet)#/> (If checked, this vocabulary is used for search facets. Not feasible for large vocabularies!)</dd>
      </dl>
      <table class="sortable" border="0">
      <tr class="TableHeader" valign="bottom">
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -26,6 +26,7 @@ import java.io.InputStreamReader;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.Map;
@ -72,18 +73,21 @@ public class Vocabulary_p {
                    if (discoveruri == null) discoverobjectspace = "";
                    Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
                    File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
+                    final boolean isFacet = post.getBoolean("isFacet");
                    final boolean discoverNot = post.get("discovermethod", "").equals("none");
                    final boolean discoverFromPath = post.get("discovermethod", "").equals("path");
                    final boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
                    final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
                    final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
                    final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
-                    final String discoverFromCSVPath = post.get("discoverpath", "");
+                    final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
                    final String discoverFromCSVCharset = post.get("charset", "UTF-8");
                    final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
+                    final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
                    final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
                    final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
-                    final boolean discoverenrichsynonyms = post.getBoolean("discoverenrichsynonyms");
+                    final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms");
+                    final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn");
                    Segment segment = sb.index;
                    String t;
                    if (!discoverNot) {
@ -91,23 +95,40 @@ public class Vocabulary_p {
                            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
                            String line = null;
                            Pattern semicolon = Pattern.compile(";");
+                            Map<String, String> synonym2literal = new HashMap<>(); // helper map to check if there are double synonyms
                            while ((line = r.readLine()) != null) {
                                if (line.length() == 0) continue;
                                String[] l = semicolon.split(line);
                                if (l.length == 0) l = new String[]{line};
                                String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral].trim();
                                if (literal == null) continue;
-                                if (literal.length() > 0 && (literal.charAt(0) == '"' || literal.charAt(0) == '\'')) literal = literal.substring(1);
-                                if (literal.length() > 0 && (literal.charAt(literal.length() - 1) == '"' || literal.charAt(literal.length() - 1) == '\'')) literal = literal.substring(0, literal.length() - 1);
+                                literal = normalizeLiteral(literal);
                                String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink].trim();
                                if (literal.length() > 0) {
-                                    String synonyms = Tagging.normalizeTerm(literal);
+                                    String synonyms = "";
                                    if (discoverenrichsynonyms) {
                                        Set<String> sy = SynonymLibrary.getSynonyms(literal);
                                        if (sy != null) {
                                            for (String s: sy) synonyms += "," + s;
                                        }
+                                    } else if (discoverreadcolumn) {
+                                        synonyms = discovercolumnsynonyms < 0 || l.length <= discovercolumnsynonyms ? null : l[discovercolumnsynonyms].trim();
+                                        synonyms = normalizeLiteral(synonyms);
+                                    } else {
+                                        synonyms = Tagging.normalizeTerm(literal);
                                    }
+                                    // check double synonyms
+                                    if (synonyms.length() > 0) {
+                                        String oldliteral = synonym2literal.get(synonyms);
+                                        if (oldliteral != null) {
+                                            // replace old entry with combined new
+                                            table.remove(oldliteral);
+                                            String newliteral = oldliteral + "," + literal;
+                                            literal = newliteral;
+                                        }
+                                        synonym2literal.put(synonyms, literal);
+                                    }
+                                    // store term
                                    table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink));
                                }
                            }
@ -160,6 +181,7 @@ public class Vocabulary_p {
                        }
                    }
                    Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
+                    newvoc.setFacet(isFacet);
                    LibraryProvider.autotagging.addVocabulary(newvoc);
                    vocabularyName = discovername;
                    vocabulary = newvoc;
@ -205,6 +227,11 @@ public class Vocabulary_p {
                        vocabulary = null;
                        vocabularyName = null;
                    }
+                    
+                    // check the isFacet property
+                    if (vocabulary != null && post.containsKey("isFacet")) {
+                        vocabulary.setFacet(post.getBoolean("isFacet"));
+                    }
                }
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
@ -231,6 +258,7 @@ public class Vocabulary_p {
            prop.putHTML("edit_name", vocabulary.getName());
            prop.putXML("edit_namexml", vocabulary.getName());
            prop.putHTML("edit_namespace", vocabulary.getNamespace());
+            prop.put("edit_isFacet", vocabulary.isFacet() ? 1 : 0);
            prop.put("edit_size", vocabulary.size());
            prop.putHTML("edit_predicate", vocabulary.getPredicate());
            prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
@ -279,4 +307,11 @@ public class Vocabulary_p {
        // return rewrite properties
        return prop;
    }
+    
+    private static String normalizeLiteral(String literal) {
+        if (literal == null) return "";
+        if (literal.length() > 0 && (literal.charAt(0) == '"' || literal.charAt(0) == '\'')) literal = literal.substring(1);
+        if (literal.length() > 0 && (literal.charAt(literal.length() - 1) == '"' || literal.charAt(literal.length() - 1) == '\'')) literal = literal.substring(0, literal.length() - 1);
+        return literal;
+    }
 }
--- a/source/net/yacy/cora/lod/vocabulary/Tagging.java
+++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java
@ -53,6 +53,7 @@ public class Tagging {
    private final Map<String, String> term2objectlink;
    private final Map<String, Set<String>> synonym2synonyms;
    private File propFile;
+    private boolean isFacet; // true if the vocabulary shall generate a navigation facet

    private String predicate, namespace, objectspace;

@ -99,6 +100,7 @@ public class Tagging {
        this.predicate = this.namespace + name;
        this.objectspace = null;
        this.propFile = null;
+        this.isFacet = true;
    }

    public Tagging(String name, File propFile) throws IOException {
@ -263,6 +265,14 @@ public class Tagging {
        }
    }

+    public boolean isFacet() {
+        return this.isFacet;
+    }
+    
+    public void setFacet(boolean isFacet) {
+        this.isFacet = isFacet;
+    }
+    
    public int size() {
        return this.term2objectlink.size();
    }
@ -346,7 +356,7 @@ public class Tagging {

    public void setObjectspace(String os) throws IOException {
        if (this.propFile == null) return;
-        if (os == null || (this.objectspace != null && this.objectspace.equals(os))) return;
+        if (os == null || os.length() == 0 || (this.objectspace != null && this.objectspace.equals(os))) return;
        this.objectspace = os;
        File tmp = tmpFile();
        BufferedWriter w = new BufferedWriter(new FileWriter(tmp));
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@ -230,7 +230,11 @@ public final class QueryParams {
            // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield))
            if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName());
        }
-        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
+        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
+            if (v.isFacet()) {
+                this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
+            }
+        }
        this.maxfacets = defaultmaxfacets;
        this.cachedQuery = null;
    }