Added an option in the vocabulary editor to import CSV files with different

encodings (a Windows-type character encoding, which is typical for CSV files, is preselected). Also fixed other problems with character encoding in dictionary files. Automatically generated vocabularies are now also recorded in the API steering.
11 years ago · ec9d021568
parent b558433211
commit ec9d021568
3 changed files with 90 additions and 46 deletions
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@ -90,7 +90,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
        <dd>
          <select name="vocabulary" onchange="this.form.submit()">
            #{vocabularyset}#
-            <option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>
+            <option value="#[name]#"#(selected)#:: selected="selected"#(/selected)#>#[name]#</option>
            #{/vocabularyset}#
          </select>
        </dd>
@ -107,17 +107,35 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
      This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
      This works best with wikis. Try to use a wiki url as objectspace path.
      <dl>
-        <dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
-        <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" />
-            <div id="searchresults"></div></dd>
-        <dt>Discover Terms:</dt>
+        <dt>Vocabulary Name<br><i>(this will become the name of a search facet)</i></dt>
+        <dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd>
+        <hr>
+        <dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt>
+        <dd></dd>
+        <dt>Auto-Discover&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</br>
+            from file name&nbsp;<input type="radio" name="discovermethod" value="path" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
+            from page title&nbsp;<input type="radio" name="discovermethod" value="title" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
+            from page title (splitted)&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
+            from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt>
        <dd>
-        <input type="radio" name="discovermethod" value="none" checked="checked" /> no auto-discovery (empty vocabulary)&nbsp;&nbsp;
-        <input type="radio" name="discovermethod" value="path" /> from file name&nbsp;&nbsp;
-        <input type="radio" name="discovermethod" value="title" /> from page title&nbsp;&nbsp;
-        <input type="radio" name="discovermethod" value="titlesplitted" /> from page title (splitted)&nbsp;&nbsp;
-        <input type="radio" name="discovermethod" value="author" /> from page author</dd>
-        <input type="radio" name="discovermethod" value="csv" /> from a csv file</dd>
+            <dl>
+            <dt><i>Objectspace</i></dt>
+            <dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
+            </dl>
+        </dd>
+        <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
+        <dd>
+            <dl>
+            <dt><i>File Path</i></dt>
+            <dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
+            <dt><i>Column for Literals</i></dt>
+            <dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
+            <dt><i>Column for Object Link (optional)</i></dt>
+            <dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
+            <dt><i>Charset of Import File</i></dt>
+            <dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
+            </dl>
+        </dd>
        <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
      </dl>
    </fieldset>
@ -136,12 +154,6 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
        <dt>Prefix</dt><dd>#[prefix]#</dd>
        <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
      </dl>
-      <p>This produces the following triples in the triplestore if a term or synonym matches in a document:</p>
-      <dl>
-        <dt>Triple #1</dt><dd>#[triple1]#</dd>
-        <dt>Triple #2</dt><dd>#[triple2]#</dd>
-        <dt>more Triples for linking into objectspace</dt><dd>#[tripleN]#</dd>
-      </dl>
      <table class="sortable" border="0">
      <tr class="TableHeader" valign="bottom">
        <td>Modify</td>
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -18,23 +18,26 @@
 *  If not, see <http://www.gnu.org/licenses/>.
 */

+import java.io.BufferedReader;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.net.MalformedURLException;
+import java.nio.charset.Charset;
 import java.util.Collection;
 import java.util.Iterator;
+import java.util.LinkedHashMap;
 import java.util.Map;
-import java.util.TreeMap;

 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.lod.vocabulary.DCTerms;
-import net.yacy.cora.lod.vocabulary.Owl;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
-import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.data.WorkTables;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.Switchboard;
@ -55,23 +58,43 @@ public class Vocabulary_p {
        if (vocabulary == null) vocabularyName = null;
        if (post != null) {
            try {
-                if (vocabulary == null) {
-                    // create a vocabulary
-                    if (discovername != null && discovername.length() > 0) {
-                        String discoverobjectspace = post.get("discoverobjectspace", "");
-                        MultiProtocolURL discoveruri = null;
-                        if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
-                        if (discoveruri == null) discoverobjectspace = "";
-                        Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
-                        File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
-                        boolean discoverNot = post.get("discovermethod", "").equals("none");
-                        boolean discoverFromPath = post.get("discovermethod", "").equals("path");
-                        boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
-                        boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
-                        boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
-                        Segment segment = sb.index;
-                        String t;
-                        if (!discoverNot) {
+                // create a vocabulary
+                if (vocabulary == null && discovername != null && discovername.length() > 0) {
+                    // store this call as api call
+                    sb.tables.recordAPICall(post, "Vocabulary_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "vocabulary creation for " + discovername);
+                    // get details of creation
+                    String discoverobjectspace = post.get("discoverobjectspace", "");
+                    MultiProtocolURL discoveruri = null;
+                    if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
+                    if (discoveruri == null) discoverobjectspace = "";
+                    Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
+                    File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
+                    final boolean discoverNot = post.get("discovermethod", "").equals("none");
+                    final boolean discoverFromPath = post.get("discovermethod", "").equals("path");
+                    final boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
+                    final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
+                    final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
+                    final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
+                    final String discoverFromCSVPath = post.get("discoverpath", "");
+                    final String discoverFromCSVCharset = post.get("charset", "UTF-8");
+                    final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
+                    final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
+                    final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
+                    Segment segment = sb.index;
+                    String t;
+                    if (!discoverNot) {
+                        if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
+                            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
+                            String line = null;
+                            while ((line = r.readLine()) != null) {
+                                String[] l = line.split(";");
+                                String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral];
+                                String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink];
+                                if (literal != null && literal.length() > 0) {
+                                    table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink));
+                                }
+                            }
+                        } else {
                            Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
                            while (ui.hasNext()) {
                                DigestURL u = ui.next();
@ -118,11 +141,11 @@ public class Vocabulary_p {
                                }
                            }
                        }
-                        Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
-                        LibraryProvider.autotagging.addVocabulary(newvoc);
-                        vocabularyName = discovername;
-                        vocabulary = newvoc;
                    }
+                    Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
+                    LibraryProvider.autotagging.addVocabulary(newvoc);
+                    vocabularyName = discovername;
+                    vocabulary = newvoc;
                } else {
                    // check if objectspace was set
                    vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()));
@ -186,7 +209,6 @@ public class Vocabulary_p {
        } else {
            prop.put("edit", 1);
            boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists();
-            String yacyurl = YaCyMetadata.hashURI("[hash]".getBytes());
            prop.put("edit_editable", editable ? 1 : 0);
            prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : "");
            prop.putHTML("edit_name", vocabulary.getName());
@ -197,9 +219,6 @@ public class Vocabulary_p {
            prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
            prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace());
            prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate());
-            prop.putXML("edit_triple1", "<" + yacyurl + "> <" + vocabulary.getPredicate() + "> \"[discovered-tags-commaseparated]\"");
-            prop.putXML("edit_triple2", "<" + yacyurl + "> <" + Owl.SameAs.getPredicate() + "> <[document-url]>");
-            prop.putXML("edit_tripleN", vocabulary.getObjectspace() == null ? "none - missing objectspace" : "<" + yacyurl + "> <" + DCTerms.references.getPredicate() + "> \"[object-link]#[tag]\" .");
            int c = 0;
            boolean dark = false;
            int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length();
@ -231,6 +250,15 @@ public class Vocabulary_p {

        }

+        // make charset list for import method selector
+        int c = 0;
+        for (String cs: Charset.availableCharsets().keySet()) {
+            prop.putHTML("create_charset_" + c + "_name", cs);
+            prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
+            c++;
+        }
+        prop.put("create_charset", c);
+        
        // return rewrite properties
        return prop;
    }
--- a/source/net/yacy/cora/lod/vocabulary/Tagging.java
+++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java
@ -22,8 +22,10 @@ package net.yacy.cora.lod.vocabulary;

 import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.FileWriter;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.Map;
@ -34,6 +36,7 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;

+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.geo.GeoLocation;
 import net.yacy.cora.geo.Locations;
 import net.yacy.cora.storage.Files;
@ -158,7 +161,8 @@ public class Tagging {
 			    }
 			}
        } else {
-	        BufferedWriter w = new BufferedWriter(new FileWriter(propFile));
+            //
+            BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(propFile), UTF8.charset.name()));
 	        if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n");
 	        for (Map.Entry<String, SOTuple> e: table.entrySet()) {
 	            String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();