From ec9d02156852a6664d00ff1fa9f4c32f5f2f2c72 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Mon, 17 Nov 2014 14:22:40 +0100
Subject: [PATCH] added an option in the vocabulary editor to import CSV files
 with different encodings (the windows-type character encoding, which is
 typical for CSV files, is preselected). Also fixed other problems with
 character encoding in dictionary files. Automatically generated vocabularies
 are now also recorded in the API steering.

---
 htroot/Vocabulary_p.html                      | 46 ++++++----
 htroot/Vocabulary_p.java                      | 84 ++++++++++++-------
 .../net/yacy/cora/lod/vocabulary/Tagging.java |  6 +-
 3 files changed, 90 insertions(+), 46 deletions(-)
diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html
index 44ca4dbde..d4418c2f0 100644
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@@ -90,7 +90,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
         <dd>
           <select name="vocabulary" onchange="this.form.submit()">
             #{vocabularyset}#
-            <option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>
+            <option value="#[name]#"#(selected)#:: selected="selected"#(/selected)#>#[name]#</option>
             #{/vocabularyset}#
           </select>
         </dd>
@@ -107,17 +107,35 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
       This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
       This works best with wikis. Try to use a wiki url as objectspace path.
       <dl>
-        <dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
-        <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" />
-            <div id="searchresults"></div></dd>
-        <dt>Discover Terms:</dt>
+        <dt>Vocabulary Name<br><i>(this will become the name of a search facet)</i></dt>
+        <dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd>
+        <hr>
+        <dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt>
+        <dd></dd>
+        <dt>Auto-Discover&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br/>
+            from file name&nbsp;<input type="radio" name="discovermethod" value="path" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
+            from page title&nbsp;<input type="radio" name="discovermethod" value="title" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
+            from page title (splitted)&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
+            from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt>
         <dd>
-        <input type="radio" name="discovermethod" value="none" checked="checked" /> no auto-discovery (empty vocabulary)&nbsp;&nbsp;
-        <input type="radio" name="discovermethod" value="path" /> from file name&nbsp;&nbsp;
-        <input type="radio" name="discovermethod" value="title" /> from page title&nbsp;&nbsp;
-        <input type="radio" name="discovermethod" value="titlesplitted" /> from page title (splitted)&nbsp;&nbsp;
-        <input type="radio" name="discovermethod" value="author" /> from page author</dd>
-        <input type="radio" name="discovermethod" value="csv" /> from a csv file</dd>
+            <dl>
+            <dt><i>Objectspace</i></dt>
+            <dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
+            </dl>
+        </dd>
+        <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
+        <dd>
+            <dl>
+            <dt><i>File Path</i></dt>
+            <dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
+            <dt><i>Column for Literals</i></dt>
+            <dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
+            <dt><i>Column for Object Link (optional)</i></dt>
+            <dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
+            <dt><i>Charset of Import File</i></dt>
+            <dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
+            </dl>
+        </dd>
         <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
       </dl>
     </fieldset>
@@ -136,12 +154,6 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
         <dt>Prefix</dt><dd>#[prefix]#</dd>
         <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
       </dl>
-      <p>This produces the following triples in the triplestore if a term or synonym matches in a document:</p>
-      <dl>
-        <dt>Triple #1</dt><dd>#[triple1]#</dd>
-        <dt>Triple #2</dt><dd>#[triple2]#</dd>
-        <dt>more Triples for linking into objectspace</dt><dd>#[tripleN]#</dd>
-      </dl>
       <table class="sortable" border="0">
       <tr class="TableHeader" valign="bottom">
         <td>Modify</td>
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java
index dccffef60..7abd40f1b 100644
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@@ -18,23 +18,26 @@
  *  If not, see <http://www.gnu.org/licenses/>.
  */
 
+import java.io.BufferedReader;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.net.MalformedURLException;
+import java.nio.charset.Charset;
 import java.util.Collection;
 import java.util.Iterator;
+import java.util.LinkedHashMap;
 import java.util.Map;
-import java.util.TreeMap;
 
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.lod.vocabulary.DCTerms;
-import net.yacy.cora.lod.vocabulary.Owl;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
-import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.data.WorkTables;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.Switchboard;
@@ -55,23 +58,43 @@ public class Vocabulary_p {
         if (vocabulary == null) vocabularyName = null;
         if (post != null) {
             try {
-                if (vocabulary == null) {
-                    // create a vocabulary
-                    if (discovername != null && discovername.length() > 0) {
-                        String discoverobjectspace = post.get("discoverobjectspace", "");
-                        MultiProtocolURL discoveruri = null;
-                        if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
-                        if (discoveruri == null) discoverobjectspace = "";
-                        Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
-                        File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
-                        boolean discoverNot = post.get("discovermethod", "").equals("none");
-                        boolean discoverFromPath = post.get("discovermethod", "").equals("path");
-                        boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
-                        boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
-                        boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
-                        Segment segment = sb.index;
-                        String t;
-                        if (!discoverNot) {
+                // create a vocabulary
+                if (vocabulary == null && discovername != null && discovername.length() > 0) {
+                    // store this call as api call
+                    sb.tables.recordAPICall(post, "Vocabulary_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "vocabulary creation for " + discovername);
+                    // get details of creation
+                    String discoverobjectspace = post.get("discoverobjectspace", "");
+                    MultiProtocolURL discoveruri = null;
+                    if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
+                    if (discoveruri == null) discoverobjectspace = "";
+                    Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
+                    File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
+                    final boolean discoverNot = post.get("discovermethod", "").equals("none");
+                    final boolean discoverFromPath = post.get("discovermethod", "").equals("path");
+                    final boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
+                    final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
+                    final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
+                    final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
+                    final String discoverFromCSVPath = post.get("discoverpath", "");
+                    final String discoverFromCSVCharset = post.get("charset", "UTF-8");
+                    final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
+                    final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
+                    final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
+                    Segment segment = sb.index;
+                    String t;
+                    if (!discoverNot) {
+                        if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
+                            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
+                            String line = null;
+                            while ((line = r.readLine()) != null) {
+                                String[] l = line.split(";");
+                                String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral];
+                                String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink];
+                                if (literal != null && literal.length() > 0) {
+                                    table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink));
+                                }
+                            }
+                        } else {
                             Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
                             while (ui.hasNext()) {
                                 DigestURL u = ui.next();
@@ -118,11 +141,11 @@ public class Vocabulary_p {
                                 }
                             }
                         }
-                        Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
-                        LibraryProvider.autotagging.addVocabulary(newvoc);
-                        vocabularyName = discovername;
-                        vocabulary = newvoc;
                     }
+                    Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
+                    LibraryProvider.autotagging.addVocabulary(newvoc);
+                    vocabularyName = discovername;
+                    vocabulary = newvoc;
                 } else {
                     // check if objectspace was set
                     vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()));
@@ -186,7 +209,6 @@ public class Vocabulary_p {
         } else {
             prop.put("edit", 1);
             boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists();
-            String yacyurl = YaCyMetadata.hashURI("[hash]".getBytes());
             prop.put("edit_editable", editable ? 1 : 0);
             prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : "");
             prop.putHTML("edit_name", vocabulary.getName());
@@ -197,9 +219,6 @@ public class Vocabulary_p {
             prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
             prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace());
             prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate());
-            prop.putXML("edit_triple1", "<" + yacyurl + "> <" + vocabulary.getPredicate() + "> \"[discovered-tags-commaseparated]\"");
-            prop.putXML("edit_triple2", "<" + yacyurl + "> <" + Owl.SameAs.getPredicate() + "> <[document-url]>");
-            prop.putXML("edit_tripleN", vocabulary.getObjectspace() == null ? "none - missing objectspace" : "<" + yacyurl + "> <" + DCTerms.references.getPredicate() + "> \"[object-link]#[tag]\" .");
             int c = 0;
             boolean dark = false;
             int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length();
@@ -231,6 +250,15 @@ public class Vocabulary_p {
 
         }
 
+        // make charset list for import method selector
+        int c = 0;
+        for (String cs: Charset.availableCharsets().keySet()) {
+            prop.putHTML("create_charset_" + c + "_name", cs);
+            prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
+            c++;
+        }
+        prop.put("create_charset", c);
+        
         // return rewrite properties
         return prop;
     }
diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java
index 6d298743c..add14db10 100644
--- a/source/net/yacy/cora/lod/vocabulary/Tagging.java
+++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java
@@ -22,8 +22,10 @@ package net.yacy.cora.lod.vocabulary;
 
 import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.FileWriter;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.Map;
@@ -34,6 +36,7 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 
+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.geo.GeoLocation;
 import net.yacy.cora.geo.Locations;
 import net.yacy.cora.storage.Files;
@@ -158,7 +161,8 @@ public class Tagging {
 			    }
 			}
         } else {
-	        BufferedWriter w = new BufferedWriter(new FileWriter(propFile));
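+            // write the vocabulary file explicitly as UTF-8; a plain FileWriter would use the platform default encoding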
+            BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(propFile), UTF8.charset.name()));
 	        if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n");
 	        for (Map.Entry<String, SOTuple> e: table.entrySet()) {
 	            String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();