added option in vocabulary editor to import CSV files with different

encodings (preselected Windows-type character encoding, which is typical for CSV files). Also fixed other problems with character encoding in dictionary files. Automatically generated vocabularies are now also noted in the API steering.
10 years ago · ec9d021568
parent b558433211
commit ec9d021568
3 changed files with 90 additions and 46 deletions
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@ -90,7 +90,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
        <dd>
          <select name="vocabulary" onchange="this.form.submit()">
            #{vocabularyset}#
-            <option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>
+            <option value="#[name]#"#(selected)#:: selected="selected"#(/selected)#>#[name]#</option>
            #{/vocabularyset}#
          </select>
        </dd>
@ -107,17 +107,35 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
      This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
      This works best with wikis. Try to use a wiki url as objectspace path.
      <dl>
-        <dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
+        <dt>Vocabulary Name<br><i>(this will become the name of a search facet)</i></dt>
-        <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" />
+        <dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd>
-            <div id="searchresults"></div></dd>
+        <hr>
-        <dt>Discover Terms:</dt>
+        <dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt>
        <dd></dd>
        <dt>Auto-Discover&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</br>
            from file name&nbsp;<input type="radio" name="discovermethod" value="path" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
            from page title&nbsp;<input type="radio" name="discovermethod" value="title" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
            from page title (splitted)&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
            from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt>
        <dd>
-        <input type="radio" name="discovermethod" value="none" checked="checked" /> no auto-discovery (empty vocabulary)&nbsp;&nbsp;
+            <dl>
-        <input type="radio" name="discovermethod" value="path" /> from file name&nbsp;&nbsp;
+            <dt><i>Objectspace</i></dt>
-        <input type="radio" name="discovermethod" value="title" /> from page title&nbsp;&nbsp;
+            <dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
-        <input type="radio" name="discovermethod" value="titlesplitted" /> from page title (splitted)&nbsp;&nbsp;
+            </dl>
-        <input type="radio" name="discovermethod" value="author" /> from page author</dd>
+        </dd>
-        <input type="radio" name="discovermethod" value="csv" /> from a csv file</dd>
+        <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
        <dd>
            <dl>
            <dt><i>File Path</i></dt>
            <dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
            <dt><i>Column for Literals</i></dt>
            <dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
            <dt><i>Column for Object Link (optional)</i></dt>
            <dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
            <dt><i>Charset of Import File</i></dt>
            <dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
            </dl>
        </dd>
        <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
      </dl>
    </fieldset>
@ -136,12 +154,6 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
        <dt>Prefix</dt><dd>#[prefix]#</dd>
        <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
      </dl>
      <p>This produces the following triples in the triplestore if a term or synonym matches in a document:</p>
      <dl>
        <dt>Triple #1</dt><dd>#[triple1]#</dd>
        <dt>Triple #2</dt><dd>#[triple2]#</dd>
        <dt>more Triples for linking into objectspace</dt><dd>#[tripleN]#</dd>
      </dl>
      <table class="sortable" border="0">
      <tr class="TableHeader" valign="bottom">
        <td>Modify</td>
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -18,23 +18,26 @@
 *  If not, see <http://www.gnu.org/licenses/>.
 */
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.TreeMap;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.lod.vocabulary.DCTerms;
 import net.yacy.cora.lod.vocabulary.Owl;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
 import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.data.WorkTables;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.Switchboard;
@ -55,23 +58,43 @@ public class Vocabulary_p {
        if (vocabulary == null) vocabularyName = null;
        if (post != null) {
            try {
-                if (vocabulary == null) {
+                // create a vocabulary
-                    // create a vocabulary
+                if (vocabulary == null && discovername != null && discovername.length() > 0) {
-                    if (discovername != null && discovername.length() > 0) {
+                    // store this call as api call
-                        String discoverobjectspace = post.get("discoverobjectspace", "");
+                    sb.tables.recordAPICall(post, "Vocabulary_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "vocabulary creation for " + discovername);
-                        MultiProtocolURL discoveruri = null;
+                    // get details of creation
-                        if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
+                    String discoverobjectspace = post.get("discoverobjectspace", "");
-                        if (discoveruri == null) discoverobjectspace = "";
+                    MultiProtocolURL discoveruri = null;
-                        Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
+                    if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
-                        File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
+                    if (discoveruri == null) discoverobjectspace = "";
-                        boolean discoverNot = post.get("discovermethod", "").equals("none");
+                    Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
-                        boolean discoverFromPath = post.get("discovermethod", "").equals("path");
+                    File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
-                        boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
+                    final boolean discoverNot = post.get("discovermethod", "").equals("none");
-                        boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
+                    final boolean discoverFromPath = post.get("discovermethod", "").equals("path");
-                        boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
+                    final boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
-                        Segment segment = sb.index;
+                    final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
-                        String t;
+                    final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
-                        if (!discoverNot) {
+                    final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
                    final String discoverFromCSVPath = post.get("discoverpath", "");
                    final String discoverFromCSVCharset = post.get("charset", "UTF-8");
                    final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
                    final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
                    final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
                    Segment segment = sb.index;
                    String t;
                    if (!discoverNot) {
                        if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
                            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
                            String line = null;
                            while ((line = r.readLine()) != null) {
                                String[] l = line.split(";");
                                String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral];
                                String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink];
                                if (literal != null && literal.length() > 0) {
                                    table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink));
                                }
                            }
                        } else {
                            Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
                            while (ui.hasNext()) {
                                DigestURL u = ui.next();
@ -118,11 +141,11 @@ public class Vocabulary_p {
                                }
                            }
                        }
                        Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
                        LibraryProvider.autotagging.addVocabulary(newvoc);
                        vocabularyName = discovername;
                        vocabulary = newvoc;
                    }
                    Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
                    LibraryProvider.autotagging.addVocabulary(newvoc);
                    vocabularyName = discovername;
                    vocabulary = newvoc;
                } else {
                    // check if objectspace was set
                    vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()));
@ -186,7 +209,6 @@ public class Vocabulary_p {
        } else {
            prop.put("edit", 1);
            boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists();
            String yacyurl = YaCyMetadata.hashURI("[hash]".getBytes());
            prop.put("edit_editable", editable ? 1 : 0);
            prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : "");
            prop.putHTML("edit_name", vocabulary.getName());
@ -197,9 +219,6 @@ public class Vocabulary_p {
            prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
            prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace());
            prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate());
            prop.putXML("edit_triple1", "<" + yacyurl + "> <" + vocabulary.getPredicate() + "> \"[discovered-tags-commaseparated]\"");
            prop.putXML("edit_triple2", "<" + yacyurl + "> <" + Owl.SameAs.getPredicate() + "> <[document-url]>");
            prop.putXML("edit_tripleN", vocabulary.getObjectspace() == null ? "none - missing objectspace" : "<" + yacyurl + "> <" + DCTerms.references.getPredicate() + "> \"[object-link]#[tag]\" .");
            int c = 0;
            boolean dark = false;
            int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length();
@ -231,6 +250,15 @@ public class Vocabulary_p {
        }
        // make charset list for import method selector
        int c = 0;
        for (String cs: Charset.availableCharsets().keySet()) {
            prop.putHTML("create_charset_" + c + "_name", cs);
            prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
            c++;
        }
        prop.put("create_charset", c);
        // return rewrite properties
        return prop;
    }
--- a/source/net/yacy/cora/lod/vocabulary/Tagging.java
+++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java
@ -22,8 +22,10 @@ package net.yacy.cora.lod.vocabulary;
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.Map;
@ -34,6 +36,7 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.geo.GeoLocation;
 import net.yacy.cora.geo.Locations;
 import net.yacy.cora.storage.Files;
@ -158,7 +161,8 @@ public class Tagging {
 			    }
 			}
        } else {
-	        BufferedWriter w = new BufferedWriter(new FileWriter(propFile));
+            //
            BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(propFile), UTF8.charset.name()));
 	        if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n");
 	        for (Map.Entry<String, SOTuple> e: table.entrySet()) {
 	            String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();