Added an option in the vocabulary editor to import CSV files with different encodings

A Windows-type character encoding, which is typical for CSV files, is preselected. Also fixed other problems with character encoding in dictionary files. Automatically generated vocabularies are now also recorded in the API steering.
10 years ago · ec9d021568
parent b558433211
commit ec9d021568
3 changed files with 90 additions and 46 deletions
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@ -107,17 +107,35 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
      This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
      This works best with wikis. Try to use a wiki url as objectspace path.
      <dl>
-        <dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
+        <dt>Vocabulary Name<br><i>(this will become the name of a search facet)</i></dt>
-        <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" />
+        <dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd>
-            <div id="searchresults"></div></dd>
+        <hr>
-        <dt>Discover Terms:</dt>
+        <dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt>
        <dd></dd>
        <dt>Auto-Discover&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</br>
            from file name&nbsp;<input type="radio" name="discovermethod" value="path" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
            from page title&nbsp;<input type="radio" name="discovermethod" value="title" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
            from page title (splitted)&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
            from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt>
        <dd>
-        <input type="radio" name="discovermethod" value="none" checked="checked" /> no auto-discovery (empty vocabulary)&nbsp;&nbsp;
+            <dl>
-        <input type="radio" name="discovermethod" value="path" /> from file name&nbsp;&nbsp;
+            <dt><i>Objectspace</i></dt>
-        <input type="radio" name="discovermethod" value="title" /> from page title&nbsp;&nbsp;
+            <dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
-        <input type="radio" name="discovermethod" value="titlesplitted" /> from page title (splitted)&nbsp;&nbsp;
+            </dl>
-        <input type="radio" name="discovermethod" value="author" /> from page author</dd>
+        </dd>
-        <input type="radio" name="discovermethod" value="csv" /> from a csv file</dd>
+        <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
        <dd>
            <dl>
            <dt><i>File Path</i></dt>
            <dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
            <dt><i>Column for Literals</i></dt>
            <dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
            <dt><i>Column for Object Link (optional)</i></dt>
            <dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
            <dt><i>Charset of Import File</i></dt>
            <dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
            </dl>
        </dd>
        <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
      </dl>
    </fieldset>
@ -136,12 +154,6 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
        <dt>Prefix</dt><dd>#[prefix]#</dd>
        <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
      </dl>
      <p>This produces the following triples in the triplestore if a term or synonym matches in a document:</p>
      <dl>
        <dt>Triple #1</dt><dd>#[triple1]#</dd>
        <dt>Triple #2</dt><dd>#[triple2]#</dd>
        <dt>more Triples for linking into objectspace</dt><dd>#[tripleN]#</dd>
      </dl>
      <table class="sortable" border="0">
      <tr class="TableHeader" valign="bottom">
        <td>Modify</td>
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -18,23 +18,26 @@
 *  If not, see <http://www.gnu.org/licenses/>.
 */
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.TreeMap;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.lod.vocabulary.DCTerms;
 import net.yacy.cora.lod.vocabulary.Owl;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
 import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.data.WorkTables;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.Switchboard;
@ -55,23 +58,43 @@ public class Vocabulary_p {
        if (vocabulary == null) vocabularyName = null;
        if (post != null) {
            try {
                if (vocabulary == null) {
                // create a vocabulary
-                    if (discovername != null && discovername.length() > 0) {
+                if (vocabulary == null && discovername != null && discovername.length() > 0) {
                    // store this call as api call
                    sb.tables.recordAPICall(post, "Vocabulary_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "vocabulary creation for " + discovername);
                    // get details of creation
                    String discoverobjectspace = post.get("discoverobjectspace", "");
                    MultiProtocolURL discoveruri = null;
                    if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
                    if (discoveruri == null) discoverobjectspace = "";
-                        Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
+                    Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
                    File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
-                        boolean discoverNot = post.get("discovermethod", "").equals("none");
+                    final boolean discoverNot = post.get("discovermethod", "").equals("none");
-                        boolean discoverFromPath = post.get("discovermethod", "").equals("path");
+                    final boolean discoverFromPath = post.get("discovermethod", "").equals("path");
-                        boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
+                    final boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
-                        boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
+                    final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
-                        boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
+                    final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
                    final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
                    final String discoverFromCSVPath = post.get("discoverpath", "");
                    final String discoverFromCSVCharset = post.get("charset", "UTF-8");
                    final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
                    final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
                    final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
                    Segment segment = sb.index;
                    String t;
                    if (!discoverNot) {
                        if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
                            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
                            String line = null;
                            while ((line = r.readLine()) != null) {
                                String[] l = line.split(";");
                                String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral];
                                String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink];
                                if (literal != null && literal.length() > 0) {
                                    table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink));
                                }
                            }
                        } else {
                            Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
                            while (ui.hasNext()) {
                                DigestURL u = ui.next();
@ -118,11 +141,11 @@ public class Vocabulary_p {
                                }
                            }
                        }
                    }
                    Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
                    LibraryProvider.autotagging.addVocabulary(newvoc);
                    vocabularyName = discovername;
                    vocabulary = newvoc;
                    }
                } else {
                    // check if objectspace was set
                    vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()));
@ -186,7 +209,6 @@ public class Vocabulary_p {
        } else {
            prop.put("edit", 1);
            boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists();
            String yacyurl = YaCyMetadata.hashURI("[hash]".getBytes());
            prop.put("edit_editable", editable ? 1 : 0);
            prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : "");
            prop.putHTML("edit_name", vocabulary.getName());
@ -197,9 +219,6 @@ public class Vocabulary_p {
            prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
            prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace());
            prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate());
            prop.putXML("edit_triple1", "<" + yacyurl + "> <" + vocabulary.getPredicate() + "> \"[discovered-tags-commaseparated]\"");
            prop.putXML("edit_triple2", "<" + yacyurl + "> <" + Owl.SameAs.getPredicate() + "> <[document-url]>");
            prop.putXML("edit_tripleN", vocabulary.getObjectspace() == null ? "none - missing objectspace" : "<" + yacyurl + "> <" + DCTerms.references.getPredicate() + "> \"[object-link]#[tag]\" .");
            int c = 0;
            boolean dark = false;
            int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length();
@ -231,6 +250,15 @@ public class Vocabulary_p {
        }
        // make charset list for import method selector
        int c = 0;
        for (String cs: Charset.availableCharsets().keySet()) {
            prop.putHTML("create_charset_" + c + "_name", cs);
            prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
            c++;
        }
        prop.put("create_charset", c);
        // return rewrite properties
        return prop;
    }
--- a/source/net/yacy/cora/lod/vocabulary/Tagging.java
+++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java
@ -22,8 +22,10 @@ package net.yacy.cora.lod.vocabulary;
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.Map;
@ -34,6 +36,7 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.geo.GeoLocation;
 import net.yacy.cora.geo.Locations;
 import net.yacy.cora.storage.Files;
@ -158,7 +161,8 @@ public class Tagging {
 			    }
 			}
        } else {
-	        BufferedWriter w = new BufferedWriter(new FileWriter(propFile));
+            //
            BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(propFile), UTF8.charset.name()));
 	        if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n");
 	        for (Map.Entry<String, SOTuple> e: table.entrySet()) {
 	            String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();