Added an option in the vocabulary editor to import CSV files with different
character encodings (the windows-1252 character encoding, which is typical
for CSV files, is preselected). Also fixed other problems with character
encoding in dictionary files. Automatically generated vocabularies are now
also noted in the API steering.
pull/1/head
Michael Peter Christen 10 years ago
parent b558433211
commit ec9d021568

@ -90,7 +90,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<dd> <dd>
<select name="vocabulary" onchange="this.form.submit()"> <select name="vocabulary" onchange="this.form.submit()">
#{vocabularyset}# #{vocabularyset}#
<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option> <option value="#[name]#"#(selected)#:: selected="selected"#(/selected)#>#[name]#</option>
#{/vocabularyset}# #{/vocabularyset}#
</select> </select>
</dd> </dd>
@ -107,17 +107,35 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term. This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
This works best with wikis. Try to use a wiki url as objectspace path. This works best with wikis. Try to use a wiki url as objectspace path.
<dl> <dl>
<dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd> <dt>Vocabulary Name<br><i>(this will become the name of a search facet)</i></dt>
<dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" /> <dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd>
<div id="searchresults"></div></dd> <hr>
<dt>Discover Terms:</dt> <dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt>
<dd></dd>
<dt>Auto-Discover&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</br>
from file name&nbsp;<input type="radio" name="discovermethod" value="path" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
from page title&nbsp;<input type="radio" name="discovermethod" value="title" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
from page title (splitted)&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt>
<dd> <dd>
<input type="radio" name="discovermethod" value="none" checked="checked" /> no auto-discovery (empty vocabulary)&nbsp;&nbsp; <dl>
<input type="radio" name="discovermethod" value="path" /> from file name&nbsp;&nbsp; <dt><i>Objectspace</i></dt>
<input type="radio" name="discovermethod" value="title" /> from page title&nbsp;&nbsp; <dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
<input type="radio" name="discovermethod" value="titlesplitted" /> from page title (splitted)&nbsp;&nbsp; </dl>
<input type="radio" name="discovermethod" value="author" /> from page author</dd> </dd>
<input type="radio" name="discovermethod" value="csv" /> from a csv file</dd> <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
<dd>
<dl>
<dt><i>File Path</i></dt>
<dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
<dt><i>Column for Literals</i></dt>
<dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
<dt><i>Column for Object Link (optional)</i></dt>
<dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
<dt><i>Charset of Import File</i></dt>
<dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
</dl>
</dd>
<dt></dt><dd><input type="submit" name="create" value="Create" /></dd> <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
</dl> </dl>
</fieldset> </fieldset>
@ -136,12 +154,6 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<dt>Prefix</dt><dd>#[prefix]#</dd> <dt>Prefix</dt><dd>#[prefix]#</dd>
<dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd> <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
</dl> </dl>
<p>This produces the following triples in the triplestore if a term or synonym matches in a document:</p>
<dl>
<dt>Triple #1</dt><dd>#[triple1]#</dd>
<dt>Triple #2</dt><dd>#[triple2]#</dd>
<dt>more Triples for linking into objectspace</dt><dd>#[tripleN]#</dd>
</dl>
<table class="sortable" border="0"> <table class="sortable" border="0">
<tr class="TableHeader" valign="bottom"> <tr class="TableHeader" valign="bottom">
<td>Modify</td> <td>Modify</td>

@ -18,23 +18,26 @@
* If not, see <http://www.gnu.org/licenses/>. * If not, see <http://www.gnu.org/licenses/>.
*/ */
import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.Collection; import java.util.Collection;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.DCTerms;
import net.yacy.cora.lod.vocabulary.Owl;
import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.Tagging.SOTuple; import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.WorkTables;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@ -55,23 +58,43 @@ public class Vocabulary_p {
if (vocabulary == null) vocabularyName = null; if (vocabulary == null) vocabularyName = null;
if (post != null) { if (post != null) {
try { try {
if (vocabulary == null) { // create a vocabulary
// create a vocabulary if (vocabulary == null && discovername != null && discovername.length() > 0) {
if (discovername != null && discovername.length() > 0) { // store this call as api call
String discoverobjectspace = post.get("discoverobjectspace", ""); sb.tables.recordAPICall(post, "Vocabulary_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "vocabulary creation for " + discovername);
MultiProtocolURL discoveruri = null; // get details of creation
if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {} String discoverobjectspace = post.get("discoverobjectspace", "");
if (discoveruri == null) discoverobjectspace = ""; MultiProtocolURL discoveruri = null;
Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>(); if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); if (discoveruri == null) discoverobjectspace = "";
boolean discoverNot = post.get("discovermethod", "").equals("none"); Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
boolean discoverFromPath = post.get("discovermethod", "").equals("path"); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); final boolean discoverNot = post.get("discovermethod", "").equals("none");
boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); final boolean discoverFromPath = post.get("discovermethod", "").equals("path");
boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); final boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
Segment segment = sb.index; final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
String t; final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
if (!discoverNot) { final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
final String discoverFromCSVPath = post.get("discoverpath", "");
final String discoverFromCSVCharset = post.get("charset", "UTF-8");
final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
Segment segment = sb.index;
String t;
if (!discoverNot) {
if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
String line = null;
while ((line = r.readLine()) != null) {
String[] l = line.split(";");
String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral];
String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink];
if (literal != null && literal.length() > 0) {
table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink));
}
}
} else {
Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000); Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
while (ui.hasNext()) { while (ui.hasNext()) {
DigestURL u = ui.next(); DigestURL u = ui.next();
@ -118,11 +141,11 @@ public class Vocabulary_p {
} }
} }
} }
Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
LibraryProvider.autotagging.addVocabulary(newvoc);
vocabularyName = discovername;
vocabulary = newvoc;
} }
Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
LibraryProvider.autotagging.addVocabulary(newvoc);
vocabularyName = discovername;
vocabulary = newvoc;
} else { } else {
// check if objectspace was set // check if objectspace was set
vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace())); vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()));
@ -186,7 +209,6 @@ public class Vocabulary_p {
} else { } else {
prop.put("edit", 1); prop.put("edit", 1);
boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists(); boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists();
String yacyurl = YaCyMetadata.hashURI("[hash]".getBytes());
prop.put("edit_editable", editable ? 1 : 0); prop.put("edit_editable", editable ? 1 : 0);
prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : ""); prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : "");
prop.putHTML("edit_name", vocabulary.getName()); prop.putHTML("edit_name", vocabulary.getName());
@ -197,9 +219,6 @@ public class Vocabulary_p {
prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX); prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()); prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace());
prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate()); prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate());
prop.putXML("edit_triple1", "<" + yacyurl + "> <" + vocabulary.getPredicate() + "> \"[discovered-tags-commaseparated]\"");
prop.putXML("edit_triple2", "<" + yacyurl + "> <" + Owl.SameAs.getPredicate() + "> <[document-url]>");
prop.putXML("edit_tripleN", vocabulary.getObjectspace() == null ? "none - missing objectspace" : "<" + yacyurl + "> <" + DCTerms.references.getPredicate() + "> \"[object-link]#[tag]\" .");
int c = 0; int c = 0;
boolean dark = false; boolean dark = false;
int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length(); int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length();
@ -231,6 +250,15 @@ public class Vocabulary_p {
} }
// make charset list for import method selector
int c = 0;
for (String cs: Charset.availableCharsets().keySet()) {
prop.putHTML("create_charset_" + c + "_name", cs);
prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
c++;
}
prop.put("create_charset", c);
// return rewrite properties // return rewrite properties
return prop; return prop;
} }

@ -22,8 +22,10 @@ package net.yacy.cora.lod.vocabulary;
import java.io.BufferedWriter; import java.io.BufferedWriter;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashSet; import java.util.HashSet;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
@ -34,6 +36,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.geo.Locations; import net.yacy.cora.geo.Locations;
import net.yacy.cora.storage.Files; import net.yacy.cora.storage.Files;
@ -158,7 +161,8 @@ public class Tagging {
} }
} }
} else { } else {
BufferedWriter w = new BufferedWriter(new FileWriter(propFile)); //
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(propFile), UTF8.charset.name()));
if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n"); if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n");
for (Map.Entry<String, SOTuple> e: table.entrySet()) { for (Map.Entry<String, SOTuple> e: table.entrySet()) {
String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV(); String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();

Loading…
Cancel
Save