Added an option to the vocabulary editor to import CSV files with different

encodings (a Windows-type character encoding, which is typical for CSV
files, is preselected). Also fixed other problems with character encoding
in dictionary files. Automatically generated vocabularies are now also
recorded in the API steering.
pull/1/head
Michael Peter Christen 10 years ago
parent b558433211
commit ec9d021568

@ -107,17 +107,35 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term. This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
This works best with wikis. Try to use a wiki url as objectspace path. This works best with wikis. Try to use a wiki url as objectspace path.
<dl> <dl>
<dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd> <dt>Vocabulary Name<br><i>(this will become the name of a search facet)</i></dt>
<dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" /> <dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd>
<div id="searchresults"></div></dd> <hr>
<dt>Discover Terms:</dt> <dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt>
<dd></dd>
<dt>Auto-Discover&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</br>
from file name&nbsp;<input type="radio" name="discovermethod" value="path" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
from page title&nbsp;<input type="radio" name="discovermethod" value="title" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
from page title (splitted)&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/><br/>
from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt>
<dd> <dd>
<input type="radio" name="discovermethod" value="none" checked="checked" /> no auto-discovery (empty vocabulary)&nbsp;&nbsp; <dl>
<input type="radio" name="discovermethod" value="path" /> from file name&nbsp;&nbsp; <dt><i>Objectspace</i></dt>
<input type="radio" name="discovermethod" value="title" /> from page title&nbsp;&nbsp; <dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
<input type="radio" name="discovermethod" value="titlesplitted" /> from page title (splitted)&nbsp;&nbsp; </dl>
<input type="radio" name="discovermethod" value="author" /> from page author</dd> </dd>
<input type="radio" name="discovermethod" value="csv" /> from a csv file</dd> <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
<dd>
<dl>
<dt><i>File Path</i></dt>
<dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
<dt><i>Column for Literals</i></dt>
<dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
<dt><i>Column for Object Link (optional)</i></dt>
<dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
<dt><i>Charset of Import File</i></dt>
<dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
</dl>
</dd>
<dt></dt><dd><input type="submit" name="create" value="Create" /></dd> <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
</dl> </dl>
</fieldset> </fieldset>
@ -136,12 +154,6 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<dt>Prefix</dt><dd>#[prefix]#</dd> <dt>Prefix</dt><dd>#[prefix]#</dd>
<dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd> <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
</dl> </dl>
<p>This produces the following triples in the triplestore if a term or synonym matches in a document:</p>
<dl>
<dt>Triple #1</dt><dd>#[triple1]#</dd>
<dt>Triple #2</dt><dd>#[triple2]#</dd>
<dt>more Triples for linking into objectspace</dt><dd>#[tripleN]#</dd>
</dl>
<table class="sortable" border="0"> <table class="sortable" border="0">
<tr class="TableHeader" valign="bottom"> <tr class="TableHeader" valign="bottom">
<td>Modify</td> <td>Modify</td>

@ -18,23 +18,26 @@
* If not, see <http://www.gnu.org/licenses/>. * If not, see <http://www.gnu.org/licenses/>.
*/ */
import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.Collection; import java.util.Collection;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.DCTerms;
import net.yacy.cora.lod.vocabulary.Owl;
import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.Tagging.SOTuple; import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.WorkTables;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@ -55,23 +58,43 @@ public class Vocabulary_p {
if (vocabulary == null) vocabularyName = null; if (vocabulary == null) vocabularyName = null;
if (post != null) { if (post != null) {
try { try {
if (vocabulary == null) {
// create a vocabulary // create a vocabulary
if (discovername != null && discovername.length() > 0) { if (vocabulary == null && discovername != null && discovername.length() > 0) {
// store this call as api call
sb.tables.recordAPICall(post, "Vocabulary_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "vocabulary creation for " + discovername);
// get details of creation
String discoverobjectspace = post.get("discoverobjectspace", ""); String discoverobjectspace = post.get("discoverobjectspace", "");
MultiProtocolURL discoveruri = null; MultiProtocolURL discoveruri = null;
if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {} if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
if (discoveruri == null) discoverobjectspace = ""; if (discoveruri == null) discoverobjectspace = "";
Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>(); Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
boolean discoverNot = post.get("discovermethod", "").equals("none"); final boolean discoverNot = post.get("discovermethod", "").equals("none");
boolean discoverFromPath = post.get("discovermethod", "").equals("path"); final boolean discoverFromPath = post.get("discovermethod", "").equals("path");
boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); final boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
final String discoverFromCSVPath = post.get("discoverpath", "");
final String discoverFromCSVCharset = post.get("charset", "UTF-8");
final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
Segment segment = sb.index; Segment segment = sb.index;
String t; String t;
if (!discoverNot) { if (!discoverNot) {
if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
String line = null;
while ((line = r.readLine()) != null) {
String[] l = line.split(";");
String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral];
String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink];
if (literal != null && literal.length() > 0) {
table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink));
}
}
} else {
Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000); Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
while (ui.hasNext()) { while (ui.hasNext()) {
DigestURL u = ui.next(); DigestURL u = ui.next();
@ -118,11 +141,11 @@ public class Vocabulary_p {
} }
} }
} }
}
Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
LibraryProvider.autotagging.addVocabulary(newvoc); LibraryProvider.autotagging.addVocabulary(newvoc);
vocabularyName = discovername; vocabularyName = discovername;
vocabulary = newvoc; vocabulary = newvoc;
}
} else { } else {
// check if objectspace was set // check if objectspace was set
vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace())); vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()));
@ -186,7 +209,6 @@ public class Vocabulary_p {
} else { } else {
prop.put("edit", 1); prop.put("edit", 1);
boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists(); boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists();
String yacyurl = YaCyMetadata.hashURI("[hash]".getBytes());
prop.put("edit_editable", editable ? 1 : 0); prop.put("edit_editable", editable ? 1 : 0);
prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : ""); prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : "");
prop.putHTML("edit_name", vocabulary.getName()); prop.putHTML("edit_name", vocabulary.getName());
@ -197,9 +219,6 @@ public class Vocabulary_p {
prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX); prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()); prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace());
prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate()); prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate());
prop.putXML("edit_triple1", "<" + yacyurl + "> <" + vocabulary.getPredicate() + "> \"[discovered-tags-commaseparated]\"");
prop.putXML("edit_triple2", "<" + yacyurl + "> <" + Owl.SameAs.getPredicate() + "> <[document-url]>");
prop.putXML("edit_tripleN", vocabulary.getObjectspace() == null ? "none - missing objectspace" : "<" + yacyurl + "> <" + DCTerms.references.getPredicate() + "> \"[object-link]#[tag]\" .");
int c = 0; int c = 0;
boolean dark = false; boolean dark = false;
int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length(); int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length();
@ -231,6 +250,15 @@ public class Vocabulary_p {
} }
// make charset list for import method selector
int c = 0;
for (String cs: Charset.availableCharsets().keySet()) {
prop.putHTML("create_charset_" + c + "_name", cs);
prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
c++;
}
prop.put("create_charset", c);
// return rewrite properties // return rewrite properties
return prop; return prop;
} }

@ -22,8 +22,10 @@ package net.yacy.cora.lod.vocabulary;
import java.io.BufferedWriter; import java.io.BufferedWriter;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashSet; import java.util.HashSet;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
@ -34,6 +36,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.geo.Locations; import net.yacy.cora.geo.Locations;
import net.yacy.cora.storage.Files; import net.yacy.cora.storage.Files;
@ -158,7 +161,8 @@ public class Tagging {
} }
} }
} else { } else {
BufferedWriter w = new BufferedWriter(new FileWriter(propFile)); //
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(propFile), UTF8.charset.name()));
if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n"); if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n");
for (Map.Entry<String, SOTuple> e: table.entrySet()) { for (Map.Entry<String, SOTuple> e: table.entrySet()) {
String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV(); String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();

Loading…
Cancel
Save