From ec9d02156852a6664d00ff1fa9f4c32f5f2f2c72 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 17 Nov 2014 14:22:40 +0100 Subject: [PATCH] added option in vocabulary editor to import CSV files with different encodings (preselected windows-type character encoding which is typical for CSV files). Fixed also other problems with character encoding in dictionary files. Automatically generated vocabularies are now also noted in the API steering. --- htroot/Vocabulary_p.html | 46 ++++++---- htroot/Vocabulary_p.java | 84 ++++++++++++------- .../net/yacy/cora/lod/vocabulary/Tagging.java | 6 +- 3 files changed, 90 insertions(+), 46 deletions(-) diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index 44ca4dbde..d4418c2f0 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -90,7 +90,7 @@ To see a list of all APIs, please visit the #{vocabularyset}# - + #{/vocabularyset}# @@ -107,17 +107,35 @@ To see a list of all APIs, please visit the -
Objectspace
-
-
Discover Terms:
+
Vocabulary Name
(this will become the name of a search facet)
+
+
+
Empty Vocabulary 
+
+
Auto-Discover     
+ from file name 
+ from page title 
+ from page title (split) 
+ from page author 
- no auto-discovery (empty vocabulary)   - from file name   - from page title   - from page title (splitted)   - from page author
- from a csv file +
+
Objectspace
+
+
+ +
Import from a csv file 
+
+
+
File Path
+
+
Column for Literals
+
(first has index 0)
+
Column for Object Link (optional)
+
(first has index 0, if unused set -1)
+
Charset of Import File
+
+
+
@@ -136,12 +154,6 @@ To see a list of all APIs, please visit the

if set, uses the predicate
#[objectspacepredicate]# for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)# -

This produces the following triples in the triplestore if a term or synonym matches in a document:

-
-
Triple #1
#[triple1]#
-
Triple #2
#[triple2]#
-
more Triples for linking into objectspace
#[tripleN]#
-
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index dccffef60..7abd40f1b 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -18,23 +18,26 @@ * If not, see . */ +import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.net.MalformedURLException; +import java.nio.charset.Charset; import java.util.Collection; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.Map; -import java.util.TreeMap; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.lod.vocabulary.DCTerms; -import net.yacy.cora.lod.vocabulary.Owl; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging.SOTuple; -import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.data.WorkTables; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; @@ -55,23 +58,43 @@ public class Vocabulary_p { if (vocabulary == null) vocabularyName = null; if (post != null) { try { - if (vocabulary == null) { - // create a vocabulary - if (discovername != null && discovername.length() > 0) { - String discoverobjectspace = post.get("discoverobjectspace", ""); - MultiProtocolURL discoveruri = null; - if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {} - if (discoveruri == null) discoverobjectspace = ""; - Map table = new TreeMap(); - File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); - boolean discoverNot = post.get("discovermethod", "").equals("none"); - boolean discoverFromPath = post.get("discovermethod", "").equals("path"); - boolean discoverFromTitle = post.get("discovermethod", 
"").equals("title"); - boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); - boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); - Segment segment = sb.index; - String t; - if (!discoverNot) { + // create a vocabulary + if (vocabulary == null && discovername != null && discovername.length() > 0) { + // store this call as api call + sb.tables.recordAPICall(post, "Vocabulary_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "vocabulary creation for " + discovername); + // get details of creation + String discoverobjectspace = post.get("discoverobjectspace", ""); + MultiProtocolURL discoveruri = null; + if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {} + if (discoveruri == null) discoverobjectspace = ""; + Map table = new LinkedHashMap(); + File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); + final boolean discoverNot = post.get("discovermethod", "").equals("none"); + final boolean discoverFromPath = post.get("discovermethod", "").equals("path"); + final boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); + final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); + final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); + final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv"); + final String discoverFromCSVPath = post.get("discoverpath", ""); + final String discoverFromCSVCharset = post.get("charset", "UTF-8"); + final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0); + final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1); + final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? 
new File(discoverFromCSVPath) : null; + Segment segment = sb.index; + String t; + if (!discoverNot) { + if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) { + BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset)); + String line = null; + while ((line = r.readLine()) != null) { + String[] l = line.split(";"); + String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral]; + String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink]; + if (literal != null && literal.length() > 0) { + table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink)); + } + } + } else { Iterator ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000); while (ui.hasNext()) { DigestURL u = ui.next(); @@ -118,11 +141,11 @@ public class Vocabulary_p { } } } - Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); - LibraryProvider.autotagging.addVocabulary(newvoc); - vocabularyName = discovername; - vocabulary = newvoc; } + Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); + LibraryProvider.autotagging.addVocabulary(newvoc); + vocabularyName = discovername; + vocabulary = newvoc; } else { // check if objectspace was set vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace())); @@ -186,7 +209,6 @@ public class Vocabulary_p { } else { prop.put("edit", 1); boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists(); - String yacyurl = YaCyMetadata.hashURI("[hash]".getBytes()); prop.put("edit_editable", editable ? 1 : 0); prop.putHTML("edit_editable_file", editable ? 
vocabulary.getFile().getAbsolutePath() : ""); prop.putHTML("edit_name", vocabulary.getName()); @@ -197,9 +219,6 @@ public class Vocabulary_p { prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX); prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()); prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate()); - prop.putXML("edit_triple1", "<" + yacyurl + "> <" + vocabulary.getPredicate() + "> \"[discovered-tags-commaseparated]\""); - prop.putXML("edit_triple2", "<" + yacyurl + "> <" + Owl.SameAs.getPredicate() + "> <[document-url]>"); - prop.putXML("edit_tripleN", vocabulary.getObjectspace() == null ? "none - missing objectspace" : "<" + yacyurl + "> <" + DCTerms.references.getPredicate() + "> \"[object-link]#[tag]\" ."); int c = 0; boolean dark = false; int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length(); @@ -231,6 +250,15 @@ public class Vocabulary_p { } + // make charset list for import method selector + int c = 0; + for (String cs: Charset.availableCharsets().keySet()) { + prop.putHTML("create_charset_" + c + "_name", cs); + prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 
1 : 0); + c++; + } + prop.put("create_charset", c); + // return rewrite properties return prop; } diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java index 6d298743c..add14db10 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -22,8 +22,10 @@ package net.yacy.cora.lod.vocabulary; import java.io.BufferedWriter; import java.io.File; +import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; +import java.io.OutputStreamWriter; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Map; @@ -34,6 +36,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.geo.Locations; import net.yacy.cora.storage.Files; @@ -158,7 +161,8 @@ public class Tagging { } } } else { - BufferedWriter w = new BufferedWriter(new FileWriter(propFile)); + // + BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(propFile), UTF8.charset.name())); if (objectspace != null && objectspace.length() > 0) w.write("#objectspace:" + objectspace + "\n"); for (Map.Entry e: table.entrySet()) { String s = e.getValue() == null ? "" : e.getValue().getSynonymsCSV();
Modify