From bd3ed5cae54761670fd89a2ebf87eb0bf18665b6 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 10 Dec 2014 13:11:51 +0100 Subject: [PATCH] added charset detection to vocabulary reader --- htroot/Vocabulary_p.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 038504c0b..f37a52480 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -44,6 +44,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.WorkTables; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; import net.yacy.server.serverObjects; @@ -92,6 +93,9 @@ public class Vocabulary_p { String t; if (!discoverNot) { if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) { + // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html + FileUtils.checkCharset(discoverFromCSVFile, discoverFromCSVCharset, true); + // read file BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset)); String line = null; Pattern semicolon = Pattern.compile(";");