diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index 73e76ce7b..b1c073315 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -111,10 +111,11 @@ To see a list of all APIs, please visit the Please provide a CSV file path. + :: :: - :: + :: :: + :: #(/csvFileStatus)# #(vocabWriteError)# :: @@ -174,7 +175,7 @@ To see a list of all APIs, please visit the - +
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 5ef30de59..5944c0fa0 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -20,7 +20,6 @@ import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; @@ -40,18 +39,22 @@ import java.util.regex.Pattern; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging.SOTuple; +import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.crawler.retrieval.StreamResponse; import net.yacy.data.TransactionManager; import net.yacy.data.WorkTables; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.util.FileUtils; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segment; @@ -105,35 +108,54 @@ public class Vocabulary_p { final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv"); final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " "); - final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? 
new File(discoverFromCSVPath) : null; final Segment segment = sb.index; String t; int csvFileStatus = 0; if (!discoverNot) { if (discoverFromCSV) { - if(discoverFromCSVFile != null) { - final String csvPath = discoverFromCSVFile.getAbsolutePath(); - if (!discoverFromCSVFile.exists()) { - csvFileStatus = 2; - prop.put("create_csvFileStatus_csvPath", csvPath); - } else if (!discoverFromCSVFile.canRead()) { - csvFileStatus = 3; - prop.put("create_csvFileStatus_csvPath", csvPath); - } else if (discoverFromCSVFile.isDirectory()) { - csvFileStatus = 4; - prop.put("create_csvFileStatus_csvPath", csvPath); - } else { - try { - handleDiscoverFromCSV(post, table, discoverFromCSVFile); - } catch(final IOException e) { - LOG.warn("Could not read CSV file at " + discoverFromCSVFile, e); - csvFileStatus = 3; - prop.put("create_csvFileStatus_csvPath", csvPath); - } - } - } else { + if(discoverFromCSVPath.isEmpty()) { csvFileStatus = 1; + } else { + DigestURL csvUrl = null; + if(discoverFromCSVPath.contains("://")) { + try { + csvUrl = new DigestURL(discoverFromCSVPath); + } catch(final MalformedURLException e) { + csvFileStatus = 5; + prop.put("create_csvFileStatus_csvUrl", discoverFromCSVPath); + } + } else { + final File discoverFromCSVFile = new File(discoverFromCSVPath); + final String csvPath = discoverFromCSVFile.getAbsolutePath(); + if (!discoverFromCSVFile.exists()) { + csvFileStatus = 2; + prop.put("create_csvFileStatus_csvPath", csvPath); + } else if (!discoverFromCSVFile.canRead()) { + csvFileStatus = 3; + prop.put("create_csvFileStatus_csvFile", csvPath); + } else if (discoverFromCSVFile.isDirectory()) { + csvFileStatus = 4; + prop.put("create_csvFileStatus_csvPath", csvPath); + } else { + try { + csvUrl = new DigestURL(discoverFromCSVFile); + } catch(final MalformedURLException e) { + csvFileStatus = 5; + prop.put("create_csvFileStatus_csvUrl", "file://" + discoverFromCSVFile.getAbsolutePath()); + } + } + } + + if(csvUrl != null) { + try { + handleDiscoverFromCSV(sb, 
post, table, csvUrl); + } catch(final IOException e) { + LOG.warn("Could not read CSV file at " + csvUrl, e); + csvFileStatus = 3; + prop.put("create_csvFileStatus_csvFile", csvUrl.toString()); + } + } } } else { Iterator ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000); @@ -432,15 +454,16 @@ public class Vocabulary_p { /** * Fill the vocabulary table from a CSV file. + * @param sb the main Switchboard instance. Must not be null. * @param post current request parameters. Must not be null. * @param table the vocabulary table to fill. Must not be null. - * @param discoverFromCSVFile. Must not be null. + * @param csvFileUrl the file URL. Must not be null. * @throws IOException when a read/write error occurred * @throws UnsupportedEncodingException * @throws FileNotFoundException when the file does not exists or can not be read for some reason. */ - protected static void handleDiscoverFromCSV(final serverObjects post, final Map table, - final File discoverFromCSVFile) + protected static void handleDiscoverFromCSV(final Switchboard sb, final serverObjects post, final Map table, + final DigestURL csvFileUrl) throws IOException, UnsupportedEncodingException, FileNotFoundException { String charsetName = post.get("charset", StandardCharsets.UTF_8.name()); final String columnSeparator = post.get("columnSeparator", ";"); @@ -451,22 +474,53 @@ public class Vocabulary_p { final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1); final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms"); final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn"); - + + final Pattern separatorPattern = Pattern.compile(columnSeparator); + // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html if (charsetName.equals("autodetect")) { - List charsets = 
FileUtils.detectCharset(discoverFromCSVFile); - charsetName = charsets.get(0); - ConcurrentLog.info("FileUtils", "detected charset: " + charsetName + " used to read " + discoverFromCSVFile.toString()); + + try (final StreamResponse streamResponse = sb.loader.openInputStream( + sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER, + ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) { + if(streamResponse == null || streamResponse.getContentStream() == null) { + throw new IOException("Could not get CSV content at " + csvFileUrl); + } + + charsetName = streamResponse.getResponse().getCharacterEncoding(); + + if(charsetName == null) { + /* Charset not provided in response headers : try to detect it from content */ + final List charsets = FileUtils.detectCharset(streamResponse.getContentStream()); + charsetName = charsets.get(0); + LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString()); + } else { + LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString()); + /* Use now the open stream */ + try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName); + final BufferedReader bufferedReader = new BufferedReader(reader);) { + discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms, + discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern, + bufferedReader); + } + return; + } + } } - final Pattern separatorPattern = Pattern.compile(columnSeparator); - - // read file (try-with-resource to close resources automatically) - try (final FileInputStream fileStream = new FileInputStream(discoverFromCSVFile); - final InputStreamReader reader = new InputStreamReader(fileStream, charsetName); - final BufferedReader bufferedReader = new BufferedReader(reader);) { - discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, 
discovercolumnsynonyms, - discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern, - bufferedReader); + + // when autodetection of content charset has been selected, a remote resource may be opened again, but has some chance of now being in cache + try(final StreamResponse streamResponse = sb.loader.openInputStream( + sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER, + ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) { + if(streamResponse == null || streamResponse.getContentStream() == null) { + throw new IOException("Could not get CSV content at " + csvFileUrl); + } + try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName); + final BufferedReader bufferedReader = new BufferedReader(reader);) { + discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms, + discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern, + bufferedReader); + } } } diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java index 4961b01b0..9f6df876f 100644 --- a/source/net/yacy/kelondro/util/FileUtils.java +++ b/source/net/yacy/kelondro/util/FileUtils.java @@ -1028,32 +1028,35 @@ public final class FileUtils { } /** - * auto-detect the charset of a file - * used code from http://jchardet.sourceforge.net/; - * see also: http://www-archive.mozilla.org/projects/intl/chardet.html - * @param file + * Auto-detect the charset of content in a stream. + * Used code from http://jchardet.sourceforge.net/. + * Don't forget to close the stream in caller. + * @see
chardet + * @param inStream an open stream * @return a list of probable charsets - * @throws IOException + * @throws IOException when a read error occurred */ - public static List detectCharset(File file) throws IOException { + public static List detectCharset(final InputStream inStream) throws IOException { // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html List result; - try (BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file))) { // try-with-resource to close inputstream - nsDetector det = new nsDetector(nsPSMDetector.ALL); - byte[] buf = new byte[1024] ; - int len; - boolean done = false ; - boolean isAscii = true ; - while ((len = imp.read(buf,0,buf.length)) != -1) { - if (isAscii) isAscii = det.isAscii(buf,len); - if (!isAscii && !done) done = det.DoIt(buf,len, false); - } det.DataEnd(); - result = new ArrayList<>(); + nsDetector det = new nsDetector(nsPSMDetector.ALL); + byte[] buf = new byte[1024] ; + int len; + boolean done = false ; + boolean isAscii = true ; + while ((len = inStream.read(buf,0,buf.length)) != -1) { if (isAscii) { - result.add(StandardCharsets.US_ASCII.name()); - } else { - for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch" + isAscii = det.isAscii(buf,len); + } + if (!isAscii && !done) { + done = det.DoIt(buf,len, false); } + } det.DataEnd(); + result = new ArrayList<>(); + if (isAscii) { + result.add(StandardCharsets.US_ASCII.name()); + } else { + for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch" } return result; } @@ -1070,8 +1073,9 @@ public final class FileUtils { Thread t = new Thread("FileUtils.checkCharset") { @Override public void run() { - try { - List charsets = FileUtils.detectCharset(file); + try (final FileInputStream fileStream = new FileInputStream(file); + final BufferedInputStream imp = new BufferedInputStream(fileStream)) { 
// try-with-resource to close resources + List charsets = FileUtils.detectCharset(imp); if (charsets.contains(givenCharset)) { ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets); } else {