Handle escaped line breaks and separators in vocabulary import from CSV

7 years ago · 8d7099a081
parent 09f93fed0e
commit 8d7099a081
2 changed files with 304 additions and 69 deletions
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -21,11 +21,14 @@
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
@ -85,76 +88,14 @@ public class Vocabulary_p {
                    final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
                    final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
                    final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
-                    String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
-                    final String columnSeparator = post.get("columnSeparator", ";");
-                    final int lineStart = post.getInt("discoverLineStart", 0);
-                    final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
-                    final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
-                    final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
+
                    final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
-                    final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms");
-                    final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn");
+
                    Segment segment = sb.index;
                    String t;
                    if (!discoverNot) {
                        if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
-                            // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
-                            if (discoverFromCSVCharset.equals("autodetect")) {
-                                List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
-                                discoverFromCSVCharset = charsets.get(0);
-                                ConcurrentLog.info("FileUtils", "detected charset: " + discoverFromCSVCharset + " used to read " + discoverFromCSVFile.toString());
-                            }
-                            // read file (try-with-resource to close inputstream automatically)
-                            try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset))) {
-                                String line = null;
-                                final Pattern separatorPattern = Pattern.compile(columnSeparator);
-                                Map<String, String> synonym2literal = new HashMap<>(); // helper map to check if there are double synonyms
-                                int lineIndex = -1;
-                                while ((line = r.readLine()) != null) {
-                                	lineIndex++;
-                                	if(lineIndex < lineStart) {
-                                		continue;
-                                	}
-                                    if (line.length() == 0) {
-                                    	continue;
-                                    }
-                                    String[] l = separatorPattern.split(line);
-                                    if (l.length == 0) l = new String[]{line};
-                                    String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral].trim();
-                                    if (literal == null) {
-                                    	continue;
-                                    }
-                                    literal = normalizeLiteral(literal);
-                                    String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink].trim();
-                                    if (literal.length() > 0) {
-                                        String synonyms = "";
-                                        if (discoverenrichsynonyms) {
-                                            Set<String> sy = SynonymLibrary.getSynonyms(literal);
-                                            if (sy != null) {
-                                                for (String s: sy) synonyms += "," + s;
-                                            }
-                                        } else if (discoverreadcolumn) {
-                                            synonyms = discovercolumnsynonyms < 0 || l.length <= discovercolumnsynonyms ? null : l[discovercolumnsynonyms].trim();
-                                            synonyms = normalizeLiteral(synonyms);
-                                        } else {
-                                            synonyms = Tagging.normalizeTerm(literal);
-                                        }
-                                        // check double synonyms
-                                        if (synonyms.length() > 0) {
-                                            String oldliteral = synonym2literal.get(synonyms);
-                                            if (oldliteral != null && !literal.equals(oldliteral)) {
-                                                // replace old entry with combined new
-                                                table.remove(oldliteral);
-                                                String newliteral = oldliteral + "," + literal;
-                                                literal = newliteral;
-                                            }
-                                            synonym2literal.put(synonyms, literal);
-                                        }
-                                        // store term
-                                        table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink));
-                                    }
-                                }
-                            }
+                            handleDiscoverFromCSV(post, table, discoverFromCSVFile);
                        } else {
                            Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
                            while (ui.hasNext()) {
@ -351,10 +292,210 @@ public class Vocabulary_p {
        return prop;
    }
    
-    private static String normalizeLiteral(String literal) {
-        if (literal == null) return "";
-        if (literal.length() > 0 && (literal.charAt(0) == '"' || literal.charAt(0) == '\'')) literal = literal.substring(1);
-        if (literal.length() > 0 && (literal.charAt(literal.length() - 1) == '"' || literal.charAt(literal.length() - 1) == '\'')) literal = literal.substring(0, literal.length() - 1);
+	/**
+	 * Parse a CSV content line and extract field values. When the last field of
+	 * this line starts with and unclosed escape character, the current line is
+	 * appended to the escapedContent buffer.
+	 * 
+	 * @param line
+	 *            a raw line from a CSV document. Must not be null.
+	 * @param separatorPattern
+	 *            the field separator character compiled as a Pattern instance. Must
+	 *            not be null.
+	 * @param escape
+	 *            escape character
+	 * @param multiLineContent
+	 *            eventually holds content of previous lines whose last field
+	 *            includes an escaped line separator
+	 * @return the list of field values extracted from the line, eventually empty.
+	 */
+    private static List<String> parseCSVLine(final String line, final Pattern separatorPattern, final String escape, final StringBuilder multiLineContent) {
+        final List<String> fields = new ArrayList<>();
+        String[] tokens = separatorPattern.split(line);
+        if (tokens.length == 0) {
+        	tokens = new String[]{line};
+        }
+        
+        /* Handle continuation of multi-lines field content escaped between escape char */
+        if(multiLineContent.length() > 0) {
+        	int closingEscapeIndex = -1;
+        	for(int index = 0; index < tokens.length; index++) {
+        		if(tokens[index].endsWith(escape)) {
+        			closingEscapeIndex = index;
+        			break;
+        		}
+        	}
+        	if(closingEscapeIndex >= 0) {
+        		/* End of multi-line escape */
+            	multiLineContent.append("\n").append(line);
+                tokens = separatorPattern.split(multiLineContent.toString());
+        		multiLineContent.setLength(0);
+                if (tokens.length == 0) {
+                	tokens = new String[]{line};
+                }
+        	} else {
+        		/* Multi-line escape continue */
+                multiLineContent.append("\n").append(line);
+            	return fields;
+        	}
+        }
+        
+        /* Handle separator char escaped between escape char */
+        final StringBuilder escapedSeparatorContent = new StringBuilder();
+        for(final String field : tokens) {
+        	if(escapedSeparatorContent.length() == 0) {
+        		if(field.startsWith(escape) && !field.endsWith(escape)) {
+        			/* Beginning of escape */
+        			escapedSeparatorContent.append(field).append(separatorPattern.toString());
+        			continue;
+        		}
+        	} else if(field.endsWith(escape)) {
+        		/* End of field escape */
+        		escapedSeparatorContent.append(field);
+        		fields.add(escapedSeparatorContent.toString());
+        		escapedSeparatorContent.setLength(0);
+    			continue;
+        	} else {
+        		/* Escape continue */
+        		escapedSeparatorContent.append(field).append(separatorPattern.toString());
+    			continue;
+        	}
+       		fields.add(field);
+        }
+        
+        if(escapedSeparatorContent.length() > 0) {
+        	/* Handle beginning of field content with escaped line separator */
+        	multiLineContent.setLength(0);
+            multiLineContent.append(line);
+        }
+        return fields;
+    }
+
+    /**
+     * Fill the vocabulary table from a CSV file.
+     * @param post current request parameters. Must not be null.
+     * @param table the vocabulary table to fill. Must not be null.
+     * @param discoverFromCSVFile. Must not be null.
+     * @throws IOException when a read/write error occurred
+     * @throws UnsupportedEncodingException
+     * @throws FileNotFoundException when the file does not exists or can not be read for some reason.
+     */
+	protected static void handleDiscoverFromCSV(final serverObjects post, final Map<String, Tagging.SOTuple> table,
+			final File discoverFromCSVFile)
+			throws IOException, UnsupportedEncodingException, FileNotFoundException {
+		String charsetName = post.get("charset", StandardCharsets.UTF_8.name());
+		final String columnSeparator = post.get("columnSeparator", ";");
+		final String escapeChar = "\"";
+		final int lineStart = post.getInt("discoverLineStart", 0);
+		final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
+		final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
+		final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
+        final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms");
+        final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn");
+		
+		// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
+		if (charsetName.equals("autodetect")) {
+		    List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
+		    charsetName = charsets.get(0);
+		    ConcurrentLog.info("FileUtils", "detected charset: " + charsetName + " used to read " + discoverFromCSVFile.toString());
+		}
+	    final Pattern separatorPattern = Pattern.compile(columnSeparator);
+	    
+		// read file (try-with-resource to close inputstream automatically)
+		try (final BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), charsetName))) {
+		    discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
+					discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern, r);
+		}
+	}
+
+	/**
+	 * Fill the vocabulary table from reader open on CSV content.
+	 * @param table the vocabulary table to fill. Must not be null.
+	 * @param escapeChar CSV escape character (standard is double quote). Must not be null. 
+	 * @param lineStart index (zero based) of the first line to parse. Previous lines are ignored. 
+	 * @param literalsColumn index (zero based) of the column to read for literals
+	 * @param synonymsColumn index (zero based) of the column to read for synonyms. Set to -1 to ignore.
+	 * @param discovercolumnobjectlink
+	 * @param discoverenrichsynonyms
+	 * @param readSynonymFromColumn when true synonym terms are read from the column at synonymsColumn index
+	 * @param separatorPattern the field separator character compiled as a Pattern instance. Must not be null.
+	 * @param reader an open reader on CSV content. Must not be null.
+	 * @throws IOException when an read error occurred
+	 */
+	protected static void discoverFromCSVReader(final Map<String, Tagging.SOTuple> table, final String escapeChar,
+			final int lineStart, final int literalsColumn, final int synonymsColumn,
+			final int discovercolumnobjectlink, final boolean discoverenrichsynonyms, final boolean readSynonymFromColumn,
+			final Pattern separatorPattern, final BufferedReader reader) throws IOException {
+		String line = null;
+		final StringBuilder multiLineContent = new StringBuilder();
+		final Map<String, String> synonym2literal = new HashMap<>(); // helper map to check if there are double synonyms
+		int lineIndex = -1;
+		while ((line = reader.readLine()) != null) {
+			lineIndex++;
+			if(lineIndex < lineStart) {
+				continue;
+			}
+		    if (line.length() == 0) {
+		    	continue;
+		    }
+		    
+			final List<String> fields = parseCSVLine(line, separatorPattern, escapeChar, multiLineContent);
+			if (multiLineContent.length() > 0) {
+				continue;
+			}
+		    
+		    
+		    String literal = literalsColumn < 0 || fields.size() <= literalsColumn ? null : fields.get(literalsColumn).trim();
+		    if (literal == null) {
+		    	continue;
+		    }
+		    literal = normalizeLiteral(literal, escapeChar);
+		    final String objectlink = discovercolumnobjectlink < 0 || fields.size() <= discovercolumnobjectlink ? null : fields.get(discovercolumnobjectlink).trim();
+		    if (literal.length() > 0) {
+		        String synonyms = "";
+		        if (discoverenrichsynonyms) {
+		            final Set<String> sy = SynonymLibrary.getSynonyms(literal);
+		            if (sy != null) {
+		                for (final String s: sy) {
+		                	synonyms += "," + s;
+		                }
+		            }
+		        } else if (readSynonymFromColumn) {
+		            synonyms = synonymsColumn < 0 || fields.size() <= synonymsColumn ? null : fields.get(synonymsColumn).trim();
+		            synonyms = normalizeLiteral(synonyms, escapeChar);
+		        } else {
+		            synonyms = Tagging.normalizeTerm(literal);
+		        }
+		        // check double synonyms
+		        if (synonyms.length() > 0) {
+		            String oldliteral = synonym2literal.get(synonyms);
+		            if (oldliteral != null && !literal.equals(oldliteral)) {
+		                // replace old entry with combined new
+		                table.remove(oldliteral);
+		                String newliteral = oldliteral + "," + literal;
+		                literal = newliteral;
+		            }
+		            synonym2literal.put(synonyms, literal);
+		        }
+		        // store term
+		        table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink));
+		    }
+		}
+	}
+    
+    private static String normalizeLiteral(String literal, final String escapeChar) {
+        if (literal == null) {
+        	return "";
+        }
+        if(literal.length() > 1 && literal.startsWith(escapeChar) && literal.endsWith(escapeChar)) {
+            literal = literal.replace("\"\"", "\"");
+        }
+        if (literal.length() > 0 && (literal.charAt(0) == '"' || literal.charAt(0) == '\'')) {
+        	literal = literal.substring(1);
+        }
+        if (literal.length() > 0 && (literal.charAt(literal.length() - 1) == '"' || literal.charAt(literal.length() - 1) == '\'')) {
+        	literal = literal.substring(0, literal.length() - 1);
+        }
        return literal;
    }
 }
--- a/test/Vocabulary_pTest.java
+++ b/test/Vocabulary_pTest.java
@ -0,0 +1,94 @@
+
+// Vocabulary_pTest.java
+// ---------------------------
+// Copyright 2018 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import net.yacy.cora.lod.vocabulary.Tagging;
+
+/**
+ * Unit tests for the {@link Vocabulary_p} class.
+ */
+public class Vocabulary_pTest {
+
+	/**
+	 * Unit test for the discoverFromCSVReader() function.
+	 * @throws IOException when a read/write error occurred
+	 */
+	@Test
+	public void testDiscoverFromCSVReader() throws IOException {
+		final Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
+		final Pattern separatorPattern = Pattern.compile(",");
+		final String escapeChar = "\"";
+		final int lineStart = 1;
+		final int literalsColumn = 1;
+		final int synonymsColumn = -1;
+		final int objectLinkColumn = -1;
+		final boolean enrichSynonyms = false;
+		final boolean readSynonymFromColumn = false;
+		
+		/* Test content with typical cases */
+		final StringBuilder csvBuilder = new StringBuilder();
+		csvBuilder.append("title1,title2,title3\n"); // header line
+	    csvBuilder.append("\"aaa\",\"b bb\",\"ccc\"\n"); // all fields enclosed in double quotes 
+	    csvBuilder.append("zzz,yyy,xxx\n"); // no double quotes around fields
+	    csvBuilder.append(",,\n"); // empty fields
+	    csvBuilder.append("\"\",\"\",\"\"\n"); // quoted empty fields
+	    csvBuilder.append("\"111\",\"2\"\"2'2\",\"333\"\n"); // escaped double quote
+	    csvBuilder.append("key,quo\"te,desc\n"); // unescaped double quote inside a field value
+	    csvBuilder.append("fff,malformed\n"); // malformed line : one field is missing 
+	    csvBuilder.append("\"f,f,f\",\"foo,bar\",\",,\"\n"); // escaped separators
+	    csvBuilder.append("\"m\nm\nmm\",\"multi\nline\",\"\n\n\"\n"); // escaped line breaks
+
+		try (final BufferedReader reader = new BufferedReader(new StringReader(csvBuilder.toString()));) {
+
+			Vocabulary_p.discoverFromCSVReader(table, escapeChar, lineStart, literalsColumn, synonymsColumn,
+					objectLinkColumn, enrichSynonyms, readSynonymFromColumn, separatorPattern, reader);
+			Assert.assertEquals(7, table.size());
+			Assert.assertTrue(table.containsKey("b bb"));
+			Assert.assertTrue(table.containsKey("yyy"));
+			Assert.assertTrue(table.containsKey("2\"2'2"));
+			Assert.assertTrue(table.containsKey("quo\"te"));
+			Assert.assertTrue(table.containsKey("malformed"));
+			Assert.assertTrue(table.containsKey("foo,bar"));
+			Assert.assertTrue(table.containsKey("multi\nline"));
+		}
+		
+		/* Empty content */
+		table.clear();
+		try (final BufferedReader reader = new BufferedReader(new StringReader(""));) {
+
+			Vocabulary_p.discoverFromCSVReader(table, escapeChar, lineStart, literalsColumn, synonymsColumn,
+					objectLinkColumn, enrichSynonyms, readSynonymFromColumn, separatorPattern, reader);
+			Assert.assertEquals(0, table.size());
+		}
+	}
+
+}