Added a start-line field for vocabulary import from a CSV file

As a convenience, this allows skipping any header lines at the top of the CSV file
pull/167/head
luccioman 7 years ago
parent d28d612069
commit 09f93fed0e

@ -125,25 +125,27 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
</dl>
</dd>
<dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnsynonyms').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
<dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discoverLineStart').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnsynonyms').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';document.getElementById('discoverCharset').disabled='';document.getElementById('discoverColSeparator').disabled='';"/></dt>
<dd>
<dl>
<dt><i>File Path</i></dt>
<dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
<dt><i>Start line</i></dt>
<dd><input type="number" id="discoverLineStart" name="discoverLineStart" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
<dt><i>Column for Literals</i></dt>
<dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
<dt><i>Synonyms</i></dt>
<dd><input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="none" checked="checked"/> <i>no Synonyms</i><br/>
<input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="enrichsynonyms"/> <i>Auto-Enrich with Synonyms from Stemming Library</i><br/>
<input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="readcolumn" onclick="document.getElementById('discovercolumnsynonyms').value=parseInt(document.getElementById('discovercolumnliteral').value)+1;"/> <i>Read Column</i><br/>
<input type="number" id="discovercolumnsynonyms" name="discovercolumnsynonyms" min="-1" max="99" step="1" size="2" value="-1" style="width:50px;"> (first has index 0)
<input type="number" id="discovercolumnsynonyms" name="discovercolumnsynonyms" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0)
</dd>
<dt><i>Column for Object Link (optional)</i></dt>
<dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
<dt><i>Charset of Import File</i></dt>
<dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
<dd><select name="charset" id="discoverCharset" disabled="disabled">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
<dt><i>Column separator</i></dt>
<dd><select name="columnSeparator">
<dd><select name="columnSeparator" id="discoverColSeparator" disabled="disabled">
<option value="," title="Standard CSV field delimiter">Comma ','</option>
<option value=";">Semicolon ';'</option>
</select>

@ -87,6 +87,7 @@ public class Vocabulary_p {
final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
final String columnSeparator = post.get("columnSeparator", ";");
final int lineStart = post.getInt("discoverLineStart", 0);
final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
@ -108,12 +109,21 @@ public class Vocabulary_p {
String line = null;
final Pattern separatorPattern = Pattern.compile(columnSeparator);
Map<String, String> synonym2literal = new HashMap<>(); // helper map to check if there are double synonyms
int lineIndex = -1;
while ((line = r.readLine()) != null) {
if (line.length() == 0) continue;
lineIndex++;
if(lineIndex < lineStart) {
continue;
}
if (line.length() == 0) {
continue;
}
String[] l = separatorPattern.split(line);
if (l.length == 0) l = new String[]{line};
String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral].trim();
if (literal == null) continue;
if (literal == null) {
continue;
}
literal = normalizeLiteral(literal);
String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink].trim();
if (literal.length() > 0) {

Loading…
Cancel
Save