added new options to vocabulary editor:

- new switch 'isFacet' which enables or disables the use of the vocabulary
for search facets. It should be disabled for large vocabularies, since
searches in Solr are extremely slow if facets are generated for a large
set of alternative terms
- new option to disable auto-enrichment from synonyms
- new option to add synonyms from another column when importing from csv
- automatically recognize duplicate occurrences of synonyms and bundle the
terms that share such a synonym
pull/1/head
Michael Peter Christen 10 years ago
parent 87b53b3572
commit 7bfc5b80cb

@ -103,12 +103,12 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<!--<form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" onkeyup="xmlhttpPost(); return false;">--> <!--<form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" onkeyup="xmlhttpPost(); return false;">-->
<form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" > <form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" >
<fieldset><legend>Vocabulary Production</legend> <fieldset><legend>Vocabulary Production</legend>
It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub.
This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
This works best with wikis. Try to use a wiki url as objectspace path.
<dl> <dl>
<dt>Vocabulary Name<br><i>(this will become the name of a search facet)</i></dt> <dt>Vocabulary Name</dt>
<dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd> <dd><input type="text" name="discovername" value="" size="78" maxlength="128" onkeydown="for (i=0; i < 6; i++) document.getElementsByName('discovermethod')[i].disabled=''"/></dd>
<dt></dt>
<dd><input type="checkbox" name="isFacet" checked="checked"/> this shall be a search facet (disable this for large vocabularies!)</dd>
<hr> <hr>
<dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt> <dt>Empty Vocabulary&nbsp;<input type="radio" name="discovermethod" value="none" checked="checked" disabled="disabled"/></dt>
<dd></dd> <dd></dd>
@ -119,23 +119,31 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt> from page author&nbsp;<input type="radio" name="discovermethod" value="author" disabled="disabled" onclick="document.getElementById('discoverobjectspace').disabled=''"/></dt>
<dd> <dd>
<dl> <dl>
<dt></dt>
<dd>It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub.
This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
This works best with wikis. Try to use a wiki url as objectspace path.</dd>
<dt><i>Objectspace</i></dt> <dt><i>Objectspace</i></dt>
<dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd> <dd><input type="text" id="discoverobjectspace" name="discoverobjectspace" value="http://" size="78" maxlength="512" disabled="disabled"/><div id="searchresults"></div></dd>
</dl> </dl>
</dd> </dd>
<dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt> <dt>Import from a csv file&nbsp;<input type="radio" name="discovermethod" value="csv" disabled="disabled" onclick="document.getElementById('discoverpath').disabled='';document.getElementById('discovercolumnliteral').disabled='';document.getElementById('discovercolumnsynonyms').disabled='';document.getElementById('discovercolumnobjectlink').disabled='';"/></dt>
<dd> <dd>
<dl> <dl>
<dt><i>File Path</i></dt> <dt><i>File Path</i></dt>
<dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd> <dd><input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled"></dd>
<dt><i>Column for Literals</i></dt> <dt><i>Column for Literals</i></dt>
<dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd> <dd><input type="number" id="discovercolumnliteral" name="discovercolumnliteral" min="0" max="99" step="1" size="2" value="0" disabled="disabled" style="width:50px;"> (first has index 0)</dd>
<dt><i>Synonyms</i></dt>
<dd><input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="none" checked="checked"/> <i>no Synonyms</i><br/>
<input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="enrichsynonyms"/> <i>Auto-Enrich with Synonyms from Stemming Library</i><br/>
<input type="radio" name="discoversynonymsmethod" id="discoversynonymsmethod" value="readcolumn" onclick="document.getElementById('discovercolumnsynonyms').value=parseInt(document.getElementById('discovercolumnliteral').value)+1;"/> <i>Read Column</i><br/>
<input type="number" id="discovercolumnsynonyms" name="discovercolumnsynonyms" min="-1" max="99" step="1" size="2" value="-1" style="width:50px;"> (first has index 0)
</dd>
<dt><i>Column for Object Link (optional)</i></dt> <dt><i>Column for Object Link (optional)</i></dt>
<dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd> <dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
<dt><i>Charset of Import File</i></dt> <dt><i>Charset of Import File</i></dt>
<dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd> <dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
<dt><i>Auto-Enrich with Synonyms from Stemming Library</i></dt>
<dd><input type="checkbox" name="discoverenrichsynonyms" id="discoverenrichsynonyms" checked="checked" /></dd>
</dl> </dl>
</dd> </dd>
<dt></dt><dd><input type="submit" name="create" value="Create" /></dd> <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
@ -155,6 +163,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<dt>Predicate</dt><dd>#[predicate]#</dd> <dt>Predicate</dt><dd>#[predicate]#</dd>
<dt>Prefix</dt><dd>#[prefix]#</dd> <dt>Prefix</dt><dd>#[prefix]#</dd>
<dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd> <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
<dt>Is Facet?</dt><dd><input type="checkbox" name="isFacet"#(isFacet)#:: checked="checked"#(/isFacet)#/> (If checked, this vocabulary is used for search facets. Not feasible for large vocabularies!)</dd>
</dl> </dl>
<table class="sortable" border="0"> <table class="sortable" border="0">
<tr class="TableHeader" valign="bottom"> <tr class="TableHeader" valign="bottom">

@ -26,6 +26,7 @@ import java.io.InputStreamReader;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
@ -72,18 +73,21 @@ public class Vocabulary_p {
if (discoveruri == null) discoverobjectspace = ""; if (discoveruri == null) discoverobjectspace = "";
Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>(); Map<String, Tagging.SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
final boolean isFacet = post.getBoolean("isFacet");
final boolean discoverNot = post.get("discovermethod", "").equals("none"); final boolean discoverNot = post.get("discovermethod", "").equals("none");
final boolean discoverFromPath = post.get("discovermethod", "").equals("path"); final boolean discoverFromPath = post.get("discovermethod", "").equals("path");
final boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); final boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv"); final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
final String discoverFromCSVPath = post.get("discoverpath", ""); final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
final String discoverFromCSVCharset = post.get("charset", "UTF-8"); final String discoverFromCSVCharset = post.get("charset", "UTF-8");
final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0); final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1); final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null; final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
final boolean discoverenrichsynonyms = post.getBoolean("discoverenrichsynonyms"); final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms");
final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn");
Segment segment = sb.index; Segment segment = sb.index;
String t; String t;
if (!discoverNot) { if (!discoverNot) {
@ -91,23 +95,40 @@ public class Vocabulary_p {
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset)); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
String line = null; String line = null;
Pattern semicolon = Pattern.compile(";"); Pattern semicolon = Pattern.compile(";");
Map<String, String> synonym2literal = new HashMap<>(); // helper map to check if there are double synonyms
while ((line = r.readLine()) != null) { while ((line = r.readLine()) != null) {
if (line.length() == 0) continue; if (line.length() == 0) continue;
String[] l = semicolon.split(line); String[] l = semicolon.split(line);
if (l.length == 0) l = new String[]{line}; if (l.length == 0) l = new String[]{line};
String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral].trim(); String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral].trim();
if (literal == null) continue; if (literal == null) continue;
if (literal.length() > 0 && (literal.charAt(0) == '"' || literal.charAt(0) == '\'')) literal = literal.substring(1); literal = normalizeLiteral(literal);
if (literal.length() > 0 && (literal.charAt(literal.length() - 1) == '"' || literal.charAt(literal.length() - 1) == '\'')) literal = literal.substring(0, literal.length() - 1);
String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink].trim(); String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink].trim();
if (literal.length() > 0) { if (literal.length() > 0) {
String synonyms = Tagging.normalizeTerm(literal); String synonyms = "";
if (discoverenrichsynonyms) { if (discoverenrichsynonyms) {
Set<String> sy = SynonymLibrary.getSynonyms(literal); Set<String> sy = SynonymLibrary.getSynonyms(literal);
if (sy != null) { if (sy != null) {
for (String s: sy) synonyms += "," + s; for (String s: sy) synonyms += "," + s;
} }
} else if (discoverreadcolumn) {
synonyms = discovercolumnsynonyms < 0 || l.length <= discovercolumnsynonyms ? null : l[discovercolumnsynonyms].trim();
synonyms = normalizeLiteral(synonyms);
} else {
synonyms = Tagging.normalizeTerm(literal);
} }
// check double synonyms
if (synonyms.length() > 0) {
String oldliteral = synonym2literal.get(synonyms);
if (oldliteral != null) {
// replace old entry with combined new
table.remove(oldliteral);
String newliteral = oldliteral + "," + literal;
literal = newliteral;
}
synonym2literal.put(synonyms, literal);
}
// store term
table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink)); table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink));
} }
} }
@ -160,6 +181,7 @@ public class Vocabulary_p {
} }
} }
Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
newvoc.setFacet(isFacet);
LibraryProvider.autotagging.addVocabulary(newvoc); LibraryProvider.autotagging.addVocabulary(newvoc);
vocabularyName = discovername; vocabularyName = discovername;
vocabulary = newvoc; vocabulary = newvoc;
@ -205,6 +227,11 @@ public class Vocabulary_p {
vocabulary = null; vocabulary = null;
vocabularyName = null; vocabularyName = null;
} }
// check the isFacet property
if (vocabulary != null && post.containsKey("isFacet")) {
vocabulary.setFacet(post.getBoolean("isFacet"));
}
} }
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
@ -231,6 +258,7 @@ public class Vocabulary_p {
prop.putHTML("edit_name", vocabulary.getName()); prop.putHTML("edit_name", vocabulary.getName());
prop.putXML("edit_namexml", vocabulary.getName()); prop.putXML("edit_namexml", vocabulary.getName());
prop.putHTML("edit_namespace", vocabulary.getNamespace()); prop.putHTML("edit_namespace", vocabulary.getNamespace());
prop.put("edit_isFacet", vocabulary.isFacet() ? 1 : 0);
prop.put("edit_size", vocabulary.size()); prop.put("edit_size", vocabulary.size());
prop.putHTML("edit_predicate", vocabulary.getPredicate()); prop.putHTML("edit_predicate", vocabulary.getPredicate());
prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX); prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
@ -279,4 +307,11 @@ public class Vocabulary_p {
// return rewrite properties // return rewrite properties
return prop; return prop;
} }
/**
 * Strips at most one leading and at most one trailing quote character
 * (double quote or single quote) from the given value.
 * Both ends are examined independently of each other, so an unmatched
 * quote at either end is removed as well.
 *
 * @param literal the raw value; may be null
 * @return the value without one enclosing quote character per side,
 *         or the empty string if the given value is null
 */
private static String normalizeLiteral(String literal) {
    if (literal == null) {
        return "";
    }
    String result = literal;
    // leading quote
    if (!result.isEmpty()) {
        final char first = result.charAt(0);
        if (first == '"' || first == '\'') {
            result = result.substring(1);
        }
    }
    // trailing quote; re-check the length because the string may be empty now
    if (!result.isEmpty()) {
        final int lastIndex = result.length() - 1;
        final char last = result.charAt(lastIndex);
        if (last == '"' || last == '\'') {
            result = result.substring(0, lastIndex);
        }
    }
    return result;
}
} }

@ -53,6 +53,7 @@ public class Tagging {
private final Map<String, String> term2objectlink; private final Map<String, String> term2objectlink;
private final Map<String, Set<String>> synonym2synonyms; private final Map<String, Set<String>> synonym2synonyms;
private File propFile; private File propFile;
private boolean isFacet; // true if the vocabulary shall generate a navigation facet
private String predicate, namespace, objectspace; private String predicate, namespace, objectspace;
@ -99,6 +100,7 @@ public class Tagging {
this.predicate = this.namespace + name; this.predicate = this.namespace + name;
this.objectspace = null; this.objectspace = null;
this.propFile = null; this.propFile = null;
this.isFacet = true;
} }
public Tagging(String name, File propFile) throws IOException { public Tagging(String name, File propFile) throws IOException {
@ -263,6 +265,14 @@ public class Tagging {
} }
} }
/**
 * Tells whether this vocabulary shall generate a navigation facet.
 *
 * @return the current value of the facet flag
 */
public boolean isFacet() {
    return isFacet;
}
/**
 * Sets the flag that says whether this vocabulary shall generate a
 * navigation facet. This method only assigns the field of this object.
 *
 * @param isFacet true if the vocabulary shall generate a navigation facet
 */
public void setFacet(final boolean isFacet) {
    this.isFacet = isFacet;
}
public int size() { public int size() {
return this.term2objectlink.size(); return this.term2objectlink.size();
} }
@ -346,7 +356,7 @@ public class Tagging {
public void setObjectspace(String os) throws IOException { public void setObjectspace(String os) throws IOException {
if (this.propFile == null) return; if (this.propFile == null) return;
if (os == null || (this.objectspace != null && this.objectspace.equals(os))) return; if (os == null || os.length() == 0 || (this.objectspace != null && this.objectspace.equals(os))) return;
this.objectspace = os; this.objectspace = os;
File tmp = tmpFile(); File tmp = tmpFile();
BufferedWriter w = new BufferedWriter(new FileWriter(tmp)); BufferedWriter w = new BufferedWriter(new FileWriter(tmp));

@ -230,7 +230,11 @@ public final class QueryParams {
// handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield)) // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield))
if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName()); if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName());
} }
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
if (v.isFacet()) {
this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
}
}
this.maxfacets = defaultmaxfacets; this.maxfacets = defaultmaxfacets;
this.cachedQuery = null; this.cachedQuery = null;
} }

Loading…
Cancel
Save