added loading of the synonyms file from addon/synonyms into the knowledge loader
pull/1/head
Michael Peter Christen 10 years ago
parent c67c5c0709
commit 6a2a669db4

@ -206,6 +206,23 @@
</dl> </dl>
</fieldset> </fieldset>
</form> </form>
<form action="DictionaryLoader_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>Synonyms</legend>
Synonyms are used to find not only the searched word but also its synonyms. This is done by adding the synonyms of all words in a document to that document and searching the synonyms as well.
<h4><a href="http://www.openthesaurus.de" target="_blank">OpenThesaurus - German Thesaurus from http://www.openthesaurus.de</a></h4>
<p>The data from this source was converted to the YaCy synonym file format and is part of the YaCy distribution.</p>
<dl>
<dt><label>Status</label></dt>
<dd>#(syn0Status)#<div class="info">Deactivated</div>::<div class="commit">Activated</div>#(/syn0Status)#</dd>
<dt>Action</dt>
<dd>#(syn0Status)#<input type="submit" name="syn0Activate" value="Activate" />::<input type="submit" name="syn0Deactivate" value="Deactivate" />#(/syn0Status)#</dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%# #%env/templates/footer.template%#
</body> </body>
</html> </html>

@ -18,6 +18,8 @@
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
@ -25,6 +27,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeonamesLocation; import net.yacy.cora.geo.GeonamesLocation;
import net.yacy.cora.geo.OpenGeoDBLocation; import net.yacy.cora.geo.OpenGeoDBLocation;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
@ -275,6 +278,24 @@ public class DictionaryLoader_p {
prop.put("drw0ActionActivated", 1); prop.put("drw0ActionActivated", 1);
} }
final File synonym_de_default = new File(new File(new File(sb.appPath, "addon"), "synonyms"), "openthesaurus_de_yacy");
final File synonyms_path = new File(sb.dictionariesPath, LibraryProvider.path_to_synonym_dictionaries);
final File synonym_de_production = new File(synonyms_path, synonym_de_default.getName());
if (post.containsKey("syn0Deactivate")) {
synonym_de_production.delete();
SynonymLibrary.init(synonyms_path);
}
if (post.containsKey("syn0Activate")) {
try {
FileUtils.copy(new FileInputStream(synonym_de_default), synonym_de_production);
} catch (IOException e) {
ConcurrentLog.logException(e);
}
SynonymLibrary.init(synonyms_path);
}
prop.put("syn0Status", synonym_de_production.exists() ? 1 : 0);
// check status again // check status again
boolean keepPlacesTagging = false; boolean keepPlacesTagging = false;
for (final LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) { for (final LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) {

@ -43,10 +43,10 @@ import net.yacy.cora.util.ConcurrentLog;
public class SynonymLibrary { public class SynonymLibrary {
private final static ConcurrentLog log = new ConcurrentLog(SynonymLibrary.class.getName()); private final static ConcurrentLog log = new ConcurrentLog(SynonymLibrary.class.getName());
private Map<String, List<Set<String>>> lib; private final static Map<String, List<Set<String>>> lib = new HashMap<String, List<Set<String>>>();
public SynonymLibrary(final File path) { public static void init(final File path) {
this.lib = new HashMap<String, List<Set<String>>>(); lib.clear();
if (!path.exists() || !path.isDirectory()) return; if (!path.exists() || !path.isDirectory()) return;
final String[] files = path.list(); final String[] files = path.list();
for (final String f: files) { for (final String f: files) {
@ -70,10 +70,10 @@ public class SynonymLibrary {
keys.add(t.substring(0, 2)); keys.add(t.substring(0, 2));
} }
for (String key: keys) { for (String key: keys) {
List<Set<String>> symsetlist = this.lib.get(key); List<Set<String>> symsetlist = lib.get(key);
if (symsetlist == null) { if (symsetlist == null) {
symsetlist = new ArrayList<Set<String>>(); symsetlist = new ArrayList<Set<String>>();
this.lib.put(key, symsetlist); lib.put(key, symsetlist);
} }
symsetlist.add(synonyms); symsetlist.add(synonyms);
} }
@ -84,8 +84,8 @@ public class SynonymLibrary {
} }
} }
public int size() { public static int size() {
return this.lib.size(); return lib.size();
} }
/** /**
@ -93,11 +93,11 @@ public class SynonymLibrary {
* @param word * @param word
* @return a list of synonyms but without the requested word * @return a list of synonyms but without the requested word
*/ */
public Set<String> getSynonyms(String word) { public static Set<String> getSynonyms(String word) {
word = word.toLowerCase(); word = word.toLowerCase();
if (word.length() < 2) return null; if (word.length() < 2) return null;
String key = word.substring(0, 2); String key = word.substring(0, 2);
List<Set<String>> symsetlist = this.lib.get(key); List<Set<String>> symsetlist = lib.get(key);
if (symsetlist == null) return null; if (symsetlist == null) return null;
for (Set<String> symset: symsetlist) { for (Set<String> symset: symsetlist) {
if (symset.contains(word)) { if (symset.contains(word)) {

@ -86,7 +86,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
} }
//get words from document //get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words(); final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words();
// generate potential tags from document title, description and subject // generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;

@ -91,7 +91,6 @@ public final class Condenser {
final boolean indexText, final boolean indexText,
final boolean indexMedia, final boolean indexMedia,
final WordCache meaningLib, final WordCache meaningLib,
final SynonymLibrary synlib,
final boolean doAutotagging final boolean doAutotagging
) { ) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
@ -221,9 +220,9 @@ public final class Condenser {
} }
// create the synonyms set // create the synonyms set
if (synlib != null && synlib.size() > 0) { if (SynonymLibrary.size() > 0) {
for (String word: this.words.keySet()) { for (String word: this.words.keySet()) {
Set<String> syms = synlib.getSynonyms(word); Set<String> syms = SynonymLibrary.getSynonyms(word);
if (syms != null) this.synonyms.addAll(syms); if (syms != null) this.synonyms.addAll(syms);
} }
} }

@ -62,7 +62,6 @@ public class LibraryProvider {
public static WordCache dymLib = new WordCache(null); public static WordCache dymLib = new WordCache(null);
public static AutotaggingLibrary autotagging = null; public static AutotaggingLibrary autotagging = null;
public static SynonymLibrary synonyms = null;
public static URLRewriterLibrary urlRewriter = null; public static URLRewriterLibrary urlRewriter = null;
public static OverarchingLocation geoLoc = new OverarchingLocation(); public static OverarchingLocation geoLoc = new OverarchingLocation();
private static File dictSource = null; private static File dictSource = null;
@ -187,7 +186,7 @@ public class LibraryProvider {
if ( !synonymPath.exists() ) { if ( !synonymPath.exists() ) {
synonymPath.mkdirs(); synonymPath.mkdirs();
} }
synonyms = new SynonymLibrary(synonymPath); SynonymLibrary.init(synonymPath);
} }
public static void initRewriter() { public static void initRewriter() {
final File rewriterPath = new File(dictRoot, path_to_rewriter_dictionaries); final File rewriterPath = new File(dictRoot, path_to_rewriter_dictionaries);

@ -120,7 +120,7 @@ public class torrentParser extends AbstractParser implements Parser {
byte[] b = FileUtils.read(new File(args[0])); byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser(); torrentParser parser = new torrentParser();
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false); Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false);
Map<String, Word> w = c.words(); Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (final IOException e) { } catch (final IOException e) {

@ -2736,7 +2736,7 @@ public final class Switchboard extends serverSwitch {
new Condenser( new Condenser(
in.documents[i], in.queueEntry.profile().indexText(), in.documents[i], in.queueEntry.profile().indexText(),
in.queueEntry.profile().indexMedia(), in.queueEntry.profile().indexMedia(),
LibraryProvider.dymLib, LibraryProvider.synonyms, true); LibraryProvider.dymLib, true);
// update image result list statistics // update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup // its good to do this concurrently here, because it needs a DNS lookup
@ -3171,7 +3171,7 @@ public final class Switchboard extends serverSwitch {
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) { if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url); throw new Parser.Failure("indexing is denied", url);
} }
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true); final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
ResultImages.registerImages(url, document, true); ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document); Switchboard.this.webStructure.generateCitationReference(url, document);
storeDocumentIndex( storeDocumentIndex(

@ -158,7 +158,7 @@ public class DocumentIndex extends Segment {
int c = 0; int c = 0;
for ( final Document document : documents ) { for ( final Document document : documents ) {
if (document == null) continue; if (document == null) continue;
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true); final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
rows[c++] = rows[c++] =
super.storeDocument( super.storeDocument(
url, url,

@ -744,7 +744,7 @@ public class Segment {
} }
// get the word set // get the word set
Set<String> words = null; Set<String> words = null;
words = new Condenser(document, true, true, null, null, false).words().keySet(); words = new Condenser(document, true, true, null, false).words().keySet();
// delete all word references // delete all word references
int count = 0; int count = 0;

Loading…
Cancel
Save