From 6a2a669db4c245967fcea393686e33419599c785 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 19 Nov 2014 17:36:56 +0100 Subject: [PATCH] added loading of the synonyms file from addon/synonyms into the knowledge loader --- htroot/DictionaryLoader_p.html | 17 ++++++++++++++ htroot/DictionaryLoader_p.java | 21 ++++++++++++++++++ .../language/synonyms/SynonymLibrary.java | 22 +++++++++---------- .../net/yacy/data/ymark/YMarkAutoTagger.java | 2 +- source/net/yacy/document/Condenser.java | 5 ++--- source/net/yacy/document/LibraryProvider.java | 3 +-- .../yacy/document/parser/torrentParser.java | 2 +- source/net/yacy/search/Switchboard.java | 4 ++-- .../net/yacy/search/index/DocumentIndex.java | 2 +- source/net/yacy/search/index/Segment.java | 2 +- 10 files changed, 58 insertions(+), 22 deletions(-) diff --git a/htroot/DictionaryLoader_p.html b/htroot/DictionaryLoader_p.html index 2a0e2e409..cf375d5af 100644 --- a/htroot/DictionaryLoader_p.html +++ b/htroot/DictionaryLoader_p.html @@ -206,6 +206,23 @@ + +
+
+ Synonyms + Synonyms are used to find not only the searched word but also its synonyms. This is done by adding all synonyms of words in documents to the document and searching for the synonyms as well. + 

OpenThesaurus - German Thesaurus from http://www.openthesaurus.de

+

The data from this source was converted to the YaCy synonym file format and is part of the YaCy distribution.

+ +
+
+
#(syn0Status)#
Deactivated
::
Activated
#(/syn0Status)#
+
Action
+
#(syn0Status)#::#(/syn0Status)#
+
+
+
#%env/templates/footer.template%# diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java index 7cfbb5a40..58c056a09 100644 --- a/htroot/DictionaryLoader_p.java +++ b/htroot/DictionaryLoader_p.java @@ -18,6 +18,8 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.net.MalformedURLException; @@ -25,6 +27,7 @@ import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeonamesLocation; import net.yacy.cora.geo.OpenGeoDBLocation; +import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; @@ -274,6 +277,24 @@ public class DictionaryLoader_p { LibraryProvider.initDidYouMean(); prop.put("drw0ActionActivated", 1); } + + final File synonym_de_default = new File(new File(new File(sb.appPath, "addon"), "synonyms"), "openthesaurus_de_yacy"); + final File synonyms_path = new File(sb.dictionariesPath, LibraryProvider.path_to_synonym_dictionaries); + final File synonym_de_production = new File(synonyms_path, synonym_de_default.getName()); + if (post.containsKey("syn0Deactivate")) { + synonym_de_production.delete(); + SynonymLibrary.init(synonyms_path); + } + + if (post.containsKey("syn0Activate")) { + try { + FileUtils.copy(new FileInputStream(synonym_de_default), synonym_de_production); + } catch (IOException e) { + ConcurrentLog.logException(e); + } + SynonymLibrary.init(synonyms_path); + } + prop.put("syn0Status", synonym_de_production.exists() ? 
1 : 0); // check status again boolean keepPlacesTagging = false; diff --git a/source/net/yacy/cora/language/synonyms/SynonymLibrary.java b/source/net/yacy/cora/language/synonyms/SynonymLibrary.java index e55fbe15c..ecd39a808 100644 --- a/source/net/yacy/cora/language/synonyms/SynonymLibrary.java +++ b/source/net/yacy/cora/language/synonyms/SynonymLibrary.java @@ -43,10 +43,10 @@ import net.yacy.cora.util.ConcurrentLog; public class SynonymLibrary { private final static ConcurrentLog log = new ConcurrentLog(SynonymLibrary.class.getName()); - private Map>> lib; - - public SynonymLibrary(final File path) { - this.lib = new HashMap>>(); + private final static Map>> lib = new HashMap>>(); + + public static void init(final File path) { + lib.clear(); if (!path.exists() || !path.isDirectory()) return; final String[] files = path.list(); for (final String f: files) { @@ -70,10 +70,10 @@ public class SynonymLibrary { keys.add(t.substring(0, 2)); } for (String key: keys) { - List> symsetlist = this.lib.get(key); + List> symsetlist = lib.get(key); if (symsetlist == null) { symsetlist = new ArrayList>(); - this.lib.put(key, symsetlist); + lib.put(key, symsetlist); } symsetlist.add(synonyms); } @@ -83,9 +83,9 @@ public class SynonymLibrary { } } } - - public int size() { - return this.lib.size(); + + public static int size() { + return lib.size(); } /** @@ -93,11 +93,11 @@ public class SynonymLibrary { * @param word * @return a list of synonyms bot without the requested word */ - public Set getSynonyms(String word) { + public static Set getSynonyms(String word) { word = word.toLowerCase(); if (word.length() < 2) return null; String key = word.substring(0, 2); - List> symsetlist = this.lib.get(key); + List> symsetlist = lib.get(key); if (symsetlist == null) return null; for (Set symset: symsetlist) { if (symset.contains(word)) { diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index 25d9eb9a1..0d49dffe3 100644 --- 
a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -86,7 +86,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } //get words from document - final Map words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words(); + final Map words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words(); // generate potential tags from document title, description and subject final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index b4e4d3a3e..982dbc9a3 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -91,7 +91,6 @@ public final class Condenser { final boolean indexText, final boolean indexMedia, final WordCache meaningLib, - final SynonymLibrary synlib, final boolean doAutotagging ) { Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging @@ -221,9 +220,9 @@ public final class Condenser { } // create the synonyms set - if (synlib != null && synlib.size() > 0) { + if (SynonymLibrary.size() > 0) { for (String word: this.words.keySet()) { - Set syms = synlib.getSynonyms(word); + Set syms = SynonymLibrary.getSynonyms(word); if (syms != null) this.synonyms.addAll(syms); } } diff --git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java index 668076279..0fa434a82 100644 --- a/source/net/yacy/document/LibraryProvider.java +++ b/source/net/yacy/document/LibraryProvider.java @@ -62,7 +62,6 @@ public class LibraryProvider { public static WordCache dymLib = new WordCache(null); public static AutotaggingLibrary autotagging = null; - public static SynonymLibrary synonyms = null; public static URLRewriterLibrary urlRewriter = null; public 
static OverarchingLocation geoLoc = new OverarchingLocation(); private static File dictSource = null; @@ -187,7 +186,7 @@ public class LibraryProvider { if ( !synonymPath.exists() ) { synonymPath.mkdirs(); } - synonyms = new SynonymLibrary(synonymPath); + SynonymLibrary.init(synonymPath); } public static void initRewriter() { final File rewriterPath = new File(dictRoot, path_to_rewriter_dictionaries); diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index cf522ca04..08191a6dc 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -120,7 +120,7 @@ public class torrentParser extends AbstractParser implements Parser { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); - Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false); + Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false); Map w = c.words(); for (Map.Entry e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); } catch (final IOException e) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index f687a1940..72fa47dbc 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2736,7 +2736,7 @@ public final class Switchboard extends serverSwitch { new Condenser( in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), - LibraryProvider.dymLib, LibraryProvider.synonyms, true); + LibraryProvider.dymLib, true); // update image result list statistics // its good to do this concurrently here, because it needs a DNS lookup @@ -3171,7 +3171,7 @@ public final class Switchboard extends serverSwitch { 
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) { throw new Parser.Failure("indexing is denied", url); } - final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true); + final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true); ResultImages.registerImages(url, document, true); Switchboard.this.webStructure.generateCitationReference(url, document); storeDocumentIndex( diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 77eb7120e..949b165dc 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -158,7 +158,7 @@ public class DocumentIndex extends Segment { int c = 0; for ( final Document document : documents ) { if (document == null) continue; - final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true); + final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true); rows[c++] = super.storeDocument( url, diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index bb6b5020c..063858c09 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -744,7 +744,7 @@ public class Segment { } // get the word set Set words = null; - words = new Condenser(document, true, true, null, null, false).words().keySet(); + words = new Condenser(document, true, true, null, false).words().keySet(); // delete all word references int count = 0;