added loading of the synonyms file from addon/synonyms into the knowledge loader
pull/1/head
Michael Peter Christen 10 years ago
parent c67c5c0709
commit 6a2a669db4

@ -206,6 +206,23 @@
</dl>
</fieldset>
</form>
<form action="DictionaryLoader_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>Synonyms</legend>
Synonyms are used to find not only the searched word but also its synonyms. This is done by adding all synonyms of the words in a document to that document and searching the synonyms as well.
<h4><a href="http://www.openthesaurus.de" target="_blank">OpenThesaurus - German Thesaurus from http://www.openthesaurus.de</a></h4>
<p>The data from this source was converted to the YaCy synonym file format and is part of the YaCy distribution.</p>
<dl>
<dt><label>Status</label></dt>
<dd>#(syn0Status)#<div class="info">Deactivated</div>::<div class="commit">Activated</div>#(/syn0Status)#</dd>
<dt>Action</dt>
<dd>#(syn0Status)#<input type="submit" name="syn0Activate" value="Activate" />::<input type="submit" name="syn0Deactivate" value="Deactivate" />#(/syn0Status)#</dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@ -18,6 +18,8 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
@ -25,6 +27,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeonamesLocation;
import net.yacy.cora.geo.OpenGeoDBLocation;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -274,6 +277,24 @@ public class DictionaryLoader_p {
LibraryProvider.initDidYouMean();
prop.put("drw0ActionActivated", 1);
}
// Synonym dictionary handling (OpenThesaurus, German).
// The file shipped with the application lives at <appPath>/addon/synonyms/openthesaurus_de_yacy;
// "activating" copies it into the synonym dictionary directory, "deactivating" removes that copy.
final File synonym_de_default = new File(new File(new File(sb.appPath, "addon"), "synonyms"), "openthesaurus_de_yacy");
final File synonyms_path = new File(sb.dictionariesPath, LibraryProvider.path_to_synonym_dictionaries);
final File synonym_de_production = new File(synonyms_path, synonym_de_default.getName());
if (post.containsKey("syn0Deactivate")) {
    // remove the production copy and reload the library from what is left in the directory
    synonym_de_production.delete();
    SynonymLibrary.init(synonyms_path);
}
if (post.containsKey("syn0Activate")) {
    FileInputStream defaultSynonyms = null;
    try {
        defaultSynonyms = new FileInputStream(synonym_de_default);
        FileUtils.copy(defaultSynonyms, synonym_de_production);
    } catch (IOException e) {
        ConcurrentLog.logException(e);
    } finally {
        // Release the file handle on success and on failure alike.
        // NOTE(review): FileUtils.copy is not visible here; if it already closes its
        // source, this second close is a harmless no-op.
        if (defaultSynonyms != null) {
            try {
                defaultSynonyms.close();
            } catch (IOException e) {
                ConcurrentLog.logException(e);
            }
        }
    }
    // reload in any case so the in-memory library reflects the current directory content
    SynonymLibrary.init(synonyms_path);
}
// report the state to the template: 1 when the production copy exists, 0 otherwise
prop.put("syn0Status", synonym_de_production.exists() ? 1 : 0);
// check status again
boolean keepPlacesTagging = false;

@ -43,10 +43,10 @@ import net.yacy.cora.util.ConcurrentLog;
public class SynonymLibrary {
private final static ConcurrentLog log = new ConcurrentLog(SynonymLibrary.class.getName());
private Map<String, List<Set<String>>> lib;
public SynonymLibrary(final File path) {
this.lib = new HashMap<String, List<Set<String>>>();
private final static Map<String, List<Set<String>>> lib = new HashMap<String, List<Set<String>>>();
public static void init(final File path) {
lib.clear();
if (!path.exists() || !path.isDirectory()) return;
final String[] files = path.list();
for (final String f: files) {
@ -70,10 +70,10 @@ public class SynonymLibrary {
keys.add(t.substring(0, 2));
}
for (String key: keys) {
List<Set<String>> symsetlist = this.lib.get(key);
List<Set<String>> symsetlist = lib.get(key);
if (symsetlist == null) {
symsetlist = new ArrayList<Set<String>>();
this.lib.put(key, symsetlist);
lib.put(key, symsetlist);
}
symsetlist.add(synonyms);
}
@ -83,9 +83,9 @@ public class SynonymLibrary {
}
}
}
public int size() {
return this.lib.size();
public static int size() {
return lib.size();
}
/**
@ -93,11 +93,11 @@ public class SynonymLibrary {
* @param word
* @return a list of synonyms but without the requested word
*/
public Set<String> getSynonyms(String word) {
public static Set<String> getSynonyms(String word) {
word = word.toLowerCase();
if (word.length() < 2) return null;
String key = word.substring(0, 2);
List<Set<String>> symsetlist = this.lib.get(key);
List<Set<String>> symsetlist = lib.get(key);
if (symsetlist == null) return null;
for (Set<String> symset: symsetlist) {
if (symset.contains(word)) {

@ -86,7 +86,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
}
//get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words();
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;

@ -91,7 +91,6 @@ public final class Condenser {
final boolean indexText,
final boolean indexMedia,
final WordCache meaningLib,
final SynonymLibrary synlib,
final boolean doAutotagging
) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
@ -221,9 +220,9 @@ public final class Condenser {
}
// create the synonyms set
if (synlib != null && synlib.size() > 0) {
if (SynonymLibrary.size() > 0) {
for (String word: this.words.keySet()) {
Set<String> syms = synlib.getSynonyms(word);
Set<String> syms = SynonymLibrary.getSynonyms(word);
if (syms != null) this.synonyms.addAll(syms);
}
}

@ -62,7 +62,6 @@ public class LibraryProvider {
public static WordCache dymLib = new WordCache(null);
public static AutotaggingLibrary autotagging = null;
public static SynonymLibrary synonyms = null;
public static URLRewriterLibrary urlRewriter = null;
public static OverarchingLocation geoLoc = new OverarchingLocation();
private static File dictSource = null;
@ -187,7 +186,7 @@ public class LibraryProvider {
if ( !synonymPath.exists() ) {
synonymPath.mkdirs();
}
synonyms = new SynonymLibrary(synonymPath);
SynonymLibrary.init(synonymPath);
}
public static void initRewriter() {
final File rewriterPath = new File(dictRoot, path_to_rewriter_dictionaries);

@ -120,7 +120,7 @@ public class torrentParser extends AbstractParser implements Parser {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false);
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (final IOException e) {

@ -2736,7 +2736,7 @@ public final class Switchboard extends serverSwitch {
new Condenser(
in.documents[i], in.queueEntry.profile().indexText(),
in.queueEntry.profile().indexMedia(),
LibraryProvider.dymLib, LibraryProvider.synonyms, true);
LibraryProvider.dymLib, true);
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
@ -3171,7 +3171,7 @@ public final class Switchboard extends serverSwitch {
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url);
}
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document);
storeDocumentIndex(

@ -158,7 +158,7 @@ public class DocumentIndex extends Segment {
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
rows[c++] =
super.storeDocument(
url,

@ -744,7 +744,7 @@ public class Segment {
}
// get the word set
Set<String> words = null;
words = new Condenser(document, true, true, null, null, false).words().keySet();
words = new Condenser(document, true, true, null, false).words().keySet();
// delete all word references
int count = 0;

Loading…
Cancel
Save