diff --git a/source/net/yacy/cora/language/synonyms/SynonymLibrary.java b/source/net/yacy/cora/language/synonyms/SynonymLibrary.java new file mode 100644 index 000000000..63e564348 --- /dev/null +++ b/source/net/yacy/cora/language/synonyms/SynonymLibrary.java @@ -0,0 +1,112 @@ +/** + * SynonymLibrary + * Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * first published 01.10.2012 on http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. + */ + +package net.yacy.cora.language.synonyms; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.BlockingQueue; + +import org.apache.log4j.Logger; + +import net.yacy.cora.storage.Files; + +/** + * Synonym library: reads synonym files and creates a mapping from words to synonyms. + * Synonym files must have a list of synonym words in each line of the input file. + * The words within one line must be separated by ','. Lines starting with '#' are + * comment lines and are ignored. Each line can (but does not need to) have a '{' + * at the beginning of the line and '}' at the end (which would be the GSA format). 
+ */ +public class SynonymLibrary { + + Logger log = Logger.getLogger(SynonymLibrary.class); + private Map>> lib; + + public SynonymLibrary(final File path) { + this.lib = new HashMap>>(); + if (!path.exists() || !path.isDirectory()) return; + final String[] files = path.list(); + for (final String f: files) { + File ff = new File(path, f); + String line; + try { + BlockingQueue list = Files.concurentLineReader(ff, 1000); + while ((line = list.take()) != Files.POISON_LINE) { + line = line.trim(); + if (line.length() == 0 || line.charAt(0) == '#') continue; + if (line.charAt(line.length() - 1) == '}') line = line.substring(0, line.length() - 1); + if (line.charAt(0) == '{') line = line.substring(1); + String[] words = line.split(","); + Set synonyms = new HashSet(); + Set keys = new HashSet(); + for (String s: words) { + s = s.trim(); + if (s.length() < 2) continue; + String t = s.toLowerCase(); + synonyms.add(t); + keys.add(t.substring(0, 2)); + } + for (String key: keys) { + List> symsetlist = this.lib.get(key); + if (symsetlist == null) { + symsetlist = new ArrayList>(); + this.lib.put(key, symsetlist); + } + symsetlist.add(synonyms); + } + } + } catch (Throwable e) { + log.warn("cannot read stemming file " + f, e); + } + } + } + + /** + * for a given word, return a list of synonym words + * @param word + * @return a list of synonyms bot without the requested word + */ + public Set getSynonyms(String word) { + word = word.toLowerCase(); + if (word.length() < 2) return null; + String key = word.substring(0, 2); + List> symsetlist = this.lib.get(key); + if (symsetlist == null) return null; + for (Set symset: symsetlist) { + if (symset.contains(word)) { + // create a new set containing all but the one word + Set returnSet = new HashSet(); + for (String synonym: symset) { + if (synonym.equals(word)) continue; + returnSet.add(synonym); + } + return returnSet; + } + } + return null; + } + +}