Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

pull/1/head
Michael Peter Christen 12 years ago
commit 41ab2a2279

@ -0,0 +1,112 @@
/**
* Stemming
* Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 01.10.2012 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.language.synonyms;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import org.apache.log4j.Logger;
import net.yacy.cora.storage.Files;
/**
* Stemming library: reads stemming files and creates a mapping from words to synonyms
* Stemming files must have a list of synonym words in each line of the input file.
* The words within one line must be separated by ','. Lines starting with '#' are
* comment files and are ignored. Each line can (but does not need to) have a '{'
* at the beginning of the line and '}' at the end (which would be the GSA format).
*/
public class SynonymLibrary {
Logger log = Logger.getLogger(SynonymLibrary.class);
private Map<String, List<Set<String>>> lib;
public SynonymLibrary(final File path) {
this.lib = new HashMap<String, List<Set<String>>>();
if (!path.exists() || !path.isDirectory()) return;
final String[] files = path.list();
for (final String f: files) {
File ff = new File(path, f);
String line;
try {
BlockingQueue<String> list = Files.concurentLineReader(ff, 1000);
while ((line = list.take()) != Files.POISON_LINE) {
line = line.trim();
if (line.length() == 0 || line.charAt(0) == '#') continue;
if (line.charAt(line.length() - 1) == '}') line = line.substring(0, line.length() - 1);
if (line.charAt(0) == '{') line = line.substring(1);
String[] words = line.split(",");
Set<String> synonyms = new HashSet<String>();
Set<String> keys = new HashSet<String>();
for (String s: words) {
s = s.trim();
if (s.length() < 2) continue;
String t = s.toLowerCase();
synonyms.add(t);
keys.add(t.substring(0, 2));
}
for (String key: keys) {
List<Set<String>> symsetlist = this.lib.get(key);
if (symsetlist == null) {
symsetlist = new ArrayList<Set<String>>();
this.lib.put(key, symsetlist);
}
symsetlist.add(synonyms);
}
}
} catch (Throwable e) {
log.warn("cannot read stemming file " + f, e);
}
}
}
/**
* for a given word, return a list of synonym words
* @param word
* @return a list of synonyms bot without the requested word
*/
public Set<String> getSynonyms(String word) {
word = word.toLowerCase();
if (word.length() < 2) return null;
String key = word.substring(0, 2);
List<Set<String>> symsetlist = this.lib.get(key);
if (symsetlist == null) return null;
for (Set<String> symset: symsetlist) {
if (symset.contains(word)) {
// create a new set containing all but the one word
Set<String> returnSet = new HashSet<String>();
for (String synonym: symset) {
if (synonym.equals(word)) continue;
returnSet.add(synonym);
}
return returnSet;
}
}
return null;
}
}
Loading…
Cancel
Save