// DidYouMeanLibrary.java // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 01.10.2009 on http://yacy.net // // This is a part of YaCy // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package de.anomic.data; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashSet; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.zip.GZIPInputStream; import net.yacy.kelondro.logging.Log; /** * provide a completion library for the did-you-mean class * */ public class DidYouMeanLibrary { private final File dictionaryPath; private TreeSet dict, tcid; /** * create a new dictionary * This loads all files that ends with '.words' * The files must have one word per line * Comment lines may be given and are encoded as line starting with '#' * @param dictionaryPath path to a directory with library files */ public DidYouMeanLibrary(final File dictionaryPath) { this.dictionaryPath = dictionaryPath; reload(); } /** * scan the input directory and load all dictionaries (again) */ public void reload() { this.dict = new TreeSet(); this.tcid = new TreeSet(); if (dictionaryPath == null || !dictionaryPath.exists()) return; final String[] files = dictionaryPath.list(); for (final String f: files) { if (f.endsWith(".words")) try { inputStream(new File(dictionaryPath, f)); } catch (IOException e) { Log.logException(e); } } } private void inputStream(final File file) throws IOException { InputStream is = new FileInputStream(file); if (file.getName().endsWith(".gz")) { is = new GZIPInputStream(is); } final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); String l; try { while ((l = reader.readLine()) != null) { if (l.length() == 0 || l.charAt(0) == '#') continue; l = l.trim().toLowerCase(); this.dict.add(l); this.tcid.add(reverse(l)); } } catch (IOException e) { // finish } } private static String reverse(final String s) { StringBuilder sb = new StringBuilder(s.length()); for (int i = s.length() - 1; i >= 0; i--) { sb.append(s.charAt(i)); } return sb.toString(); } /** * read the dictionary and construct a set of recommendations to a given string * @param s input value that is used to match recommendations * @return set that contains all words that start or end with the input value */ public Set recommend(final String s) { final Set ret = new HashSet(); String string = s.trim().toLowerCase(); SortedSet t = this.dict.tailSet(string); for (final String r: t) { if (r.startsWith(string)) ret.add(r); else break; } string = reverse(string); t = this.tcid.tailSet(string); for (final String r: t) { if (r.startsWith(string)) ret.add(reverse(r)); else break; } return ret; } /** * check if the library contains the given word * @param s the given word * @return true if the library contains the word */ public boolean contains(final String s) { return this.dict.contains(s.trim().toLowerCase()); // if the above case is true then it is also true for this.tcid and vice versa // that means it does not need to be tested as well } /** * check if the library supports the given word * A word is supported, if the library contains a word * that starts or ends with the given word * @param s the given word * @return true if the library supports the word */ public boolean supports(final String s) { String string = s.trim().toLowerCase(); SortedSet t = this.dict.tailSet(string); for (final String r: t) { if (string.startsWith(r)) return true; else break; } string = reverse(string); t = this.tcid.tailSet(string); for (final String r: t) { if (string.startsWith(r)) return true; else break; } return false; } /** * the size of the dictionay * @return the number of words in the dictionary */ public int size() { return this.dict.size(); } /** * a property that is used during the construction of recommendation: * if the dictionary is too small, then the non-existence of constructed words * is not relevant for the construction of artificially constructed words * If this property returns true, all other words must be in the dictionary * @param minimumWords * @return */ public boolean isRelevant(final int minimumWords) { return this.dict.size() >= minimumWords; } }