From 1762a7bcd678ebf707738db80e0cb4f9200cb9c5 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 1 Sep 2009 13:04:35 +0000 Subject: [PATCH] - moved DidYouMean to the data package - added a DidYouMeanLibrary class that shall support the did you mean function with additional word lists git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6281 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacysearch.java | 2 +- .../de/anomic/{tools => data}/DidYouMean.java | 2 +- source/de/anomic/data/DidYouMeanLibrary.java | 170 ++++++++++++++++++ 3 files changed, 172 insertions(+), 2 deletions(-) rename source/de/anomic/{tools => data}/DidYouMean.java (99%) create mode 100644 source/de/anomic/data/DidYouMeanLibrary.java diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 8313d0eab..43201a0d7 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -34,6 +34,7 @@ import java.util.TreeSet; import de.anomic.content.RSSMessage; import de.anomic.crawler.retrieval.LoaderDispatcher; +import de.anomic.data.DidYouMean; import de.anomic.document.Condenser; import de.anomic.document.Word; import de.anomic.document.Document; @@ -55,7 +56,6 @@ import de.anomic.server.serverDomains; import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; -import de.anomic.tools.DidYouMean; import de.anomic.tools.iso639; import de.anomic.tools.Formatter; import de.anomic.yacy.yacyNewsPool; diff --git a/source/de/anomic/tools/DidYouMean.java b/source/de/anomic/data/DidYouMean.java similarity index 99% rename from source/de/anomic/tools/DidYouMean.java rename to source/de/anomic/data/DidYouMean.java index 40b68cf8a..c2513ea8b 100644 --- a/source/de/anomic/tools/DidYouMean.java +++ b/source/de/anomic/data/DidYouMean.java @@ -1,4 +1,4 @@ -package de.anomic.tools; +package de.anomic.data; import java.util.Collections; import java.util.Comparator; diff --git a/source/de/anomic/data/DidYouMeanLibrary.java b/source/de/anomic/data/DidYouMeanLibrary.java new file mode 100644 index 000000000..bac47a271 --- /dev/null +++ b/source/de/anomic/data/DidYouMeanLibrary.java @@ -0,0 +1,170 @@ +// DidYouMeanLibrary.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 01.10.2009 on http://yacy.net +// +// This is a part of YaCy +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.data; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; + +/** + * provide a completion library for the did-you-mean class + * + */ +public class DidYouMeanLibrary { + + private File dictionaryPath; + private TreeSet dict, tcid; + + /** + * create a new dictionary + * This loads all files that ends with '.words' + * The files must have one word per line + * Comment lines may be given and are encoded as line starting with '#' + * @param dictionaryPath a path to a directory with library files + */ + public DidYouMeanLibrary(File dictionaryPath) { + this.dictionaryPath = dictionaryPath; + reload(); + } + + /** + * scan the input directory and load all dictionaries (again) + */ + public void reload() { + this.dict = new TreeSet(); + this.tcid = new TreeSet(); + String[] files = dictionaryPath.list(); + for (String f: files) { + if (f.endsWith(".words")) try { + importFile(new File(dictionaryPath, f)); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + } + + private void importFile(File f) throws FileNotFoundException { + BufferedReader r = new BufferedReader(new FileReader(f)); + String l; + try { + while ((l = r.readLine()) != null) { + if (l.length() == 0 || l.charAt(0) == '#') continue; + l = l.trim().toLowerCase(); + this.dict.add(l); + this.tcid.add(reverse(l)); + } + } catch (IOException e) { + // finish + } + } + + private static String reverse(String s) { + StringBuilder r = new StringBuilder(s.length()); + for (int i = s.length() - 1; i >= 0; i--) r.append(s.charAt(i)); + return r.toString(); + } + + /** + * read the dictionary and construct a set of recommendations to a given string + * @param s input value that is used to match recommendations + * @return a set that contains all words that start or end with the input value + */ + public Set recommend(String s) { + Set a = new HashSet(); + s = s.trim().toLowerCase(); + SortedSet t = this.dict.tailSet(s); + for (String r: t) { + if (s.startsWith(r)) a.add(r); else break; + } + s = reverse(s); + t = this.tcid.tailSet(s); + for (String r: t) { + if (s.startsWith(r)) a.add(reverse(r)); else break; + } + return a; + } + + /** + * check if the library contains the given word + * @param s the given word + * @return true if the library contains the word + */ + public boolean contains(String s) { + return this.dict.contains(s.trim().toLowerCase()); + // if the above case is true then it is also true for this.tcid and vice versa + // that means it does not need to be tested as well + } + + /** + * check if the library supports the given word + * A word is supported, if the library contains a word + * that starts or ends with the given word + * @param s the given word + * @return true if the library supports the word + */ + public boolean supports(String s) { + s = s.trim().toLowerCase(); + SortedSet t = this.dict.tailSet(s); + for (String r: t) { + if (s.startsWith(r)) return true; else break; + } + s = reverse(s); + t = this.tcid.tailSet(s); + for (String r: t) { + if (s.startsWith(r)) return true; else break; + } + return false; + } + + /** + * the size of the dictionay + * @return the number of words in the dictionary + */ + public int size() { + return this.dict.size(); + } + + + /** + * a property that is used during the construction of recommendation: + * if the dictionary is too small, then the non-existence of constructed words + * is not relevant for the construction of artificially constructed words + * If this property returns true, all other words must be in the dictionary + * @param minimumWords + * @return + */ + public boolean isRelevant(int minimumWords) { + return this.dict.size() >= minimumWords; + } + +}