diff --git a/source/de/anomic/tools/DidYouMean.java b/source/de/anomic/tools/DidYouMean.java index cab32cf41..ab7e984e5 100644 --- a/source/de/anomic/tools/DidYouMean.java +++ b/source/de/anomic/tools/DidYouMean.java @@ -1,8 +1,10 @@ package de.anomic.tools; import java.util.Collections; +import java.util.Comparator; import java.util.HashSet; import java.util.Set; +import java.util.TreeSet; import java.util.concurrent.LinkedBlockingQueue; import de.anomic.kelondro.text.IndexCell; @@ -10,19 +12,26 @@ import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.plasma.parser.Word; import de.anomic.yacy.logging.Log; -// People make mistakes when they type words. -// The most common mistakes are the four categories listed below: -// (1) Changing one letter: bat / cat; -// (2) Adding one letter: bat / boat; -// (3) Deleting one letter: frog / fog; or -// (4) Reversing two consecutive letters: two / tow. - +/** + * People make mistakes when they type words. + * The most common mistakes are the four categories listed below: + *
    + *
  1. Changing one letter: bat / cat;
    + *
  2. Adding one letter: bat / boat;
    + *
  3. Deleting one letter: frog / fog; or
    + *
  4. Reversing two consecutive letters: two / tow.
    + *
+ * DidYouMean provides producer threads that feed a blocking queue with word variations according to + * the above-mentioned four categories. Consumer threads then check the generated word variations against a term index. + * Only words contained in the term index are returned by the getSuggestion method.

+ * @author apfelmaennchen + */ public class DidYouMean { private static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p', 'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'}; - private static final long TIMEOUT = 500; + public static final int availableCPU = Runtime.getRuntime().availableProcessors(); final LinkedBlockingQueue queue = new LinkedBlockingQueue(); private final Set set; @@ -30,20 +39,52 @@ public class DidYouMean { private String word; private int len; - public DidYouMean(final IndexCell index) { - // this.set = Collections.synchronizedSortedSet(new TreeSet(new wordSizeComparator())); - this.set = Collections.synchronizedSet(new HashSet()); + + /** + * @param index a termIndex - most likely retrieved from a switchboard object. + * @param sort true/false - sorts the resulting TreeSet by index.count(); Warning: this causes heavy i/o. + */ + public DidYouMean(final IndexCell index, boolean sort) { + if(sort) + this.set = Collections.synchronizedSortedSet(new TreeSet(new wordSizeComparator())); + else + this.set = Collections.synchronizedSet(new HashSet()); this.word = ""; this.len = 0; this.index = index; } + /** + * @param index a termIndex - most likely retrieved from a switchboard object. + */ + public DidYouMean(final IndexCell index) { + this(index, false); + } + + /** + * This method triggers the 4 producer and 8 consumer threads of DidYouMean. + *

Note: the default timeout is 500ms + * @param word a String with a single word + * @return a Set<String> with word variations contained in index. + */ public Set getSuggestion(final String word) { + return getSuggestion(word, 500); + } + + /** + * This method triggers the 4 producer and 8 consumer threads of the DidYouMean object. + * @param word a String with a single word + * @param timeout execution time in ms. + * @return a Set<String> with word variations contained in term index. + */ + public Set getSuggestion(final String word, long timeout) { long startTime = System.currentTimeMillis(); this.word = word.toLowerCase(); this.len = word.length(); // create producers + // the intention of the 4 producers is to mix results, as there + // is currently no default sorting or ranking due to the i/o performance of index.count() Thread[] producers = new Thread[4]; producers[0] = new ChangingOneLetter(); producers[1] = new AddingOneLetter(); @@ -55,8 +96,8 @@ public class DidYouMean { producers[i].start(); } - // create and start 8 consumers threads - Thread[] consumers = new Thread[8]; + // create and start consumers threads + Thread[] consumers = new Thread[availableCPU]; for (int i=0; i + * Note: the loop runs (alphabet.length * len) tests. + */ private class ChangingOneLetter extends Thread { - // tests: alphabet.length * len public void run() { String s; for(int i=0; i + * Note: the loop runs (len) tests. + */ private class DeletingOneLetter extends Thread { - // tests: len public void run() { String s; for(int i=0; i + * Note: the loop runs (alphabet.length * len) tests. + */ private class AddingOneLetter extends Thread { - // tests: alphabet.length * len public void run() { String s; for(int i=0; i<=len;i++) { @@ -153,10 +196,13 @@ public class DidYouMean { } } } - + /** + * DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term + * and puts it on the blocking queue, to be 'consumed' by a consumer thread.

+ * Note: the loop runs (len-1) tests. + */ private class ReversingTwoConsecutiveLetters extends Thread { - // tests: (len - 1) public void run() { String s; for(int i=0; i + * Note: this causes no or moderate i/o as it uses the efficient index.has() method. + */ class Consumer extends Thread { public void run() { @@ -190,9 +240,11 @@ public class DidYouMean { set.add(s); } } - } - - /* + } + /** + * wordSizeComparator is used by DidYouMean to order terms by index.count()

+ * Warning: this causes heavy i/o + */ private class wordSizeComparator implements Comparator { public int compare(final String o1, final String o2) { final Integer i1 = index.count(Word.word2hash(o1)); @@ -200,7 +252,7 @@ public class DidYouMean { return i2.compareTo(i1); } } - */ + }