package de.anomic.tools; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.LinkedBlockingQueue; import de.anomic.document.Word; import de.anomic.kelondro.text.IndexCell; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.yacy.logging.Log; /** * People make mistakes when they type words. * The most common mistakes are the four categories listed below: *
 * <ol>
 * <li>Changing one letter: bat / cat;</li>
 * <li>Adding one letter: bat / boat;</li>
 * <li>Deleting one letter: frog / fog; or</li>
 * <li>Reversing two consecutive letters: two / tow.</li>
 * </ol>
 * DidYouMean provides producer threads that feed a blocking queue with word variations according to
 * the four categories listed above. Consumer threads then check the generated word variations against a term index.
 * Only words contained in the term index are returned by the getSuggestion method.

* @author apfelmaennchen */
public class DidYouMean {

    /** Letter alphabet: a-z followed by the German letters a-umlaut, o-umlaut, u-umlaut and sharp s (written as Unicode escapes). */
    protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p', 'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};

    /** Number of processors reported by the runtime when this class is initialised. */
    public static final int availableCPU = Runtime.getRuntime().availableProcessors();

    /** Queue of candidate strings; Consumer threads take entries from it and treat the string "\n" as a stop signal. */
    final LinkedBlockingQueue queue = new LinkedBlockingQueue();

    /** Result set; a synchronized sorted set or a synchronized hash set, depending on the constructor's sort flag. */
    protected final Set set;

    /** Term index that is queried through has() and count(). */
    protected final IndexCell index;

    /** The word currently being processed; getSuggestion stores it lower-cased. */
    protected String word;

    /** Length of the word as it was passed to getSuggestion (taken from the argument, not from the lower-cased copy). */
    protected int len;

    /**
     * Creates an instance bound to the given term index, with an empty word and a length of 0.
     * With sort set, results are kept in a synchronized TreeSet ordered by wordSizeComparator;
     * otherwise they are kept in a synchronized HashSet.
     * @param index a termIndex - most likely retrieved from a switchboard object.
     * @param sort true/false - sorts the resulting TreeSet by index.count(); Warning: this causes heavy i/o.
     */
    public DidYouMean(final IndexCell index, boolean sort) {
        if(sort) this.set = Collections.synchronizedSortedSet(new TreeSet(new wordSizeComparator()));
        else this.set = Collections.synchronizedSet(new HashSet());
        this.word = "";
        this.len = 0;
        this.index = index;
    }

    /**
     * Creates an unsorted instance; equivalent to calling the two-argument constructor with sort = false.
     * @param index a termIndex - most likely retrieved from a switchboard object.
     */
    public DidYouMean(final IndexCell index) {
        this(index, false);
    }

    /**
     * This method triggers the 4 producer and 8 consumer threads of DidYouMean.
     * It simply delegates to getSuggestion(word, 500).
     * <p>
     * Note: the default timeout is 500ms
     * @param word a String with a single word
     * @return a Set<String> with word variations contained in index.
     */
    public Set getSuggestion(final String word) {
        return getSuggestion(word, 500);
    }

    /**
     * This method triggers the 4 producer and 8 consumer threads of the DidYouMean object.
     * The visible part records the start time, stores the lower-cased word and stores the
     * length of the word as passed in.
     * <p>
     * NOTE(review): the remainder of this method is not readable in this copy of the file - see
     * the note inside the body. The thread counts above are taken from the original
     * documentation and cannot be confirmed from the visible code.
     * <p>
     * NOTE(review): the declared return type is a raw Set although this documentation speaks of
     * Set<String>; type arguments seem to have been lost throughout the file - TODO confirm.
     * @param word a String with a single word
     * @param timeout execution time in ms.
     * @return a Set<String> with word variations contained in term index.
     */
    public Set getSuggestion(final String word, long timeout) {
        long startTime = System.currentTimeMillis();
        this.word = word.toLowerCase();
        this.len = word.length();
        /*
         * NOTE(review): the next line looks damaged. It starts with a double-slash marker, so
         * everything after it - the rest of this method and the text of the four producer
         * classes (ChangingOneLetter, DeletingOneLetter, AddingOneLetter,
         * ReversingTwoConsecutiveLetters) - is lexically a single line comment. Several
         * "for(int i=0; i" loops also break off mid-expression. The line is reproduced
         * unchanged; it presumably has to be restored from an intact copy - TODO confirm.
         */
        // create producers // the intention of the 4 producers is to mix results, as there // is currently no default sorting or ranking due to the i/o performance of index.count() Thread[] producers = new Thread[4]; producers[0] = new ChangingOneLetter(); producers[1] = new AddingOneLetter(); producers[2] = new DeletingOneLetter(); producers[3] = new ReversingTwoConsecutiveLetters(); // start producers for (int i=0; i * Note: the loop runs (alphabet.length * len) tests. */ public class ChangingOneLetter extends Thread { public void run() { String s; for(int i=0; i * Note: the loop runs (len) tests. */ protected class DeletingOneLetter extends Thread { public void run() { String s; for(int i=0; i * Note: the loop runs (alphabet.length * len) tests. */ protected class AddingOneLetter extends Thread { public void run() { String s; for(int i=0; i<=len;i++) { for(int j=0; j * Note: this causes no or moderate i/o as it uses the efficient index.has() method.
*/
    /**
     * Thread that repeatedly takes a string from the queue and hands it to consume().
     * When it takes the string "\n" it interrupts itself instead; the loop ends once an
     * InterruptedException is caught.
     * <p>
     * NOTE(review): the comment terminator directly above this class has no visible opener -
     * the start of the original documentation comment appears to be lost.
     */
    class Consumer extends Thread {

        /** Takes entries from the queue until this thread is interrupted. */
        public void run() {
            try {
                while(true) {
                    String s = queue.take();
                    if(s.equals("\n")) this.interrupt();
                    else consume(s);
                }
            } catch (InterruptedException e) {
                return;
            }
        }

        /**
         * Adds the given string to the result set if the index reports, via has(), that it
         * contains the hash produced by Word.word2hash for that string.
         * @param s the candidate string to look up
         */
        void consume(String s) {
            if (index.has(Word.word2hash(s))) {
                set.add(s);
            }
        }
    }

    /**
     * wordSizeComparator is used by DidYouMean to order terms by index.count().
     * The term with the larger count sorts first, because the second argument's count is
     * compared against the first's.
     * <p>
     * Warning: this causes heavy i/o
     * <p>
     * NOTE(review): two different terms with the same count compare as 0. A TreeSet built on
     * this comparator would then treat them as the same element and keep only one of them -
     * TODO confirm whether that is intended.
     */
    protected class wordSizeComparator implements Comparator {

        /**
         * Compares two terms by the count the index reports for their hashes.
         * @param o1 first term
         * @param o2 second term
         * @return the result of comparing the count of o2 with the count of o1
         */
        public int compare(final String o1, final String o2) {
            final Integer i1 = index.count(Word.word2hash(o1));
            final Integer i2 = index.count(Word.word2hash(o2));
            return i2.compareTo(i1);
        }
    }
}