package de.anomic.tools;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.document.Word;
import de.anomic.kelondro.text.IndexCell;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.yacy.logging.Log;
/**
* People make mistakes when they type words.
* The most common mistakes are the four categories listed below:
*
* - Changing one letter: bat / cat;
* - Adding one letter: bat / boat;
* - Deleting one letter: frog / fog; or
* - Reversing two consecutive letters: two / tow.
*
* DidYouMean provides producer threads that feed a blocking queue with word variations according to
* the four categories listed above. Consumer threads then check the generated word variations against a term index.
* Only words contained in the term index are returned by the getSuggestion method.
* @author apfelmaennchen
*/
public class DidYouMean {
// Letters used when building word variations: a-z plus the German
// a-umlaut, o-umlaut, u-umlaut and sharp s.
protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
    'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};

// Number of processors reported by the runtime when this class is initialised.
public static final int availableCPU = Runtime.getRuntime().availableProcessors();

// Hand-over queue of candidate words; Consumer threads take Strings from it.
final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<String>();

// Result collection; the constructor installs a synchronized wrapper around it.
protected final Set<String> set;

// Term index the candidates are checked against.
// NOTE(review): presumably IndexCell<WordReference> (WordReference is imported but unused) - confirm.
protected final IndexCell index;

// Lower-cased word of the current getSuggestion() call.
protected String word;

// Length of the word passed to the current getSuggestion() call.
protected int len;
/**
 * Creates a DidYouMean instance that checks word variations against the given term index.
 * @param index a termIndex - most likely retrieved from a switchboard object.
 * @param sort if true, results are collected in a synchronized TreeSet ordered by
 *             wordSizeComparator, i.e. by index.count() (warning: this causes heavy i/o);
 *             if false, results are collected in a synchronized HashSet without ordering.
 */
public DidYouMean(final IndexCell index, boolean sort) {
    this.index = index;
    this.word = "";
    this.len = 0;
    if (sort) {
        // the comparator reads the index lazily, on each comparison
        this.set = Collections.synchronizedSortedSet(new TreeSet<String>(new wordSizeComparator()));
    } else {
        this.set = Collections.synchronizedSet(new HashSet<String>());
    }
}
/**
 * Creates a DidYouMean instance with sorting switched off; this is the same as calling
 * the two-argument constructor with sort set to false.
 * @param index a termIndex - most likely retrieved from a switchboard object.
 */
public DidYouMean(final IndexCell index) {
    this(index, false);
}
/**
 * Looks up suggestions for a word with the default timeout of 500ms by delegating to
 * getSuggestion(word, timeout), which starts the producer and consumer threads.
 * @param word a String with a single word
 * @return a {@code Set<String>} with word variations contained in index.
 */
public Set<String> getSuggestion(final String word) {
    return getSuggestion(word, 500);
}
/**
* This method triggers the 4 producer and 8 consumer threads of the DidYouMean object.
* @param word a String with a single word
* @param timeout execution time in ms.
* @return a Set<String> with word variations contained in term index.
*/
public Set getSuggestion(final String word, long timeout) {
long startTime = System.currentTimeMillis();
this.word = word.toLowerCase();
this.len = word.length();
// create producers
// the intention of the 4 producers is to mix results, as there
// is currently no default sorting or ranking due to the i/o performance of index.count()
Thread[] producers = new Thread[4];
producers[0] = new ChangingOneLetter();
producers[1] = new AddingOneLetter();
producers[2] = new DeletingOneLetter();
producers[3] = new ReversingTwoConsecutiveLetters();
// start producers
for (int i=0; i
* Note: the loop runs (alphabet.length * len) tests.
*/
public class ChangingOneLetter extends Thread {
public void run() {
String s;
for(int i=0; i
* Note: the loop runs (len) tests.
*/
protected class DeletingOneLetter extends Thread {
public void run() {
String s;
for(int i=0; i
* Note: the loop runs (alphabet.length * len) tests.
*/
protected class AddingOneLetter extends Thread {
public void run() {
String s;
for(int i=0; i<=len;i++) {
for(int j=0; j
* Note: the loop runs (len-1) tests.
*/
protected class ReversingTwoConsecutiveLetters extends Thread {
public void run() {
String s;
for(int i=0; i
* Note: this causes no or moderate i/o as it uses the efficient index.has() method.
*/
class Consumer extends Thread {
    // Worker thread: takes candidate words from the shared queue and records those
    // that are present in the term index.

    /**
     * Takes words from the queue until the end marker "\n" is received or the
     * thread is interrupted while waiting.
     */
    @Override
    public void run() {
        try {
            while (true) {
                final String s = queue.take();
                if ("\n".equals(s)) {
                    // End marker: stop right here. The previous code interrupted
                    // itself and relied on the next take() throwing to leave the loop.
                    return;
                }
                consume(s);
            }
        } catch (InterruptedException e) {
            // interrupted while blocked on the queue: this worker simply ends
            return;
        }
    }

    /**
     * Adds the word to the result set if the term index contains its hash.
     * @param s candidate word taken from the queue
     */
    void consume(String s) {
        if (index.has(Word.word2hash(s))) {
            set.add(s);
        }
    }
}
/**
 * wordSizeComparator is used by DidYouMean to order terms by index.count(),
 * terms with the larger count first. Terms with the same count are ordered
 * lexically, so that two different words never compare as equal; a TreeSet
 * would otherwise treat them as duplicates and keep only one of them.
 * Warning: this causes heavy i/o
 */
protected class wordSizeComparator implements Comparator<String> {
    public int compare(final String o1, final String o2) {
        final int count1 = index.count(Word.word2hash(o1));
        final int count2 = index.count(Word.word2hash(o2));
        if (count1 != count2) {
            // descending by count: the term with more references sorts first
            return (count2 < count1) ? -1 : 1;
        }
        // equal counts: fall back to the natural String order
        return o1.compareTo(o2);
    }
}
}