You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
260 lines
8.1 KiB
260 lines
8.1 KiB
package de.anomic.tools;
|
|
|
|
import java.util.Collections;
|
|
import java.util.Comparator;
|
|
import java.util.HashSet;
|
|
import java.util.Set;
|
|
import java.util.TreeSet;
|
|
import java.util.concurrent.LinkedBlockingQueue;
|
|
|
|
import de.anomic.document.Word;
|
|
import de.anomic.kelondro.text.IndexCell;
|
|
import de.anomic.kelondro.text.referencePrototype.WordReference;
|
|
import de.anomic.yacy.logging.Log;
|
|
|
|
/**
|
|
* People make mistakes when they type words.
|
|
* The most common mistakes are the four categories listed below:
|
|
* <ol>
|
|
* <li>Changing one letter: bat / cat;</li>
|
|
* <li>Adding one letter: bat / boat;</li>
|
|
* <li>Deleting one letter: frog / fog; or</li>
|
|
* <li>Reversing two consecutive letters: two / tow.</li>
|
|
* </ol>
|
|
* DidYouMean provides producer threads, that feed a blocking queue with word variations according to
|
|
* the above mentioned four categories. Consumer threads check then the generated word variations against a term index.
|
|
* Only words contained in the term index are return by the getSuggestion method.<p/>
|
|
* @author apfelmaennchen
|
|
*/
|
|
public class DidYouMean {
|
|
|
|
protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
|
|
'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};
|
|
|
|
public static final int availableCPU = Runtime.getRuntime().availableProcessors();
|
|
final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<String>();
|
|
|
|
protected final Set<String> set;
|
|
protected final IndexCell<WordReference> index;
|
|
protected String word;
|
|
protected int len;
|
|
|
|
|
|
/**
|
|
* @param index a termIndex - most likely retrieved from a switchboard object.
|
|
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
|
|
*/
|
|
public DidYouMean(final IndexCell<WordReference> index, boolean sort) {
|
|
if(sort)
|
|
this.set = Collections.synchronizedSortedSet(new TreeSet<String>(new wordSizeComparator()));
|
|
else
|
|
this.set = Collections.synchronizedSet(new HashSet<String>());
|
|
this.word = "";
|
|
this.len = 0;
|
|
this.index = index;
|
|
}
|
|
|
|
/**
|
|
* @param index a termIndex - most likely retrieved from a switchboard object.
|
|
*/
|
|
public DidYouMean(final IndexCell<WordReference> index) {
|
|
this(index, false);
|
|
}
|
|
|
|
/**
|
|
* This method triggers the 4 producer and 8 consumer threads of DidYouMean.
|
|
* <p/><b>Note:</b> the default timeout is 500ms
|
|
* @param word a String with a single word
|
|
* @return a Set<String> with word variations contained in index.
|
|
*/
|
|
public Set<String> getSuggestion(final String word) {
|
|
return getSuggestion(word, 500);
|
|
}
|
|
|
|
/**
|
|
* This method triggers the 4 producer and 8 consumer threads of the DidYouMean object.
|
|
* @param word a String with a single word
|
|
* @param timeout execution time in ms.
|
|
* @return a Set<String> with word variations contained in term index.
|
|
*/
|
|
public Set<String> getSuggestion(final String word, long timeout) {
|
|
long startTime = System.currentTimeMillis();
|
|
this.word = word.toLowerCase();
|
|
this.len = word.length();
|
|
|
|
// create producers
|
|
// the intention of the 4 producers is to mix results, as there
|
|
// is currently no default sorting or ranking due to the i/o performance of index.count()
|
|
Thread[] producers = new Thread[4];
|
|
producers[0] = new ChangingOneLetter();
|
|
producers[1] = new AddingOneLetter();
|
|
producers[2] = new DeletingOneLetter();
|
|
producers[3] = new ReversingTwoConsecutiveLetters();
|
|
|
|
// start producers
|
|
for (int i=0; i<producers.length; i++) {
|
|
producers[i].start();
|
|
}
|
|
|
|
// create and start consumers threads
|
|
Thread[] consumers = new Thread[availableCPU];
|
|
for (int i=0; i<consumers.length; i++) {
|
|
consumers[i] = new Consumer();
|
|
consumers[i].start();
|
|
}
|
|
|
|
// check if timeout has been reached
|
|
boolean cont = false;
|
|
while(((System.currentTimeMillis()-startTime) < timeout)) {
|
|
// checks if queue is already empty
|
|
if(queue.size()==0) {
|
|
// check if at least one producers is still running and potentially filling the queue
|
|
for (int i=0; i<producers.length; i++) {
|
|
if(producers[i].isAlive())
|
|
cont = true;
|
|
}
|
|
// as the queue is empty and no producer is running we can break the timeout-loop
|
|
if(!cont) break;
|
|
}
|
|
}
|
|
|
|
// interrupt all consumer threads
|
|
for (int i=0; i<consumers.length; i++) {
|
|
consumers[i].interrupt();
|
|
}
|
|
|
|
// interrupt all remaining producer threads
|
|
for (int i=0; i<producers.length; i++) {
|
|
producers[i].interrupt();
|
|
}
|
|
|
|
this.set.remove(word.toLowerCase());
|
|
Log.logInfo("DidYouMean", "found "+this.set.size()+" terms; execution time: "
|
|
+(System.currentTimeMillis()-startTime)+"ms"+ " - remaining queue size: "+queue.size());
|
|
|
|
return this.set;
|
|
|
|
}
|
|
/**
|
|
* DidYouMean's producer thread that changes one letter (e.g. bat/cat) for a given term
|
|
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
|
|
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
|
|
*/
|
|
public class ChangingOneLetter extends Thread {
|
|
|
|
public void run() {
|
|
String s;
|
|
for(int i=0; i<len; i++) {
|
|
for(int j=0; j<alphabet.length; j++) {
|
|
s = word.substring(0, i) + alphabet[j] + word.substring(i+1);
|
|
try {
|
|
queue.put(s);
|
|
} catch (InterruptedException e) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term
|
|
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
|
|
* <b>Note:</b> the loop runs (len) tests.
|
|
*/
|
|
protected class DeletingOneLetter extends Thread {
|
|
|
|
public void run() {
|
|
String s;
|
|
for(int i=0; i<len;i++) {
|
|
s = word.substring(0, i) + word.substring(i+1);
|
|
try {
|
|
queue.put(s);
|
|
} catch (InterruptedException e) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term
|
|
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
|
|
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
|
|
*/
|
|
protected class AddingOneLetter extends Thread {
|
|
|
|
public void run() {
|
|
String s;
|
|
for(int i=0; i<=len;i++) {
|
|
for(int j=0; j<alphabet.length; j++) {
|
|
s = word.substring(0, i) + alphabet[j] + word.substring(i);
|
|
try {
|
|
queue.put(s);
|
|
} catch (InterruptedException e) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term
|
|
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
|
|
* <b>Note:</b> the loop runs (len-1) tests.
|
|
*/
|
|
protected class ReversingTwoConsecutiveLetters extends Thread {
|
|
|
|
public void run() {
|
|
String s;
|
|
for(int i=0; i<len-1; i++) {
|
|
s = word.substring(0,i)+word.charAt(i+1)+word.charAt(i)+word.substring(i+2);
|
|
try {
|
|
queue.put(s);
|
|
} catch (InterruptedException e) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* DidYouMean's consumer thread takes a String object (term) from the blocking queue
|
|
* and checks if it is contained in YaCy's RWI index. The thread recognizes "\n" as poison pill!<p/>
|
|
* <b>Note:</b> this causes no or moderate i/o as it uses the efficient index.has() method.
|
|
*/
|
|
class Consumer extends Thread {
|
|
|
|
public void run() {
|
|
try {
|
|
while(true) {
|
|
String s = queue.take();
|
|
if(s.equals("\n"))
|
|
this.interrupt();
|
|
else
|
|
consume(s);
|
|
}
|
|
} catch (InterruptedException e) {
|
|
return;
|
|
}
|
|
}
|
|
void consume(String s) {
|
|
if (index.has(Word.word2hash(s))) {
|
|
set.add(s);
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* wordSizeComparator is used by DidYouMean to order terms by index.count()<p/>
|
|
* <b>Warning:</b> this causes heavy i/o
|
|
*/
|
|
protected class wordSizeComparator implements Comparator<String> {
|
|
public int compare(final String o1, final String o2) {
|
|
final Integer i1 = index.count(Word.word2hash(o1));
|
|
final Integer i2 = index.count(Word.word2hash(o2));
|
|
return i2.compareTo(i1);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|