DidYouMean:

- limit the number of consumer threads to available CPUs
- added some javadoc

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6144 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 16 years ago
parent 7eb3bff5b3
commit a10c8022d1

@ -1,8 +1,10 @@
package de.anomic.tools; package de.anomic.tools;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.kelondro.text.IndexCell; import de.anomic.kelondro.text.IndexCell;
@ -10,19 +12,26 @@ import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.plasma.parser.Word; import de.anomic.plasma.parser.Word;
import de.anomic.yacy.logging.Log; import de.anomic.yacy.logging.Log;
// People make mistakes when they type words. /**
// The most common mistakes are the four categories listed below: * People make mistakes when they type words.
// (1) Changing one letter: bat / cat; * The most common mistakes are the four categories listed below:
// (2) Adding one letter: bat / boat; * <ol>
// (3) Deleting one letter: frog / fog; or * <li>Changing one letter: bat / cat;</li>
// (4) Reversing two consecutive letters: two / tow. * <li>Adding one letter: bat / boat;</li>
* <li>Deleting one letter: frog / fog; or</li>
* <li>Reversing two consecutive letters: two / tow.</li>
* </ol>
* DidYouMean provides producer threads that feed a blocking queue with word variations according to
* the four categories mentioned above. Consumer threads then check the generated word variations against a term index.
* Only words contained in the term index are returned by the getSuggestion method.<p/>
* @author apfelmaennchen
*/
public class DidYouMean { public class DidYouMean {
private static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p', private static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'}; 'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};
private static final long TIMEOUT = 500;
public static final int availableCPU = Runtime.getRuntime().availableProcessors();
final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<String>(); final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<String>();
private final Set<String> set; private final Set<String> set;
@ -30,20 +39,52 @@ public class DidYouMean {
private String word; private String word;
private int len; private int len;
public DidYouMean(final IndexCell<WordReference> index) {
// this.set = Collections.synchronizedSortedSet(new TreeSet<String>(new wordSizeComparator())); /**
this.set = Collections.synchronizedSet(new HashSet<String>()); * @param index a termIndex - most likely retrieved from a switchboard object.
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
*/
public DidYouMean(final IndexCell<WordReference> index, boolean sort) {
if(sort)
this.set = Collections.synchronizedSortedSet(new TreeSet<String>(new wordSizeComparator()));
else
this.set = Collections.synchronizedSet(new HashSet<String>());
this.word = ""; this.word = "";
this.len = 0; this.len = 0;
this.index = index; this.index = index;
} }
/**
* Creates a DidYouMean instance that collects its suggestions in an unsorted set;
* equivalent to calling the two-argument constructor with sort set to false.
* @param index a termIndex - most likely retrieved from a switchboard object.
*/
public DidYouMean(final IndexCell<WordReference> index) {
this(index, false);
}
/**
* This method triggers the 4 producer threads and the consumer threads (one per available CPU) of DidYouMean.
* <p/><b>Note:</b> the default timeout is 500ms
* @param word a String with a single word
* @return a Set&lt;String&gt; with word variations contained in index.
*/
public Set<String> getSuggestion(final String word) { public Set<String> getSuggestion(final String word) {
return getSuggestion(word, 500);
}
/**
* This method triggers the 4 producer threads and the consumer threads (one per available CPU) of the DidYouMean object.
* @param word a String with a single word
* @param timeout execution time in ms.
* @return a Set&lt;String&gt; with word variations contained in term index.
*/
public Set<String> getSuggestion(final String word, long timeout) {
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
this.word = word.toLowerCase(); this.word = word.toLowerCase();
this.len = word.length(); this.len = word.length();
// create producers // create producers
// the intention of the 4 producers is to mix results, as there
// is currently no default sorting or ranking due to the i/o performance of index.count()
Thread[] producers = new Thread[4]; Thread[] producers = new Thread[4];
producers[0] = new ChangingOneLetter(); producers[0] = new ChangingOneLetter();
producers[1] = new AddingOneLetter(); producers[1] = new AddingOneLetter();
@ -55,8 +96,8 @@ public class DidYouMean {
producers[i].start(); producers[i].start();
} }
// create and start 8 consumers threads // create and start consumers threads
Thread[] consumers = new Thread[8]; Thread[] consumers = new Thread[availableCPU];
for (int i=0; i<consumers.length; i++) { for (int i=0; i<consumers.length; i++) {
consumers[i] = new Consumer(); consumers[i] = new Consumer();
consumers[i].start(); consumers[i].start();
@ -64,32 +105,25 @@ public class DidYouMean {
// check if timeout has been reached // check if timeout has been reached
boolean cont = false; boolean cont = false;
while(((System.currentTimeMillis()-startTime) < TIMEOUT)) { while(((System.currentTimeMillis()-startTime) < timeout)) {
// checks if queue is already empty
if(queue.size()==0) { if(queue.size()==0) {
// check if at least one producers is still running // check if at least one producers is still running and potentially filling the queue
for (int i=0; i<producers.length; i++) { for (int i=0; i<producers.length; i++) {
if(producers[i].isAlive()) if(producers[i].isAlive())
cont = true; cont = true;
} }
// as the queue is empty and no producer is running we can break the timeout-loop
if(!cont) break; if(!cont) break;
} }
} }
// interupt all consumer threads // interrupt all consumer threads
for (int i=0; i<consumers.length; i++) { for (int i=0; i<consumers.length; i++) {
consumers[i].interrupt(); consumers[i].interrupt();
} }
/* put "poison pill" for each consumer thread // interrupt all remaining producer threads
for (int i=0; i<consumers.length; i++) {
try {
queue.put("\n");
} catch (InterruptedException e) {
}
}
*/
// interupt all remaining producer threads
for (int i=0; i<producers.length; i++) { for (int i=0; i<producers.length; i++) {
producers[i].interrupt(); producers[i].interrupt();
} }
@ -101,10 +135,13 @@ public class DidYouMean {
return this.set; return this.set;
} }
/**
* DidYouMean's producer thread that changes one letter (e.g. bat/cat) for a given term
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
*/
private class ChangingOneLetter extends Thread { private class ChangingOneLetter extends Thread {
// tests: alphabet.length * len
public void run() { public void run() {
String s; String s;
for(int i=0; i<len; i++) { for(int i=0; i<len; i++) {
@ -119,10 +156,13 @@ public class DidYouMean {
} }
} }
} }
/**
* DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (len) tests.
*/
private class DeletingOneLetter extends Thread { private class DeletingOneLetter extends Thread {
// tests: len
public void run() { public void run() {
String s; String s;
for(int i=0; i<len;i++) { for(int i=0; i<len;i++) {
@ -135,10 +175,13 @@ public class DidYouMean {
} }
} }
} }
/**
* DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
*/
private class AddingOneLetter extends Thread { private class AddingOneLetter extends Thread {
// tests: alphabet.length * len
public void run() { public void run() {
String s; String s;
for(int i=0; i<=len;i++) { for(int i=0; i<=len;i++) {
@ -153,10 +196,13 @@ public class DidYouMean {
} }
} }
} }
/**
* DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (len-1) tests.
*/
private class ReversingTwoConsecutiveLetters extends Thread { private class ReversingTwoConsecutiveLetters extends Thread {
// tests: (len - 1)
public void run() { public void run() {
String s; String s;
for(int i=0; i<len-1; i++) { for(int i=0; i<len-1; i++) {
@ -169,7 +215,11 @@ public class DidYouMean {
} }
} }
} }
/**
* DidYouMean's consumer thread takes a String object (term) from the blocking queue
* and checks if it is contained in YaCy's RWI index. The thread recognizes "\n" as poison pill!<p/>
* <b>Note:</b> this causes no or moderate i/o as it uses the efficient index.has() method.
*/
class Consumer extends Thread { class Consumer extends Thread {
public void run() { public void run() {
@ -190,9 +240,11 @@ public class DidYouMean {
set.add(s); set.add(s);
} }
} }
} }
/**
/* * wordSizeComparator is used by DidYouMean to order terms by index.count()<p/>
* <b>Warning:</b> this causes heavy i/o
*/
private class wordSizeComparator implements Comparator<String> { private class wordSizeComparator implements Comparator<String> {
public int compare(final String o1, final String o2) { public int compare(final String o1, final String o2) {
final Integer i1 = index.count(Word.word2hash(o1)); final Integer i1 = index.count(Word.word2hash(o1));
@ -200,7 +252,7 @@ public class DidYouMean {
return i2.compareTo(i1); return i2.compareTo(i1);
} }
} }
*/
} }

Loading…
Cancel
Save