@ -2,7 +2,6 @@ package de.anomic.data;
import java.util.Collections ;
import java.util.Comparator ;
import java.util.HashSet ;
import java.util.Set ;
import java.util.TreeSet ;
import java.util.concurrent.LinkedBlockingQueue ;
@ -30,14 +29,17 @@ public class DidYouMean {
// letters that the producer threads substitute or insert when generating guesses
protected static final char[] alphabet = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
        'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\u00e4', '\u00f6', '\u00fc', '\u00df' };
// sentinel placed on a queue to tell the reader to stop; readers compare by identity against this instance
private static final String poisonString = "\n";
// number of consumer threads started by getSuggestion()
public static final int availableCPU = Runtime.getRuntime().availableProcessors();
// default result ordering (shortest word first), shared because it is stateless
private static final wordLengthComparator wlComp = new wordLengthComparator();

protected final IndexCell<WordReference> index;
protected String word;    // the lowercased search word that the producers vary
protected int wordLen;    // upper bound for the positions the producers iterate over
// guessGen: generated variations; guessLib: words recommended by the library (the queue the consumers read)
protected LinkedBlockingQueue<String> guessGen, guessLib;
protected long timeLimit; // absolute time (System.currentTimeMillis() scale) after which threads stop
protected boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
protected final Set<String> resultSet;
/ * *
@ -46,12 +48,15 @@ public class DidYouMean {
* /
public DidYouMean(final IndexCell<WordReference> index, boolean sort) {
    // the result set is always a synchronized sorted set because several consumer threads add to it;
    // 'sort' only selects the ordering: by index.count() (largest first) or by word length (shortest first)
    if (sort) {
        this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
    } else {
        this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(wlComp));
    }
    this.word = "";
    this.wordLen = 0;
    this.index = index;
    // both queues are unbounded
    this.guessGen = new LinkedBlockingQueue<String>();
    this.guessLib = new LinkedBlockingQueue<String>();
    // stays true until the first library recommendation is written to guessLib
    this.createGen = true;
}
/ * *
@ -62,7 +67,7 @@ public class DidYouMean {
}
/ * *
* This method triggers the producer and consumer threads of DidYouMean.
* < p / > < b > Note : < / b > the default timeout is 500 ms
* @param word a String with a single word
* @return a Set & lt ; String & gt ; with word variations contained in index .
@ -72,70 +77,83 @@ public class DidYouMean {
}
/ * *
* This method triggers the 4 producer and 8 consumer threads of the DidYouMean object .
* This method triggers the producer and consumer threads of the DidYouMean object .
* @param word a String with a single word
* @param timeout execution time in ms .
* @return a Set & lt ; String & gt ; with word variations contained in term index .
* /
public Set < String > getSuggestion ( final String word , long timeout ) {
long startTime = System . currentTimeMillis ( ) ;
this . timeLimit = startTime + timeout ;
this . word = word . toLowerCase ( ) ;
this . len = word . length ( ) ;
this . wordLen = word . length ( ) ;
// create one consumer thread that checks the guessLib queue
// for occurrences in the index. If the producers are started next, their
// results can be consumers directly
Consumer [ ] consumers = new Consumer [ availableCPU ] ;
consumers [ 0 ] = new Consumer ( ) ;
consumers [ 0 ] . start ( ) ;
// get a single recommendation for the word without altering the word
Set < String > libr = LibraryProvider . dymLib . recommend ( word ) ;
for ( String t : libr ) {
if ( ! t . equals ( word ) ) try {
createGen = false ;
guessLib . put ( t ) ;
} catch ( InterruptedException e ) { }
}
// create producers
// the intention of the 4 producers is to mix results, as there
// is currently no default sorting or ranking due to the i/o performance of index.count()
// create and start producers
// the CPU load to create the guessed words is very low, but the testing
// against the library may be CPU intensive. Since it is possible to test
// words in the library concurrently, it is a good idea to start separate threads
Thread [ ] producers = new Thread [ 4 ] ;
producers [ 0 ] = new ChangingOneLetter ( ) ;
producers [ 1 ] = new AddingOneLetter ( ) ;
producers [ 2 ] = new DeletingOneLetter ( ) ;
producers [ 3 ] = new ReversingTwoConsecutiveLetters ( ) ;
// start producers
for ( int i = 0 ; i < producers . length ; i + + ) {
producers [ i ] . start ( ) ;
}
// create and start consumers threads
Thread [ ] consumers = new Thread [ availableCPU ] ;
for ( int i = 0 ; i < consumers . length ; i + + ) {
consumers [ i ] = new Consumer ( ) ;
consumers [ i ] . start ( ) ;
}
// check if timeout has been reached
boolean cont = false ;
while ( ( ( System . currentTimeMillis ( ) - startTime ) < timeout ) ) {
// checks if queue is already empty
if ( queue . size ( ) = = 0 ) {
// check if at least one producers is still running and potentially filling the queue
for ( int i = 0 ; i < producers . length ; i + + ) {
if ( producers [ i ] . isAlive ( ) )
cont = true ;
}
// as the queue is empty and no producer is running we can break the timeout-loop
if ( ! cont ) break ;
}
}
producers [ 2 ] = new DeletingOneLetter ( ) ;
producers [ 3 ] = new ReversingTwoConsecutiveLetters ( ) ;
for ( Thread t : producers ) t . start ( ) ;
// start more consumers if there are more cores
if ( consumers . length > 1 ) for ( int i = 1 ; i < consumers . length ; i + + ) {
consumers [ i ] = new Consumer ( ) ;
consumers [ i ] . start ( ) ;
}
// now decide which kind of guess is better
// we take guessLib entries as long as there is any entry in it
// to see if this is the case, we must wait for termination of the producer
for ( Thread t : producers ) try { t . join ( ) ; } catch ( InterruptedException e ) { }
// interrupt all consumer threads
for ( int i = 0 ; i < consumers . length ; i + + ) {
consumers [ i ] . interrupt ( ) ;
}
// if there is not any entry in guessLib, then transfer all entries from the
// guessGen to guessLib
if ( createGen ) try {
this . guessGen . put ( poisonString ) ;
String s ;
while ( ( s = this . guessGen . take ( ) ) ! = poisonString ) this . guessLib . put ( s ) ;
} catch ( InterruptedException e ) { }
// interrupt all remaining producer threads
for ( int i = 0 ; i < producers . length ; i + + ) {
producers [ i ] . interrupt ( ) ;
}
// put poison into guessLib to terminate consumers
for ( @SuppressWarnings ( "unused" ) Consumer c : consumers )
try { guessLib . put ( poisonString ) ; } catch ( InterruptedException e ) { }
this . set . remove ( word . toLowerCase ( ) ) ;
Log . logInfo ( "DidYouMean" , "found " + this . set . size ( ) + " terms; execution time: "
+ ( System . currentTimeMillis ( ) - startTime ) + "ms" + " - remaining queue size: " + queue . size ( ) ) ;
// wait for termination of consumer
for ( Consumer c : consumers )
try { c . join ( ) ; } catch ( InterruptedException e ) { }
return this . set ;
// we don't want the given word in the result
this . resultSet . remove ( word . toLowerCase ( ) ) ;
// finished
Log . logInfo ( "DidYouMean" , "found " + this . resultSet . size ( ) + " terms; execution time: "
+ ( System . currentTimeMillis ( ) - startTime ) + "ms" + " - remaining queue size: " + guessLib . size ( ) ) ;
return this . resultSet ;
}
/ * *
/ * *
* DidYouMean ' s producer thread that changes one letter ( e . g . bat / cat ) for a given term
* based on the given alphabet and puts it on the blocking queue , to be ' consumed ' by a consumer thread . < p / >
* < b > Note : < / b > the loop runs ( alphabet . length * len ) tests .
@ -144,18 +162,20 @@ public class DidYouMean {
public void run ( ) {
String s ;
for ( int i = 0 ; i < len ; i + + ) {
for ( int j = 0 ; j < alphabet . length ; j + + ) {
s = word . substring ( 0 , i ) + alphabet [ j ] + word . substring ( i + 1 ) ;
try {
queue . put ( s ) ;
} catch ( InterruptedException e ) {
return ;
}
Set < String > libr ;
for ( int i = 0 ; i < wordLen ; i + + ) try {
for ( char c : alphabet ) {
s = word . substring ( 0 , i ) + c + word . substring ( i + 1 ) ;
libr = LibraryProvider . dymLib . recommend ( s ) ;
if ( libr . size ( ) ! = 0 ) createGen = false ;
for ( String t : libr ) guessLib . put ( t ) ;
if ( createGen ) guessGen . put ( s ) ;
if ( System . currentTimeMillis ( ) > timeLimit ) return ;
}
}
} catch ( InterruptedException e ) { }
}
}
/ * *
* DidYouMean ' s producer thread that deletes extra letters ( e . g . frog / fog ) for a given term
* and puts it on the blocking queue , to be ' consumed ' by a consumer thread . < p / >
@ -165,16 +185,18 @@ public class DidYouMean {
public void run ( ) {
String s ;
for ( int i = 0 ; i < len ; i + + ) {
Set < String > libr ;
for ( int i = 0 ; i < wordLen ; i + + ) try {
s = word . substring ( 0 , i ) + word . substring ( i + 1 ) ;
try {
queue . put ( s ) ;
} catch ( InterruptedException e ) {
return ;
}
}
libr = LibraryProvider . dymLib . recommend ( s ) ;
if ( libr . size ( ) ! = 0 ) createGen = false ;
for ( String t : libr ) guessLib . put ( t ) ;
if ( createGen ) guessGen . put ( s ) ;
if ( System . currentTimeMillis ( ) > timeLimit ) return ;
} catch ( InterruptedException e ) { }
}
}
/ * *
* DidYouMean ' s producer thread that adds missing letters ( e . g . bat / boat ) for a given term
* based on the given alphabet and puts it on the blocking queue , to be ' consumed ' by a consumer thread . < p / >
@ -184,18 +206,20 @@ public class DidYouMean {
public void run ( ) {
String s ;
for ( int i = 0 ; i < = len ; i + + ) {
for ( int j = 0 ; j < alphabet . length ; j + + ) {
s = word . substring ( 0 , i ) + alphabet [ j ] + word . substring ( i ) ;
try {
queue . put ( s ) ;
} catch ( InterruptedException e ) {
return ;
}
Set < String > libr ;
for ( int i = 0 ; i < = wordLen ; i + + ) try {
for ( char c : alphabet ) {
s = word . substring ( 0 , i ) + c + word . substring ( i ) ;
libr = LibraryProvider . dymLib . recommend ( s ) ;
if ( libr . size ( ) ! = 0 ) createGen = false ;
for ( String t : libr ) guessLib . put ( t ) ;
if ( createGen ) guessGen . put ( s ) ;
if ( System . currentTimeMillis ( ) > timeLimit ) return ;
}
}
} catch ( InterruptedException e ) { }
}
}
/ * *
* DidYouMean ' s producer thread that reverses any two consecutive letters ( e . g . two / tow ) for a given term
* and puts it on the blocking queue , to be ' consumed ' by a consumer thread . < p / >
@ -205,54 +229,62 @@ public class DidYouMean {
public void run ( ) {
String s ;
for ( int i = 0 ; i < len - 1 ; i + + ) {
s = word . substring ( 0 , i ) + word . charAt ( i + 1 ) + word . charAt ( i ) + word . substring ( i + 2 ) ;
try {
queue . put ( s ) ;
} catch ( InterruptedException e ) {
return ;
}
}
Set < String > libr ;
for ( int i = 0 ; i < wordLen - 1 ; i + + ) try {
s = word . substring ( 0 , i ) + word . charAt ( i + 1 ) + word . charAt ( i ) + word . substring ( i + 2 ) ;
libr = LibraryProvider . dymLib . recommend ( s ) ;
if ( libr . size ( ) ! = 0 ) createGen = false ;
for ( String t : libr ) guessLib . put ( t ) ;
if ( createGen ) guessGen . put ( s ) ;
if ( System . currentTimeMillis ( ) > timeLimit ) return ;
} catch ( InterruptedException e ) { }
}
}
/ * *
* DidYouMean ' s consumer thread takes a String object ( term ) from the blocking queue
* and checks if it is contained in YaCy ' s RWI index . The thread recognizes "\n" as poison pill ! < p / >
* and checks if it is contained in YaCy ' s RWI index .
* < b > Note : < / b > this causes no or moderate i / o as it uses the efficient index . has ( ) method .
* /
class Consumer extends Thread {
public void run ( ) {
try {
while ( true ) {
String s = queue . take ( ) ;
if ( s . equals ( "\n" ) )
this . interrupt ( ) ;
else
consume ( s ) ;
}
} catch ( InterruptedException e ) {
return ;
}
}
void consume ( String s ) {
if ( index . has ( Word . word2hash ( s ) ) ) {
set . add ( s ) ;
}
String s ;
try {
while ( ( s = guessLib . take ( ) ) ! = poisonString ) {
if ( index . has ( Word . word2hash ( s ) ) ) resultSet . add ( s ) ;
if ( System . currentTimeMillis ( ) > timeLimit ) return ;
}
} catch ( InterruptedException e ) { }
}
}
/ * *
* word SizeComparator is used by DidYouMean to order terms by index . count ( ) < p / >
* indexSizeComparator is used by DidYouMean to order terms by index . count ( ) < p / >
* < b > Warning : < / b > this causes heavy i / o
* /
protected class word SizeComparator implements Comparator < String > {
protected class indexSizeComparator implements Comparator < String > {
public int compare ( final String o1 , final String o2 ) {
final Integer i1 = index . count ( Word . word2hash ( o1 ) ) ;
final Integer i2 = index . count ( Word . word2hash ( o2 ) ) ;
return i2 . compareTo ( i1 ) ;
final int i1 = index . count ( Word . word2hash ( o1 ) ) ;
final int i2 = index . count ( Word . word2hash ( o2 ) ) ;
if ( i1 = = i2 ) return o1 . compareTo ( o2 ) ;
return ( i1 < i2 ) ? 1 : - 1 ; // '<' is correct, because the largest count shall be ordered to be the first position in the result
}
}
/ * *
* wordLengthComparator is used by DidYouMean to order terms by the term length < p / >
* This is the default order if the indexSizeComparator is not used
* /
protected static class wordLengthComparator implements Comparator < String > {
public int compare ( final String o1 , final String o2 ) {
final int i1 = o1 . length ( ) ;
final int i2 = o2 . length ( ) ;
if ( i1 = = i2 ) return o1 . compareTo ( o2 ) ;
return ( i1 > i2 ) ? 1 : - 1 ; // '>' is correct, because the shortest word shall be first
}
}
}