- inserted new library tests inside DidYouMean

- some redesign of DidYouMean that was necessary to follow
  a special rule for how a library should be used:
  - the library provides words that start or end with a test
    word; this may also be an empty set of words
  - all words that the DidYouMean produced with the four
    production rules are used to generate a set of
    library-completed words
  - if this process results in any words from the library,
    only library-generated words are taken
  - if there is no library-generated word at all, take the
    artificially generated words
  - all words that result from these rules are tested against
    the index
  - the result is ordered using a lightweight comparator that
    prefers short words
  - a not-so-much-io test against the index is being prepared
    next
- inserted the library initialization into the switchboard

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6284 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 8c35ffe34c
commit 3b9aaf9e9f

@ -2,7 +2,6 @@ package de.anomic.data;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
@ -30,15 +29,18 @@ public class DidYouMean {
protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p', protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'}; 'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};
private static final String poisonString = "\n";
public static final int availableCPU = Runtime.getRuntime().availableProcessors(); public static final int availableCPU = Runtime.getRuntime().availableProcessors();
final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<String>(); private static final wordLengthComparator wlComp = new wordLengthComparator();
protected final Set<String> set;
protected final IndexCell<WordReference> index; protected final IndexCell<WordReference> index;
protected String word; protected String word;
protected int len; protected int wordLen;
protected LinkedBlockingQueue<String> guessGen, guessLib;
protected long timeLimit;
protected boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
protected final Set<String> resultSet;
/** /**
* @param index a termIndex - most likely retrieved from a switchboard object. * @param index a termIndex - most likely retrieved from a switchboard object.
@ -46,12 +48,15 @@ public class DidYouMean {
*/ */
public DidYouMean(final IndexCell<WordReference> index, boolean sort) { public DidYouMean(final IndexCell<WordReference> index, boolean sort) {
if(sort) if(sort)
this.set = Collections.synchronizedSortedSet(new TreeSet<String>(new wordSizeComparator())); this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
else else
this.set = Collections.synchronizedSet(new HashSet<String>()); this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(wlComp));
this.word = ""; this.word = "";
this.len = 0; this.wordLen = 0;
this.index = index; this.index = index;
this.guessGen = new LinkedBlockingQueue<String>();
this.guessLib = new LinkedBlockingQueue<String>();
this.createGen = true;
} }
/** /**
@ -62,7 +67,7 @@ public class DidYouMean {
} }
/** /**
* This method triggers the 4 producer and 8 consumer threads of DidYouMean. * This method triggers the producer and consumer threads of DidYouMean.
* <p/><b>Note:</b> the default timeout is 500ms * <p/><b>Note:</b> the default timeout is 500ms
* @param word a String with a single word * @param word a String with a single word
* @return a Set&lt;String&gt; with word variations contained in index. * @return a Set&lt;String&gt; with word variations contained in index.
@ -72,70 +77,83 @@ public class DidYouMean {
} }
/** /**
* This method triggers the 4 producer and 8 consumer threads of the DidYouMean object. * This method triggers the producer and consumer threads of the DidYouMean object.
* @param word a String with a single word * @param word a String with a single word
* @param timeout execution time in ms. * @param timeout execution time in ms.
* @return a Set&lt;String&gt; with word variations contained in term index. * @return a Set&lt;String&gt; with word variations contained in term index.
*/ */
public Set<String> getSuggestion(final String word, long timeout) { public Set<String> getSuggestion(final String word, long timeout) {
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
this.timeLimit = startTime + timeout;
this.word = word.toLowerCase(); this.word = word.toLowerCase();
this.len = word.length(); this.wordLen = word.length();
// create producers // create one consumer thread that checks the guessLib queue
// the intention of the 4 producers is to mix results, as there // for occurrences in the index. If the producers are started next, their
// is currently no default sorting or ranking due to the i/o performance of index.count() // results can be consumers directly
Consumer[] consumers = new Consumer[availableCPU];
consumers[0] = new Consumer();
consumers[0].start();
// get a single recommendation for the word without altering the word
Set<String> libr = LibraryProvider.dymLib.recommend(word);
for (String t: libr) {
if (!t.equals(word)) try {
createGen = false;
guessLib.put(t);
} catch (InterruptedException e) {}
}
// create and start producers
// the CPU load to create the guessed words is very low, but the testing
// against the library may be CPU intensive. Since it is possible to test
// words in the library concurrently, it is a good idea to start separate threads
Thread[] producers = new Thread[4]; Thread[] producers = new Thread[4];
producers[0] = new ChangingOneLetter(); producers[0] = new ChangingOneLetter();
producers[1] = new AddingOneLetter(); producers[1] = new AddingOneLetter();
producers[2] = new DeletingOneLetter(); producers[2] = new DeletingOneLetter();
producers[3] = new ReversingTwoConsecutiveLetters(); producers[3] = new ReversingTwoConsecutiveLetters();
for (Thread t: producers) t.start();
// start producers
for (int i=0; i<producers.length; i++) {
producers[i].start();
}
// create and start consumers threads // start more consumers if there are more cores
Thread[] consumers = new Thread[availableCPU]; if (consumers.length > 1) for (int i = 1; i < consumers.length; i++) {
for (int i=0; i<consumers.length; i++) { consumers[i] = new Consumer();
consumers[i] = new Consumer(); consumers[i].start();
consumers[i].start(); }
}
// now decide which kind of guess is better
// we take guessLib entries as long as there is any entry in it
// to see if this is the case, we must wait for termination of the producer
for (Thread t: producers) try { t.join(); } catch (InterruptedException e) {}
// if there is not any entry in guessLib, then transfer all entries from the
// guessGen to guessLib
if (createGen) try {
this.guessGen.put(poisonString);
String s;
while ((s = this.guessGen.take()) != poisonString) this.guessLib.put(s);
} catch (InterruptedException e) {}
// put poison into guessLib to terminate consumers
for (@SuppressWarnings("unused") Consumer c: consumers)
try { guessLib.put(poisonString); } catch (InterruptedException e) {}
// wait for termination of consumer
for (Consumer c: consumers)
try { c.join(); } catch (InterruptedException e) {}
// check if timeout has been reached // we don't want the given word in the result
boolean cont = false; this.resultSet.remove(word.toLowerCase());
while(((System.currentTimeMillis()-startTime) < timeout)) {
// checks if queue is already empty
if(queue.size()==0) {
// check if at least one producers is still running and potentially filling the queue
for (int i=0; i<producers.length; i++) {
if(producers[i].isAlive())
cont = true;
}
// as the queue is empty and no producer is running we can break the timeout-loop
if(!cont) break;
}
}
// interrupt all consumer threads // finished
for (int i=0; i<consumers.length; i++) { Log.logInfo("DidYouMean", "found "+this.resultSet.size()+" terms; execution time: "
consumers[i].interrupt(); +(System.currentTimeMillis()-startTime)+"ms"+ " - remaining queue size: "+guessLib.size());
}
// interrupt all remaining producer threads
for (int i=0; i<producers.length; i++) {
producers[i].interrupt();
}
this.set.remove(word.toLowerCase());
Log.logInfo("DidYouMean", "found "+this.set.size()+" terms; execution time: "
+(System.currentTimeMillis()-startTime)+"ms"+ " - remaining queue size: "+queue.size());
return this.set; return this.resultSet;
} }
/**
/**
* DidYouMean's producer thread that changes one letter (e.g. bat/cat) for a given term * DidYouMean's producer thread that changes one letter (e.g. bat/cat) for a given term
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/> * based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (alphabet.length * len) tests. * <b>Note:</b> the loop runs (alphabet.length * len) tests.
@ -144,18 +162,20 @@ public class DidYouMean {
public void run() { public void run() {
String s; String s;
for(int i=0; i<len; i++) { Set<String> libr;
for(int j=0; j<alphabet.length; j++) { for (int i = 0; i < wordLen; i++) try {
s = word.substring(0, i) + alphabet[j] + word.substring(i+1); for (char c: alphabet) {
try { s = word.substring(0, i) + c + word.substring(i + 1);
queue.put(s); libr = LibraryProvider.dymLib.recommend(s);
} catch (InterruptedException e) { if (libr.size() != 0) createGen = false;
return; for (String t: libr) guessLib.put(t);
} if (createGen) guessGen.put(s);
if (System.currentTimeMillis() > timeLimit) return;
} }
} } catch (InterruptedException e) {}
} }
} }
/** /**
* DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term * DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/> * and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
@ -165,16 +185,18 @@ public class DidYouMean {
public void run() { public void run() {
String s; String s;
for(int i=0; i<len;i++) { Set<String> libr;
for (int i = 0; i < wordLen; i++) try {
s = word.substring(0, i) + word.substring(i+1); s = word.substring(0, i) + word.substring(i+1);
try { libr = LibraryProvider.dymLib.recommend(s);
queue.put(s); if (libr.size() != 0) createGen = false;
} catch (InterruptedException e) { for (String t: libr) guessLib.put(t);
return; if (createGen) guessGen.put(s);
} if (System.currentTimeMillis() > timeLimit) return;
} } catch (InterruptedException e) {}
} }
} }
/** /**
* DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term * DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/> * based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
@ -184,18 +206,20 @@ public class DidYouMean {
public void run() { public void run() {
String s; String s;
for(int i=0; i<=len;i++) { Set<String> libr;
for(int j=0; j<alphabet.length; j++) { for (int i = 0; i <= wordLen; i++) try {
s = word.substring(0, i) + alphabet[j] + word.substring(i); for (char c: alphabet) {
try { s = word.substring(0, i) + c + word.substring(i);
queue.put(s); libr = LibraryProvider.dymLib.recommend(s);
} catch (InterruptedException e) { if (libr.size() != 0) createGen = false;
return; for (String t: libr) guessLib.put(t);
} if (createGen) guessGen.put(s);
if (System.currentTimeMillis() > timeLimit) return;
} }
} } catch (InterruptedException e) {}
} }
} }
/** /**
* DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term * DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/> * and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
@ -205,53 +229,61 @@ public class DidYouMean {
public void run() { public void run() {
String s; String s;
for(int i=0; i<len-1; i++) { Set<String> libr;
s = word.substring(0,i)+word.charAt(i+1)+word.charAt(i)+word.substring(i+2); for (int i = 0; i < wordLen - 1; i++) try {
try { s = word.substring(0, i) + word.charAt(i + 1) + word.charAt(i) + word.substring(i +2);
queue.put(s); libr = LibraryProvider.dymLib.recommend(s);
} catch (InterruptedException e) { if (libr.size() != 0) createGen = false;
return; for (String t: libr) guessLib.put(t);
} if (createGen) guessGen.put(s);
} if (System.currentTimeMillis() > timeLimit) return;
} catch (InterruptedException e) {}
} }
} }
/** /**
* DidYouMean's consumer thread takes a String object (term) from the blocking queue * DidYouMean's consumer thread takes a String object (term) from the blocking queue
* and checks if it is contained in YaCy's RWI index. The thread recognizes "\n" as poison pill!<p/> * and checks if it is contained in YaCy's RWI index.
* <b>Note:</b> this causes no or moderate i/o as it uses the efficient index.has() method. * <b>Note:</b> this causes no or moderate i/o as it uses the efficient index.has() method.
*/ */
class Consumer extends Thread { class Consumer extends Thread {
public void run() { public void run() {
try { String s;
while(true) { try {
String s = queue.take(); while ((s = guessLib.take()) != poisonString) {
if(s.equals("\n")) if (index.has(Word.word2hash(s))) resultSet.add(s);
this.interrupt(); if (System.currentTimeMillis() > timeLimit) return;
else }
consume(s); } catch (InterruptedException e) {}
}
} catch (InterruptedException e) {
return;
}
}
void consume(String s) {
if (index.has(Word.word2hash(s))) {
set.add(s);
}
} }
} }
/** /**
* wordSizeComparator is used by DidYouMean to order terms by index.count()<p/> * indexSizeComparator is used by DidYouMean to order terms by index.count()<p/>
* <b>Warning:</b> this causes heavy i/o * <b>Warning:</b> this causes heavy i/o
*/ */
protected class wordSizeComparator implements Comparator<String> { protected class indexSizeComparator implements Comparator<String> {
public int compare(final String o1, final String o2) { public int compare(final String o1, final String o2) {
final Integer i1 = index.count(Word.word2hash(o1)); final int i1 = index.count(Word.word2hash(o1));
final Integer i2 = index.count(Word.word2hash(o2)); final int i2 = index.count(Word.word2hash(o2));
return i2.compareTo(i1); if (i1 == i2) return o1.compareTo(o2);
return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
} }
} }
/**
* wordLengthComparator is used by DidYouMean to order terms by the term length<p/>
* This is the default order if the indexSizeComparator is not used
*/
protected static class wordLengthComparator implements Comparator<String> {
public int compare(final String o1, final String o2) {
final int i1 = o1.length();
final int i2 = o2.length();
if (i1 == i2) return o1.compareTo(o2);
return (i1 > i2) ? 1 : -1; // '>' is correct, because the shortest word shall be first
}
}
} }

@ -219,7 +219,7 @@ public class RowCollection implements Iterable<Row.Entry> {
* @return * @return
*/ */
public final long memoryNeededForGrow() { public final long memoryNeededForGrow() {
return (long) ((((long) (chunkcount + 1)) * ((long) rowdef.objectsize)) * growfactor100 / 100L); return (((long) (chunkcount + 1)) * ((long) rowdef.objectsize)) * growfactor100 / 100L;
} }
public synchronized void trim(final boolean plusGrowFactor) { public synchronized void trim(final boolean plusGrowFactor) {

@ -277,7 +277,7 @@ public class Table implements ObjectIndex {
} }
public static int staticRAMIndexNeed(final File f, final Row rowdef) { public static int staticRAMIndexNeed(final File f, final Row rowdef) {
return (int) (((long)(rowdef.primaryKeyLength + 4)) * ((long) tableSize(f, rowdef.objectsize)) * RowCollection.growfactor100 / 100L); return (int) (((long)(rowdef.primaryKeyLength + 4)) * tableSize(f, rowdef.objectsize) * RowCollection.growfactor100 / 100L);
} }
public synchronized void addUnique(final Entry row) throws IOException { public synchronized void addUnique(final Entry row) throws IOException {

@ -116,7 +116,7 @@ public final class SearchEvent {
final long timer = System.currentTimeMillis(); final long timer = System.currentTimeMillis();
final int fetchpeers = 12; final int fetchpeers = 12;
Log.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); Log.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs");
this.primarySearchThreads = yacySearch.primaryRemoteSearches( this.primarySearchThreads = (query.queryHashes.size() == 0) ? null : yacySearch.primaryRemoteSearches(
QueryParams.hashSet2hashString(query.queryHashes), QueryParams.hashSet2hashString(query.queryHashes),
QueryParams.hashSet2hashString(query.excludeHashes), QueryParams.hashSet2hashString(query.excludeHashes),
"", "",

@ -132,6 +132,7 @@ import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist; import de.anomic.data.Blacklist;
import de.anomic.data.LibraryProvider;
import de.anomic.data.URLLicense; import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard; import de.anomic.data.blogBoard;
import de.anomic.data.blogBoardComments; import de.anomic.data.blogBoardComments;
@ -314,6 +315,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
this.workPath = getConfigPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT); this.workPath = getConfigPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT);
this.log.logConfig("Work Path: " + this.workPath.toString()); this.log.logConfig("Work Path: " + this.workPath.toString());
// init libraries
this.log.logConfig("initializing libraries");
LibraryProvider.initialize(rootPath, new File(rootPath, "dictionaries"));
// set a high maximum cache size to current size; this is adopted later automatically // set a high maximum cache size to current size; this is adopted later automatically
final int wordCacheMaxCount = (int) getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000); final int wordCacheMaxCount = (int) getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount)); setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));

Loading…
Cancel
Save