From 3b9aaf9e9fdc371a6cb2b4c29d6ad970d7d3c505 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 2 Sep 2009 13:41:56 +0000 Subject: [PATCH] - inserted new library tests inside DidYouMean - some redesign of DidYouMean that was necessary to follow a special rule how a library should be used: - the library provides words that start or end with a test word which may possibly also be an empty set of words - all words that the DidYouMean produced with the four production rules are used to generate a set of library-completed words - if this process results in any words from the library, only library-generated words are taken - if there is no library-generated word at all, take the artificially generated word - all words that result from these rules are tested against the index - the result is ordered using a lightweight comparator that prefers short words - a not-so-much-io test against the index is being prepared next - inserted the library initialization into the switchboard git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6284 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/data/DidYouMean.java | 260 ++++++++++-------- .../anomic/kelondro/index/RowCollection.java | 2 +- source/de/anomic/kelondro/table/Table.java | 2 +- source/de/anomic/search/SearchEvent.java | 2 +- source/de/anomic/search/Switchboard.java | 5 + 5 files changed, 154 insertions(+), 117 deletions(-) diff --git a/source/de/anomic/data/DidYouMean.java b/source/de/anomic/data/DidYouMean.java index c2513ea8b..46398f0b1 100644 --- a/source/de/anomic/data/DidYouMean.java +++ b/source/de/anomic/data/DidYouMean.java @@ -2,7 +2,6 @@ package de.anomic.data; import java.util.Collections; import java.util.Comparator; -import java.util.HashSet; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.LinkedBlockingQueue; @@ -30,15 +29,18 @@ public class DidYouMean { protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p', 
'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'}; - + private static final String poisonString = "\n"; public static final int availableCPU = Runtime.getRuntime().availableProcessors(); - final LinkedBlockingQueue queue = new LinkedBlockingQueue(); + private static final wordLengthComparator wlComp = new wordLengthComparator(); - protected final Set set; protected final IndexCell index; - protected String word; - protected int len; - + protected String word; + protected int wordLen; + protected LinkedBlockingQueue guessGen, guessLib; + protected long timeLimit; + protected boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written + protected final Set resultSet; + /** * @param index a termIndex - most likely retrieved from a switchboard object. @@ -46,12 +48,15 @@ public class DidYouMean { */ public DidYouMean(final IndexCell index, boolean sort) { if(sort) - this.set = Collections.synchronizedSortedSet(new TreeSet(new wordSizeComparator())); + this.resultSet = Collections.synchronizedSortedSet(new TreeSet(new indexSizeComparator())); else - this.set = Collections.synchronizedSet(new HashSet()); + this.resultSet = Collections.synchronizedSortedSet(new TreeSet(wlComp)); this.word = ""; - this.len = 0; + this.wordLen = 0; this.index = index; + this.guessGen = new LinkedBlockingQueue(); + this.guessLib = new LinkedBlockingQueue(); + this.createGen = true; } /** @@ -62,7 +67,7 @@ public class DidYouMean { } /** - * This method triggers the 4 producer and 8 consumer threads of DidYouMean. + * This method triggers the producer and consumer threads of DidYouMean. *

Note: the default timeout is 500ms * @param word a String with a single word * @return a Set<String> with word variations contained in index. @@ -72,70 +77,83 @@ public class DidYouMean { } /** - * This method triggers the 4 producer and 8 consumer threads of the DidYouMean object. + * This method triggers the producer and consumer threads of the DidYouMean object. * @param word a String with a single word * @param timeout execution time in ms. * @return a Set<String> with word variations contained in term index. */ public Set getSuggestion(final String word, long timeout) { long startTime = System.currentTimeMillis(); + this.timeLimit = startTime + timeout; this.word = word.toLowerCase(); - this.len = word.length(); + this.wordLen = word.length(); - // create producers - // the intention of the 4 producers is to mix results, as there - // is currently no default sorting or ranking due to the i/o performance of index.count() + // create one consumer thread that checks the guessLib queue + // for occurrences in the index. If the producers are started next, their + // results can be consumers directly + Consumer[] consumers = new Consumer[availableCPU]; + consumers[0] = new Consumer(); + consumers[0].start(); + + // get a single recommendation for the word without altering the word + Set libr = LibraryProvider.dymLib.recommend(word); + for (String t: libr) { + if (!t.equals(word)) try { + createGen = false; + guessLib.put(t); + } catch (InterruptedException e) {} + } + + // create and start producers + // the CPU load to create the guessed words is very low, but the testing + // against the library may be CPU intensive. 
Since it is possible to test + // words in the library concurrently, it is a good idea to start separate threads Thread[] producers = new Thread[4]; producers[0] = new ChangingOneLetter(); - producers[1] = new AddingOneLetter(); - producers[2] = new DeletingOneLetter(); - producers[3] = new ReversingTwoConsecutiveLetters(); - - // start producers - for (int i=0; i 1) for (int i = 1; i < consumers.length; i++) { + consumers[i] = new Consumer(); + consumers[i].start(); + } + + // now decide which kind of guess is better + // we take guessLib entries as long as there is any entry in it + // to see if this is the case, we must wait for termination of the producer + for (Thread t: producers) try { t.join(); } catch (InterruptedException e) {} + + // if there is not any entry in guessLib, then transfer all entries from the + // guessGen to guessLib + if (createGen) try { + this.guessGen.put(poisonString); + String s; + while ((s = this.guessGen.take()) != poisonString) this.guessLib.put(s); + } catch (InterruptedException e) {} + + // put poison into guessLib to terminate consumers + for (@SuppressWarnings("unused") Consumer c: consumers) + try { guessLib.put(poisonString); } catch (InterruptedException e) {} + + // wait for termination of consumer + for (Consumer c: consumers) + try { c.join(); } catch (InterruptedException e) {} - // check if timeout has been reached - boolean cont = false; - while(((System.currentTimeMillis()-startTime) < timeout)) { - // checks if queue is already empty - if(queue.size()==0) { - // check if at least one producers is still running and potentially filling the queue - for (int i=0; i * Note: the loop runs (alphabet.length * len) tests. 
@@ -144,18 +162,20 @@ public class DidYouMean { public void run() { String s; - for(int i=0; i libr; + for (int i = 0; i < wordLen; i++) try { + for (char c: alphabet) { + s = word.substring(0, i) + c + word.substring(i + 1); + libr = LibraryProvider.dymLib.recommend(s); + if (libr.size() != 0) createGen = false; + for (String t: libr) guessLib.put(t); + if (createGen) guessGen.put(s); + if (System.currentTimeMillis() > timeLimit) return; } - } + } catch (InterruptedException e) {} } } + /** * DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term * and puts it on the blocking queue, to be 'consumed' by a consumer thread.

@@ -165,16 +185,18 @@ public class DidYouMean { public void run() { String s; - for(int i=0; i libr; + for (int i = 0; i < wordLen; i++) try { s = word.substring(0, i) + word.substring(i+1); - try { - queue.put(s); - } catch (InterruptedException e) { - return; - } - } + libr = LibraryProvider.dymLib.recommend(s); + if (libr.size() != 0) createGen = false; + for (String t: libr) guessLib.put(t); + if (createGen) guessGen.put(s); + if (System.currentTimeMillis() > timeLimit) return; + } catch (InterruptedException e) {} } } + /** * DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term * based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.

@@ -184,18 +206,20 @@ public class DidYouMean { public void run() { String s; - for(int i=0; i<=len;i++) { - for(int j=0; j libr; + for (int i = 0; i <= wordLen; i++) try { + for (char c: alphabet) { + s = word.substring(0, i) + c + word.substring(i); + libr = LibraryProvider.dymLib.recommend(s); + if (libr.size() != 0) createGen = false; + for (String t: libr) guessLib.put(t); + if (createGen) guessGen.put(s); + if (System.currentTimeMillis() > timeLimit) return; } - } + } catch (InterruptedException e) {} } } + /** * DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term * and puts it on the blocking queue, to be 'consumed' by a consumer thread.

@@ -205,53 +229,61 @@ public class DidYouMean { public void run() { String s; - for(int i=0; i libr; + for (int i = 0; i < wordLen - 1; i++) try { + s = word.substring(0, i) + word.charAt(i + 1) + word.charAt(i) + word.substring(i +2); + libr = LibraryProvider.dymLib.recommend(s); + if (libr.size() != 0) createGen = false; + for (String t: libr) guessLib.put(t); + if (createGen) guessGen.put(s); + if (System.currentTimeMillis() > timeLimit) return; + } catch (InterruptedException e) {} } } + /** * DidYouMean's consumer thread takes a String object (term) from the blocking queue - * and checks if it is contained in YaCy's RWI index. The thread recognizes "\n" as poison pill!

+ * and checks if it is contained in YaCy's RWI index. * Note: this causes no or moderate i/o as it uses the efficient index.has() method. */ class Consumer extends Thread { public void run() { - try { - while(true) { - String s = queue.take(); - if(s.equals("\n")) - this.interrupt(); - else - consume(s); - } - } catch (InterruptedException e) { - return; - } - } - void consume(String s) { - if (index.has(Word.word2hash(s))) { - set.add(s); - } + String s; + try { + while ((s = guessLib.take()) != poisonString) { + if (index.has(Word.word2hash(s))) resultSet.add(s); + if (System.currentTimeMillis() > timeLimit) return; + } + } catch (InterruptedException e) {} } } + /** - * wordSizeComparator is used by DidYouMean to order terms by index.count()

+ * indexSizeComparator is used by DidYouMean to order terms by index.count()

* Warning: this causes heavy i/o */ - protected class wordSizeComparator implements Comparator { + protected class indexSizeComparator implements Comparator { public int compare(final String o1, final String o2) { - final Integer i1 = index.count(Word.word2hash(o1)); - final Integer i2 = index.count(Word.word2hash(o2)); - return i2.compareTo(i1); + final int i1 = index.count(Word.word2hash(o1)); + final int i2 = index.count(Word.word2hash(o2)); + if (i1 == i2) return o1.compareTo(o2); + return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result } } + + /** + * wordLengthComparator is used by DidYouMean to order terms by the term length

+ * This is the default order if the indexSizeComparator is not used + */ + protected static class wordLengthComparator implements Comparator { + public int compare(final String o1, final String o2) { + final int i1 = o1.length(); + final int i2 = o2.length(); + if (i1 == i2) return o1.compareTo(o2); + return (i1 > i2) ? 1 : -1; // '>' is correct, because the shortest word shall be first + } + } } diff --git a/source/de/anomic/kelondro/index/RowCollection.java b/source/de/anomic/kelondro/index/RowCollection.java index c481dfe19..32383d110 100644 --- a/source/de/anomic/kelondro/index/RowCollection.java +++ b/source/de/anomic/kelondro/index/RowCollection.java @@ -219,7 +219,7 @@ public class RowCollection implements Iterable { * @return */ public final long memoryNeededForGrow() { - return (long) ((((long) (chunkcount + 1)) * ((long) rowdef.objectsize)) * growfactor100 / 100L); + return (((long) (chunkcount + 1)) * ((long) rowdef.objectsize)) * growfactor100 / 100L; } public synchronized void trim(final boolean plusGrowFactor) { diff --git a/source/de/anomic/kelondro/table/Table.java b/source/de/anomic/kelondro/table/Table.java index 05b7c30e6..16404fe53 100644 --- a/source/de/anomic/kelondro/table/Table.java +++ b/source/de/anomic/kelondro/table/Table.java @@ -277,7 +277,7 @@ public class Table implements ObjectIndex { } public static int staticRAMIndexNeed(final File f, final Row rowdef) { - return (int) (((long)(rowdef.primaryKeyLength + 4)) * ((long) tableSize(f, rowdef.objectsize)) * RowCollection.growfactor100 / 100L); + return (int) (((long)(rowdef.primaryKeyLength + 4)) * tableSize(f, rowdef.objectsize) * RowCollection.growfactor100 / 100L); } public synchronized void addUnique(final Entry row) throws IOException { diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java index f9def9b0c..9054e7ba0 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -116,7 +116,7 @@ 
public final class SearchEvent { final long timer = System.currentTimeMillis(); final int fetchpeers = 12; Log.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); - this.primarySearchThreads = yacySearch.primaryRemoteSearches( + this.primarySearchThreads = (query.queryHashes.size() == 0) ? null : yacySearch.primaryRemoteSearches( QueryParams.hashSet2hashString(query.queryHashes), QueryParams.hashSet2hashString(query.excludeHashes), "", diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index da1eab1b4..6b64a8682 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -132,6 +132,7 @@ import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; import de.anomic.data.Blacklist; +import de.anomic.data.LibraryProvider; import de.anomic.data.URLLicense; import de.anomic.data.blogBoard; import de.anomic.data.blogBoardComments; @@ -314,6 +315,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi this.workPath = getConfigPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT); this.log.logConfig("Work Path: " + this.workPath.toString()); + // init libraries + this.log.logConfig("initializing libraries"); + LibraryProvider.initialize(rootPath, new File(rootPath, "dictionaries")); + // set a high maximum cache size to current size; this is adopted later automatically final int wordCacheMaxCount = (int) getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000); setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));