diff --git a/htroot/suggest.java b/htroot/suggest.java index 2c8e93a24..5582b0589 100644 --- a/htroot/suggest.java +++ b/htroot/suggest.java @@ -22,8 +22,8 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import java.util.Collection; import java.util.ConcurrentModificationException; -import java.util.SortedSet; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -68,7 +68,7 @@ public class suggest { int c = 0; final DidYouMean didYouMean = new DidYouMean(sb.index, new StringBuilder(querystring)); - final SortedSet suggestions = didYouMean.getSuggestions(timeout, count); + final Collection suggestions = didYouMean.getSuggestions(timeout, count); //[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]] synchronized (suggestions) { for (StringBuilder suggestion: suggestions) { diff --git a/source/net/yacy/cora/util/StringBuilderComparator.java b/source/net/yacy/cora/util/StringBuilderComparator.java index 430506c1f..e4c1d77a9 100644 --- a/source/net/yacy/cora/util/StringBuilderComparator.java +++ b/source/net/yacy/cora/util/StringBuilderComparator.java @@ -65,19 +65,26 @@ public class StringBuilderComparator implements Comparator { final int l0 = sb0.length(); final int l1 = sb1.length(); if (l0 != l1) return false; - return equals(sb0, sb1, l1); + return equals(sb0, sb1, 0, l1); } public boolean startsWith(final StringBuilder sb0, final StringBuilder sb1) { final int l0 = sb0.length(); final int l1 = sb1.length(); if (l0 < l1) return false; - return equals(sb0, sb1, l1); + return equals(sb0, sb1, 0, l1); } - private boolean equals(final StringBuilder sb0, final StringBuilder sb1, final int l) { + public boolean endsWith(final StringBuilder sb0, final StringBuilder sb1) { + final int l0 = sb0.length(); + final int l1 = sb1.length(); + if (l0 < l1) return false; + return equals(sb0, sb1, l0 - l1, l1); + } + + private boolean equals(final StringBuilder sb0, final StringBuilder sb1, int start, final int l) { char c0, c1; - for (int i = 0; i < l; i++) { + for (int i = start; i < l; i++) { c0 = sb0.charAt(i); c1 = sb1.charAt(i); if (c0 == c1) continue; diff --git a/source/net/yacy/data/DidYouMean.java b/source/net/yacy/data/DidYouMean.java index b38a5eff6..f0050173b 100644 --- a/source/net/yacy/data/DidYouMean.java +++ b/source/net/yacy/data/DidYouMean.java @@ -1,6 +1,8 @@ package net.yacy.data; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.ConcurrentModificationException; @@ -10,7 +12,6 @@ import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; -import java.util.concurrent.LinkedBlockingQueue; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.response.QueryResponse; @@ -76,17 +77,14 @@ public class DidYouMean { private static final char[][] ALPHABETS = { ALPHABET_LATIN, ALPHABET_KANJI, ALPHABET_HIRAGANA, ALPHABET_KATAKANA, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4}; - private static final StringBuilder POISON_STRING = new StringBuilder("\n"); public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors(); private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator(); private final Segment segment; private final StringBuilder word; private final int wordLen; - private final LinkedBlockingQueue guesses; private long timeLimit; private final SortedSet resultSet; - private final indexSizeComparator INDEX_SIZE_COMPARATOR; private char[] alphabet; private boolean more; @@ -99,8 +97,6 @@ public class DidYouMean { this.word = word0; this.wordLen = this.word.length(); this.segment = segment; - this.guesses = new LinkedBlockingQueue(); - this.INDEX_SIZE_COMPARATOR = new indexSizeComparator(); this.more = segment.connectedRWI() && segment.RWICount() > 0; // with RWIs connected the guessing is super-fast // identify language @@ -144,7 +140,6 @@ public class DidYouMean { public void reset() { this.resultSet.clear(); - this.guesses.clear(); } /** @@ -155,7 +150,7 @@ public class DidYouMean { * @param preSortSelection the number of words that participate in the IO-intensive sort * @return */ - public SortedSet getSuggestions(final long timeout, final int preSortSelection) { + public Collection getSuggestions(final long timeout, final int preSortSelection) { if (this.word.length() < MinimumInputWordLength) { return this.resultSet; // return nothing if input is too short } @@ -167,47 +162,33 @@ public class DidYouMean { return getSuggestions(this.word.substring(0, lastIndexOfSpace), this.word.substring(lastIndexOfSpace + 1), timeout, preSortSelection, this.segment); } final SortedSet preSorted = getSuggestions(timeout); - /* - if (System.currentTimeMillis() > timelimit) { - ConcurrentLog.info("DidYouMean", "found and returned " + preSorted.size() + " unsorted suggestions (1); execution time: " - + (System.currentTimeMillis() - startTime) + "ms"); - return preSorted; - } -*/ final ReversibleScoreMap scored = new ClusteredScoreMap(StringBuilderComparator.CASE_INSENSITIVE_ORDER); - try { - for (final StringBuilder s: preSorted) { - if (System.currentTimeMillis() > timelimit) { - break; - } - if (!(scored.sizeSmaller(2 * preSortSelection))) { - break; - } - scored.inc(s, this.segment.getWordCountGuess(s.toString())); - } - } catch (final ConcurrentModificationException e) { - } - final SortedSet countSorted = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR))); - final int wc = this.segment.getWordCountGuess(this.word.toString()); // all counts must be greater than this - while (!scored.isEmpty() && countSorted.size() < preSortSelection) { - final StringBuilder s = scored.getMaxKey(); - final int score = scored.delete(s); - if (s.length() >= MinimumOutputWordLength && score > wc) { - countSorted.add(s); + Collection countSorted = new ArrayList(); + if (this.more) { + final int wc = this.segment.getWordCountGuess(this.word.toString()); // all counts must be greater than this + try { + for (final StringBuilder s: preSorted) { + if (System.currentTimeMillis() > timelimit) break; + if (!(scored.sizeSmaller(2 * preSortSelection))) break; + String s0 = s.toString(); + int wcg = s0.indexOf(' ') > 0 ? s0.length() * 100 : this.segment.getWordCountGuess(s0); + if (wcg > wc) scored.inc(s, wcg); + } + } catch (final ConcurrentModificationException e) { } - if (System.currentTimeMillis() > timelimit) { - break; + Iterator i = scored.keys(false); + while (i.hasNext()) countSorted.add(i.next()); + } else { + try { + for (final StringBuilder s: preSorted) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(s, this.word) || + StringBuilderComparator.CASE_INSENSITIVE_ORDER.endsWith(this.word, s)) countSorted.add(s); + } + } catch (final ConcurrentModificationException e) { } } // finished - /* - if (countSorted.isEmpty()) { - ConcurrentLog.info("DidYouMean", "found and returned " + preSorted.size() + " unsorted suggestions (2); execution time: " - + (System.currentTimeMillis() - startTime) + "ms"); - return preSorted; - } - */ ConcurrentLog.info("DidYouMean", "found " + preSorted.size() + " unsorted terms, returned " + countSorted.size() + " sorted suggestions; execution time: " + (System.currentTimeMillis() - startTime) + "ms"); @@ -222,7 +203,7 @@ public class DidYouMean { * @param preSortSelection * @return */ - private static SortedSet getSuggestions(final String head, final String tail, final long timeout, final int preSortSelection, final Segment segment) { + private static Collection getSuggestions(final String head, final String tail, final long timeout, final int preSortSelection, final Segment segment) { final SortedSet result = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); int count = 30; final SolrQuery solrQuery = new SolrQuery(); @@ -239,7 +220,6 @@ public class DidYouMean { solrQuery.addHighlightField(CollectionSchema.title.getSolrFieldName()); solrQuery.addHighlightField(CollectionSchema.text_t.getSolrFieldName()); solrQuery.setFields(); // no fields wanted! only snippets - //List snippets = new ArrayList(); OrderedScoreMap snippets = new OrderedScoreMap(null); try { QueryResponse response = segment.fulltext().getDefaultConnector().getResponseByParams(solrQuery); @@ -313,108 +293,68 @@ public class DidYouMean { private SortedSet getSuggestions(final long timeout) { final long startTime = System.currentTimeMillis(); this.timeLimit = startTime + timeout; - - // create one consumer thread that checks the guessLib queue - // for occurrences in the index. If the producers are started next, their - // results can be consumers directly - final Consumer[] consumers = new Consumer[AVAILABLE_CPU]; - consumers[0] = new Consumer(); - consumers[0].start(); - - // get a single recommendation for the word without altering the word - final Set libr = LibraryProvider.dymLib.recommend(this.word); - for (final StringBuilder t: libr) { - if (!t.equals(this.word)) { - try { - this.guesses.put(t); - } catch (final InterruptedException e) {} + + Thread[] producers = null; + if (this.more) { + // create and start producers + // the CPU load to create the guessed words is very low, but the testing + // against the library may be CPU intensive. Since it is possible to test + // words in the library concurrently, it is a good idea to start separate threads + producers = new Thread[4]; + producers[0] = new ChangingOneLetter(); + producers[1] = new AddingOneLetter(); + producers[2] = new DeletingOneLetter(); + producers[3] = new ReversingTwoConsecutiveLetters(); + for (final Thread t: producers) { + t.start(); } } - // create and start producers - // the CPU load to create the guessed words is very low, but the testing - // against the library may be CPU intensive. Since it is possible to test - // words in the library concurrently, it is a good idea to start separate threads - final Thread[] producers = new Thread[4]; - producers[0] = new ChangingOneLetter(); - producers[1] = new AddingOneLetter(); - producers[2] = new DeletingOneLetter(); - producers[3] = new ReversingTwoConsecutiveLetters(); - for (final Thread t: producers) { - t.start(); - } - - // start more consumers if there are more cores - if (consumers.length > 1) { - for (int i = 1; i < consumers.length; i++) { - consumers[i] = new Consumer(); - consumers[i].start(); + test(this.word); + this.resultSet.addAll(getSuggestions(this.word.toString(), "", timeout, 10, this.segment)); + + if (this.more) { + // finish the producer + for (final Thread t: producers) { + long wait = this.timeLimit - System.currentTimeMillis(); + if (wait > 0) try { + t.join(wait); + } catch (final InterruptedException e) {} } } - - // now decide which kind of guess is better - // we take guessLib entries as long as there is any entry in it - // to see if this is the case, we must wait for termination of the producer - for (final Thread t: producers) { - long wait = this.timeLimit - System.currentTimeMillis(); - if (wait > 0) try { - t.join(wait); - } catch (final InterruptedException e) {} - } - - // put poison into guessLib to terminate consumers - for (@SuppressWarnings("unused") final Consumer c: consumers) { - try { this.guesses.put(POISON_STRING); } catch (final InterruptedException e) {} - } - - // wait for termination of consumer - for (final Consumer c: consumers) { - long wait = this.timeLimit - System.currentTimeMillis(); - if (wait > 0) try { - c.join(wait); - } catch (final InterruptedException e) {} - if (c.isAlive()) c.interrupt(); - } - + // we don't want the given word in the result this.resultSet.remove(this.word); - return this.resultSet; - } - private void test(final StringBuilder s) throws InterruptedException { + private void test(final StringBuilder s) { final Set libr = LibraryProvider.dymLib.recommend(s); libr.addAll(LibraryProvider.geoLoc.recommend(s)); for (final StringBuilder t: libr) { - this.guesses.put(t); + if (t.length() >= MinimumOutputWordLength) this.resultSet.add(t); } - this.guesses.add(s); + if (s.length() >= MinimumOutputWordLength) this.resultSet.add(s); } - + /** * DidYouMean's producer thread that changes one letter (e.g. bat/cat) for a given term * based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.

* Note: the loop runs (alphabet.length * len) tests. */ public class ChangingOneLetter extends Thread { - @Override public void run() { char m; for (int i = 0; i < DidYouMean.this.wordLen; i++) { - try { - m = DidYouMean.this.word.charAt(i); - for (final char c: DidYouMean.this.alphabet) { - if (m != c) { - final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i + 1)); - test(ts); - } - if (System.currentTimeMillis() > DidYouMean.this.timeLimit) { - return; - } + m = DidYouMean.this.word.charAt(i); + for (final char c: DidYouMean.this.alphabet) { + if (m != c) { + final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i + 1)); + test(ts); } - } catch (final InterruptedException e) {} + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; + } } } } @@ -425,20 +365,14 @@ public class DidYouMean { * Note: the loop runs (len) tests. */ private class DeletingOneLetter extends Thread { - - @Override - public void run() { - for (int i = 0; i < DidYouMean.this.wordLen; i++) { - try { - final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.substring(i + 1)); - test(ts); - if (System.currentTimeMillis() > DidYouMean.this.timeLimit) { - return; - } - } catch (final InterruptedException e) {} - } + @Override + public void run() { + for (int i = 0; i < DidYouMean.this.wordLen; i++) { + final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.substring(i + 1)); + test(ts); + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; } - + } } /** @@ -447,21 +381,16 @@ public class DidYouMean { * Note: the loop runs (alphabet.length * len) tests. */ private class AddingOneLetter extends Thread { - - @Override - public void run() { - for (int i = 0; i <= DidYouMean.this.wordLen; i++) { - try { - for (final char c: DidYouMean.this.alphabet) { - final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i)); - test(ts); - if (System.currentTimeMillis() > DidYouMean.this.timeLimit) { - return; - } - } - } catch (final InterruptedException e) {} + @Override + public void run() { + for (int i = 0; i <= DidYouMean.this.wordLen; i++) { + for (final char c: DidYouMean.this.alphabet) { + final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i)); + test(ts); + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; } } + } } /** @@ -470,68 +399,21 @@ public class DidYouMean { * Note: the loop runs (len-1) tests. */ private class ReversingTwoConsecutiveLetters extends Thread { - - @Override - public void run() { - for (int i = 0; i < DidYouMean.this.wordLen - 1; i++) { - try { - final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.charAt(i + 1)).append(DidYouMean.this.word.charAt(i)).append(DidYouMean.this.word.substring(i + 2)); - test(ts); - if (System.currentTimeMillis() > DidYouMean.this.timeLimit) { - return; - } - } catch (final InterruptedException e) {} - } - } - - } - - /** - * DidYouMean's consumer thread takes a String object (term) from the blocking queue - * and checks if it is contained in YaCy's RWI index. - * Note: this causes no or moderate i/o as it uses the efficient index.has() method. - */ - private class Consumer extends Thread { - - @Override - public void run() { - StringBuilder s; - try { - while ((s = DidYouMean.this.guesses.take()) != POISON_STRING) { - if (s.length() >= MinimumOutputWordLength) { - DidYouMean.this.resultSet.add(s); - } - if (System.currentTimeMillis() > DidYouMean.this.timeLimit) { - return; - } - } - } catch (final InterruptedException e) {} - } - } - - /** - * indexSizeComparator is used by DidYouMean to order terms by index.count() - * Warning: this causes heavy i/o - */ - private class indexSizeComparator implements Comparator { - @Override - public int compare(final StringBuilder o1, final StringBuilder o2) { - final int i1 = DidYouMean.this.segment.getWordCountGuess(o1.toString()); - final int i2 = DidYouMean.this.segment.getWordCountGuess(o2.toString()); - if (i1 == i2) { - return WORD_LENGTH_COMPARATOR.compare(o1, o2); + public void run() { + for (int i = 0; i < DidYouMean.this.wordLen - 1; i++) { + final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.charAt(i + 1)).append(DidYouMean.this.word.charAt(i)).append(DidYouMean.this.word.substring(i + 2)); + test(ts); + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; } - return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result } - } + } /** * wordLengthComparator is used by DidYouMean to order terms by the term length * This is the default order if the indexSizeComparator is not used */ private static class wordLengthComparator implements Comparator { - @Override public int compare(final StringBuilder o1, final StringBuilder o2) { final int i1 = o1.length(); @@ -541,7 +423,6 @@ public class DidYouMean { } return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first } - } /**