enhanced didyoumean

pull/1/head
orbiter 11 years ago
parent c0e6a65ec3
commit a11f072504

@ -22,8 +22,8 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.SortedSet;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@ -68,7 +68,7 @@ public class suggest {
int c = 0;
final DidYouMean didYouMean = new DidYouMean(sb.index, new StringBuilder(querystring));
final SortedSet<StringBuilder> suggestions = didYouMean.getSuggestions(timeout, count);
final Collection<StringBuilder> suggestions = didYouMean.getSuggestions(timeout, count);
//[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]]
synchronized (suggestions) {
for (StringBuilder suggestion: suggestions) {

@ -65,19 +65,26 @@ public class StringBuilderComparator implements Comparator<StringBuilder> {
final int l0 = sb0.length();
final int l1 = sb1.length();
if (l0 != l1) return false;
return equals(sb0, sb1, l1);
return equals(sb0, sb1, 0, l1);
}
public boolean startsWith(final StringBuilder sb0, final StringBuilder sb1) {
final int l0 = sb0.length();
final int l1 = sb1.length();
if (l0 < l1) return false;
return equals(sb0, sb1, l1);
return equals(sb0, sb1, 0, l1);
}
private boolean equals(final StringBuilder sb0, final StringBuilder sb1, final int l) {
/**
 * Tests whether {@code sb0} ends with the character sequence of {@code sb1},
 * using the same per-character comparison as this comparator's private
 * {@code equals} helper.
 *
 * @param sb0 the sequence whose tail is examined
 * @param sb1 the suffix to look for
 * @return {@code false} if {@code sb1} is longer than {@code sb0}; otherwise the
 *         result of comparing the last {@code sb1.length()} characters of
 *         {@code sb0} with {@code sb1}
 */
public boolean endsWith(final StringBuilder sb0, final StringBuilder sb1) {
    final int l0 = sb0.length();
    final int l1 = sb1.length();
    if (l0 < l1) return false;
    // The helper reads both builders at the same index i and stops at i < l, so
    // calling it with (l0 - l1, l1) would compare misaligned characters, or no
    // characters at all when l0 - l1 >= l1. Align the tail of sb0 at index 0
    // and compare from the start instead.
    final StringBuilder tail = new StringBuilder(l1).append(sb0, l0 - l1, l0);
    return equals(tail, sb1, 0, l1);
}
private boolean equals(final StringBuilder sb0, final StringBuilder sb1, int start, final int l) {
char c0, c1;
for (int i = 0; i < l; i++) {
for (int i = start; i < l; i++) {
c0 = sb0.charAt(i);
c1 = sb1.charAt(i);
if (c0 == c1) continue;

@ -1,6 +1,8 @@
package net.yacy.data;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.ConcurrentModificationException;
@ -10,7 +12,6 @@ import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
@ -76,17 +77,14 @@ public class DidYouMean {
private static final char[][] ALPHABETS = {
ALPHABET_LATIN, ALPHABET_KANJI, ALPHABET_HIRAGANA, ALPHABET_KATAKANA,
ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4};
private static final StringBuilder POISON_STRING = new StringBuilder("\n");
public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors();
private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();
private final Segment segment;
private final StringBuilder word;
private final int wordLen;
private final LinkedBlockingQueue<StringBuilder> guesses;
private long timeLimit;
private final SortedSet<StringBuilder> resultSet;
private final indexSizeComparator INDEX_SIZE_COMPARATOR;
private char[] alphabet;
private boolean more;
@ -99,8 +97,6 @@ public class DidYouMean {
this.word = word0;
this.wordLen = this.word.length();
this.segment = segment;
this.guesses = new LinkedBlockingQueue<StringBuilder>();
this.INDEX_SIZE_COMPARATOR = new indexSizeComparator();
this.more = segment.connectedRWI() && segment.RWICount() > 0; // with RWIs connected the guessing is super-fast
// identify language
@ -144,7 +140,6 @@ public class DidYouMean {
public void reset() {
this.resultSet.clear();
this.guesses.clear();
}
/**
@ -155,7 +150,7 @@ public class DidYouMean {
* @param preSortSelection the number of words that participate in the IO-intensive sort
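* @return the suggestions for the word; when the word is shorter than MinimumInputWordLength the result set is returned without running a search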
*/
public SortedSet<StringBuilder> getSuggestions(final long timeout, final int preSortSelection) {
public Collection<StringBuilder> getSuggestions(final long timeout, final int preSortSelection) {
if (this.word.length() < MinimumInputWordLength) {
return this.resultSet; // return nothing if input is too short
}
@ -167,47 +162,33 @@ public class DidYouMean {
return getSuggestions(this.word.substring(0, lastIndexOfSpace), this.word.substring(lastIndexOfSpace + 1), timeout, preSortSelection, this.segment);
}
final SortedSet<StringBuilder> preSorted = getSuggestions(timeout);
/*
if (System.currentTimeMillis() > timelimit) {
ConcurrentLog.info("DidYouMean", "found and returned " + preSorted.size() + " unsorted suggestions (1); execution time: "
+ (System.currentTimeMillis() - startTime) + "ms");
return preSorted;
}
*/
final ReversibleScoreMap<StringBuilder> scored = new ClusteredScoreMap<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
try {
for (final StringBuilder s: preSorted) {
if (System.currentTimeMillis() > timelimit) {
break;
}
if (!(scored.sizeSmaller(2 * preSortSelection))) {
break;
}
scored.inc(s, this.segment.getWordCountGuess(s.toString()));
}
} catch (final ConcurrentModificationException e) {
}
final SortedSet<StringBuilder> countSorted = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
final int wc = this.segment.getWordCountGuess(this.word.toString()); // all counts must be greater than this
while (!scored.isEmpty() && countSorted.size() < preSortSelection) {
final StringBuilder s = scored.getMaxKey();
final int score = scored.delete(s);
if (s.length() >= MinimumOutputWordLength && score > wc) {
countSorted.add(s);
Collection<StringBuilder> countSorted = new ArrayList<StringBuilder>();
if (this.more) {
final int wc = this.segment.getWordCountGuess(this.word.toString()); // all counts must be greater than this
try {
for (final StringBuilder s: preSorted) {
if (System.currentTimeMillis() > timelimit) break;
if (!(scored.sizeSmaller(2 * preSortSelection))) break;
String s0 = s.toString();
int wcg = s0.indexOf(' ') > 0 ? s0.length() * 100 : this.segment.getWordCountGuess(s0);
if (wcg > wc) scored.inc(s, wcg);
}
} catch (final ConcurrentModificationException e) {
}
if (System.currentTimeMillis() > timelimit) {
break;
Iterator<StringBuilder> i = scored.keys(false);
while (i.hasNext()) countSorted.add(i.next());
} else {
try {
for (final StringBuilder s: preSorted) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(s, this.word) ||
StringBuilderComparator.CASE_INSENSITIVE_ORDER.endsWith(this.word, s)) countSorted.add(s);
}
} catch (final ConcurrentModificationException e) {
}
}
// finished
/*
if (countSorted.isEmpty()) {
ConcurrentLog.info("DidYouMean", "found and returned " + preSorted.size() + " unsorted suggestions (2); execution time: "
+ (System.currentTimeMillis() - startTime) + "ms");
return preSorted;
}
*/
ConcurrentLog.info("DidYouMean", "found " + preSorted.size() + " unsorted terms, returned " + countSorted.size() + " sorted suggestions; execution time: "
+ (System.currentTimeMillis() - startTime) + "ms");
@ -222,7 +203,7 @@ public class DidYouMean {
* @param preSortSelection
* @return
*/
private static SortedSet<StringBuilder> getSuggestions(final String head, final String tail, final long timeout, final int preSortSelection, final Segment segment) {
private static Collection<StringBuilder> getSuggestions(final String head, final String tail, final long timeout, final int preSortSelection, final Segment segment) {
final SortedSet<StringBuilder> result = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
int count = 30;
final SolrQuery solrQuery = new SolrQuery();
@ -239,7 +220,6 @@ public class DidYouMean {
solrQuery.addHighlightField(CollectionSchema.title.getSolrFieldName());
solrQuery.addHighlightField(CollectionSchema.text_t.getSolrFieldName());
solrQuery.setFields(); // no fields wanted! only snippets
//List<String> snippets = new ArrayList<String>();
OrderedScoreMap<String> snippets = new OrderedScoreMap<String>(null);
try {
QueryResponse response = segment.fulltext().getDefaultConnector().getResponseByParams(solrQuery);
@ -313,108 +293,68 @@ public class DidYouMean {
private SortedSet<StringBuilder> getSuggestions(final long timeout) {
final long startTime = System.currentTimeMillis();
this.timeLimit = startTime + timeout;
// create one consumer thread that checks the guessLib queue
// for occurrences in the index. If the producers are started next, their
// results can be consumed directly
final Consumer[] consumers = new Consumer[AVAILABLE_CPU];
consumers[0] = new Consumer();
consumers[0].start();
// get a single recommendation for the word without altering the word
final Set<StringBuilder> libr = LibraryProvider.dymLib.recommend(this.word);
for (final StringBuilder t: libr) {
if (!t.equals(this.word)) {
try {
this.guesses.put(t);
} catch (final InterruptedException e) {}
Thread[] producers = null;
if (this.more) {
// create and start producers
// the CPU load to create the guessed words is very low, but the testing
// against the library may be CPU intensive. Since it is possible to test
// words in the library concurrently, it is a good idea to start separate threads
producers = new Thread[4];
producers[0] = new ChangingOneLetter();
producers[1] = new AddingOneLetter();
producers[2] = new DeletingOneLetter();
producers[3] = new ReversingTwoConsecutiveLetters();
for (final Thread t: producers) {
t.start();
}
}
// create and start producers
// the CPU load to create the guessed words is very low, but the testing
// against the library may be CPU intensive. Since it is possible to test
// words in the library concurrently, it is a good idea to start separate threads
final Thread[] producers = new Thread[4];
producers[0] = new ChangingOneLetter();
producers[1] = new AddingOneLetter();
producers[2] = new DeletingOneLetter();
producers[3] = new ReversingTwoConsecutiveLetters();
for (final Thread t: producers) {
t.start();
}
// start more consumers if there are more cores
if (consumers.length > 1) {
for (int i = 1; i < consumers.length; i++) {
consumers[i] = new Consumer();
consumers[i].start();
test(this.word);
this.resultSet.addAll(getSuggestions(this.word.toString(), "", timeout, 10, this.segment));
if (this.more) {
// finish the producer
for (final Thread t: producers) {
long wait = this.timeLimit - System.currentTimeMillis();
if (wait > 0) try {
t.join(wait);
} catch (final InterruptedException e) {}
}
}
// now decide which kind of guess is better
// we take guessLib entries as long as there is any entry in it
// to see if this is the case, we must wait for termination of the producer
for (final Thread t: producers) {
long wait = this.timeLimit - System.currentTimeMillis();
if (wait > 0) try {
t.join(wait);
} catch (final InterruptedException e) {}
}
// put poison into guessLib to terminate consumers
for (@SuppressWarnings("unused") final Consumer c: consumers) {
try { this.guesses.put(POISON_STRING); } catch (final InterruptedException e) {}
}
// wait for termination of consumer
for (final Consumer c: consumers) {
long wait = this.timeLimit - System.currentTimeMillis();
if (wait > 0) try {
c.join(wait);
} catch (final InterruptedException e) {}
if (c.isAlive()) c.interrupt();
}
// we don't want the given word in the result
this.resultSet.remove(this.word);
return this.resultSet;
}
private void test(final StringBuilder s) throws InterruptedException {
private void test(final StringBuilder s) {
final Set<StringBuilder> libr = LibraryProvider.dymLib.recommend(s);
libr.addAll(LibraryProvider.geoLoc.recommend(s));
for (final StringBuilder t: libr) {
this.guesses.put(t);
if (t.length() >= MinimumOutputWordLength) this.resultSet.add(t);
}
this.guesses.add(s);
if (s.length() >= MinimumOutputWordLength) this.resultSet.add(s);
}
/**
* DidYouMean's producer thread that changes one letter (e.g. bat/cat) for a given term
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
*/
public class ChangingOneLetter extends Thread {
@Override
public void run() {
char m;
for (int i = 0; i < DidYouMean.this.wordLen; i++) {
try {
m = DidYouMean.this.word.charAt(i);
for (final char c: DidYouMean.this.alphabet) {
if (m != c) {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i + 1));
test(ts);
}
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
return;
}
m = DidYouMean.this.word.charAt(i);
for (final char c: DidYouMean.this.alphabet) {
if (m != c) {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i + 1));
test(ts);
}
} catch (final InterruptedException e) {}
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
}
}
}
}
@ -425,20 +365,14 @@ public class DidYouMean {
* <b>Note:</b> the loop runs (len) tests.
*/
private class DeletingOneLetter extends Thread {
@Override
public void run() {
for (int i = 0; i < DidYouMean.this.wordLen; i++) {
try {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.substring(i + 1));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
return;
}
} catch (final InterruptedException e) {}
}
@Override
public void run() {
for (int i = 0; i < DidYouMean.this.wordLen; i++) {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.substring(i + 1));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
}
}
}
/**
@ -447,21 +381,16 @@ public class DidYouMean {
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
*/
private class AddingOneLetter extends Thread {
@Override
public void run() {
for (int i = 0; i <= DidYouMean.this.wordLen; i++) {
try {
for (final char c: DidYouMean.this.alphabet) {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
return;
}
}
} catch (final InterruptedException e) {}
@Override
public void run() {
for (int i = 0; i <= DidYouMean.this.wordLen; i++) {
for (final char c: DidYouMean.this.alphabet) {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
}
}
}
}
/**
@ -470,68 +399,21 @@ public class DidYouMean {
* <b>Note:</b> the loop runs (len-1) tests.
*/
private class ReversingTwoConsecutiveLetters extends Thread {
@Override
public void run() {
for (int i = 0; i < DidYouMean.this.wordLen - 1; i++) {
try {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.charAt(i + 1)).append(DidYouMean.this.word.charAt(i)).append(DidYouMean.this.word.substring(i + 2));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
return;
}
} catch (final InterruptedException e) {}
}
}
}
/**
* DidYouMean's consumer thread takes a String object (term) from the blocking queue
* and checks if it is contained in YaCy's RWI index.
* <b>Note:</b> this causes no or moderate i/o as it uses the efficient index.has() method.
*/
private class Consumer extends Thread {
@Override
public void run() {
StringBuilder s;
try {
while ((s = DidYouMean.this.guesses.take()) != POISON_STRING) {
if (s.length() >= MinimumOutputWordLength) {
DidYouMean.this.resultSet.add(s);
}
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
return;
}
}
} catch (final InterruptedException e) {}
}
}
/**
* indexSizeComparator is used by DidYouMean to order terms by index.count()
* <b>Warning:</b> this causes heavy i/o
*/
private class indexSizeComparator implements Comparator<StringBuilder> {
@Override
public int compare(final StringBuilder o1, final StringBuilder o2) {
final int i1 = DidYouMean.this.segment.getWordCountGuess(o1.toString());
final int i2 = DidYouMean.this.segment.getWordCountGuess(o2.toString());
if (i1 == i2) {
return WORD_LENGTH_COMPARATOR.compare(o1, o2);
public void run() {
for (int i = 0; i < DidYouMean.this.wordLen - 1; i++) {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.charAt(i + 1)).append(DidYouMean.this.word.charAt(i)).append(DidYouMean.this.word.substring(i + 2));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
}
return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
}
}
}
/**
* wordLengthComparator is used by DidYouMean to order terms by the term length
* This is the default order if the indexSizeComparator is not used
*/
private static class wordLengthComparator implements Comparator<StringBuilder> {
@Override
public int compare(final StringBuilder o1, final StringBuilder o2) {
final int i1 = o1.length();
@ -541,7 +423,6 @@ public class DidYouMean {
}
return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first
}
}
/**

Loading…
Cancel
Save