You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
510 lines
25 KiB
510 lines
25 KiB
package net.yacy.data;
|
|
|
|
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrException;

import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.StringBuilderComparator;
import net.yacy.document.LibraryProvider;
import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionSchema;
|
|
|
|
|
|
/**
|
|
* People make mistakes when they type words.
|
|
* The most common mistakes are the four categories listed below:
|
|
* <ol>
|
|
* <li>Changing one letter: bat / cat;</li>
|
|
* <li>Adding one letter: bat / boat;</li>
|
|
* <li>Deleting one letter: frog / fog; or</li>
|
|
* <li>Reversing two consecutive letters: two / tow.</li>
|
|
* </ol>
|
|
* DidYouMean provides producer threads that generate word variations according to
* the four categories mentioned above. Every generated variation is checked against the word libraries,
* and only variations recommended by a library are returned by the getSuggestions method.<p/>
|
|
* @author apfelmaennchen
|
|
* @author orbiter (extensions for multi-language support + multi-word suggestions)
|
|
*/
|
|
public class DidYouMean {
|
|
|
|
/** Inputs shorter than this many characters get no suggestions at all. */
private static final int MinimumInputWordLength = 2;

/** Library recommendations shorter than this many characters are discarded. */
private static final int MinimumOutputWordLength = 4;

/** Lower-case Latin letters, sharp s and the accented letters U+00E0..U+00FF (without U+00F7). */
private static final char[] ALPHABET_LATIN = {
    'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
    'q','r','s','t','u','v','w','x','y','z',
    '\u00df',
    '\u00e0','\u00e1','\u00e2','\u00e3','\u00e4','\u00e5','\u00e6','\u00e7',
    '\u00e8','\u00e9','\u00ea','\u00eb','\u00ec','\u00ed','\u00ee','\u00ef',
    '\u00f0','\u00f1','\u00f2','\u00f3','\u00f4','\u00f5','\u00f6',
    '\u00f8','\u00f9','\u00fa','\u00fb','\u00fc','\u00fd','\u00fe','\u00ff'};
private static final char[] ALPHABET_KANJI = new char[512]; // U+3400..U+34FF followed by U+4E00..U+4EFF
private static final char[] ALPHABET_HIRAGANA = new char[96]; // U+3040..U+309F
private static final char[] ALPHABET_KATAKANA = new char[96]; // U+30A0..U+30FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1 = new char[5376]; // U+4E00..U+62FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2 = new char[5376]; // U+6300..U+77FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3 = new char[5376]; // U+7800..U+8CFF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4 = new char[4864]; // U+8D00..U+9FFF
static {
    // this is very experimental: a very small subset of Kanji
    fillRange(ALPHABET_KANJI, 0, '\u3400', '\u34ff');
    fillRange(ALPHABET_KANJI, 256, '\u4e00', '\u4eff');
    fillRange(ALPHABET_HIRAGANA, 0, '\u3040', '\u309F');
    fillRange(ALPHABET_KATAKANA, 0, '\u30A0', '\u30FF');
    // The arrays below are longer than 256 entries, so the index must NOT be
    // masked with 0xff: masking made every 256-character page overwrite slots
    // 0..255 and left the rest of each array filled with NUL characters.
    fillRange(ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1, 0, '\u4E00', '\u62FF');
    fillRange(ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2, 0, '\u6300', '\u77FF');
    fillRange(ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3, 0, '\u7800', '\u8CFF');
    fillRange(ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4, 0, '\u8D00', '\u9FFF');
}

/**
 * Writes the consecutive characters first..last (both inclusive) into target,
 * starting at index offset.
 */
private static void fillRange(final char[] target, final int offset, final char first, final char last) {
    // int loop variable: a char counter could never exceed a bound of U+FFFF
    for (int c = first; c <= last; c++) {
        target[offset + (c - first)] = (char) c;
    }
}

/** All known alphabets, in the order in which the constructor probes them. */
private static final char[][] ALPHABETS = {
    ALPHABET_LATIN, ALPHABET_KANJI, ALPHABET_HIRAGANA, ALPHABET_KATAKANA,
    ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4};
public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors();
private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();

/** The index segment that is asked for word counts and full-text snippets. */
private final Segment segment;
/** The trimmed input for which suggestions are computed. */
private final StringBuilder word;
/** True if the untrimmed input ended with a space character. */
private final boolean endsWithSpace;
/** Length of {@link #word}, cached for the producer loops. */
private final int wordLen;
/** Absolute time (System.currentTimeMillis scale) after which the producers stop. */
private long timeLimit;
/** Collected suggestions; written by the producer threads through test(). */
private final SortedSet<StringBuilder> resultSet;
/** The characters the producers use to build word variations. */
private char[] alphabet;
/** True if the segment has a connected, non-empty RWI; only then the producers are started. */
private boolean more;
|
|
|
|
/**
 * Prepares the suggestion computation for one input string.
 * <p>
 * The input is trimmed, and the alphabet used for generating word variations
 * is chosen from the first character of the trimmed input: the first known
 * alphabet containing that character wins; characters below 'A' (digits,
 * punctuation) fall back to the Latin alphabet; any other character selects
 * the 256-character block that contains it.
 *
 * @param segment the index segment used for word counts and full-text queries
 * @param word0 the raw input; a trailing space is remembered before trimming
 */
public DidYouMean(final Segment segment, final String word0) {
    this.endsWithSpace = word0.length() > 0 && word0.charAt(word0.length() - 1) == ' ';
    this.word = new StringBuilder(word0.trim());
    this.resultSet = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(this.word, WORD_LENGTH_COMPARATOR)));
    this.wordLen = this.word.length();
    this.segment = segment;
    this.more = segment.connectedRWI() && segment.RWICount() > 0; // with RWIs connected the guessing is super-fast

    // identify language
    this.alphabet = selectAlphabet(this.word);
}

/**
 * Chooses the alphabet for the given input; never returns null.
 * The returned array is a private copy, so the shared alphabet tables stay untouched.
 */
private static char[] selectAlphabet(final CharSequence input) {
    if (input.length() == 0) {
        // no character to inspect: use Latin instead of leaving the alphabet unset
        return ALPHABET_LATIN.clone();
    }
    // Character.toLowerCase also folds upper-case accented letters onto the
    // Latin alphabet, which the former "+ 32" arithmetic did for A..Z only
    final char testchar = Character.toLowerCase(input.charAt(0));
    for (final char[] alpha : ALPHABETS) {
        if (isAlphabet(alpha, testchar)) {
            return alpha.clone();
        }
    }
    if (testchar < 'A') {
        return ALPHABET_LATIN.clone();
    }
    // generate generic alphabet using simply a character block of 256 characters
    // test this with /suggest.json?q=%EF%BD%84
    final int firstchar = (0xff & (testchar / 256)) * 256;
    final char[] generic = new char[256];
    for (int k = 0; k < generic.length; k++) {
        generic[k] = (char) (firstchar + k);
    }
    return generic;
}
|
|
|
|
private static final boolean isAlphabet(final char[] alpha, final char testchar) {
|
|
for (final char a: alpha) {
|
|
if (a == testchar) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
public void reset() {
|
|
this.resultSet.clear();
|
|
}
|
|
|
|
/**
|
|
* get suggestions for a given word. The result is first ordered using a term size ordering,
|
|
* and a subset of the result is sorted again with a IO-intensive order based on the index size
|
|
* @param word0
|
|
* @param timeout
|
|
* @param preSortSelection the number of words that participate in the IO-intensive sort
|
|
* @return
|
|
*/
|
|
public Collection<StringBuilder> getSuggestions(final long timeout, final int preSortSelection, boolean askIndex) {
|
|
if (this.word.length() < MinimumInputWordLength) {
|
|
return this.resultSet; // return nothing if input is too short
|
|
}
|
|
final long startTime = System.currentTimeMillis();
|
|
final long timelimit = startTime + timeout;
|
|
int lastIndexOfSpace = this.word.lastIndexOf(" ");
|
|
final Collection<StringBuilder> preSorted;
|
|
if (askIndex && lastIndexOfSpace > 0) {
|
|
// several words
|
|
preSorted = getSuggestions(this.word.substring(0, lastIndexOfSpace), this.word.substring(lastIndexOfSpace + 1), timeout, preSortSelection, this.segment);
|
|
} else {
|
|
if (this.endsWithSpace) {
|
|
preSorted = getSuggestions(this.word.toString(), "", timeout, preSortSelection, this.segment);
|
|
} else {
|
|
preSorted = getSuggestions(timeout, askIndex);
|
|
}
|
|
}
|
|
final ReversibleScoreMap<StringBuilder> scored = new ClusteredScoreMap<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
|
|
LinkedHashSet<StringBuilder> countSorted = new LinkedHashSet<StringBuilder>();
|
|
if (this.more) {
|
|
final int wc = this.segment.getWordCountGuess(this.word.toString()); // all counts must be greater than this
|
|
try {
|
|
for (final StringBuilder s: preSorted) {
|
|
if (System.currentTimeMillis() > timelimit) break;
|
|
if (!(scored.sizeSmaller(2 * preSortSelection))) break;
|
|
String s0 = s.toString();
|
|
int wcg = s0.indexOf(' ') > 0 ? s0.length() * 100 : this.segment.getWordCountGuess(s0);
|
|
if (wcg > wc) scored.inc(s, wcg);
|
|
}
|
|
} catch (final ConcurrentModificationException e) {
|
|
}
|
|
Iterator<StringBuilder> i = scored.keys(false);
|
|
while (i.hasNext()) countSorted.add(i.next());
|
|
} else {
|
|
try {
|
|
for (final StringBuilder s: preSorted) {
|
|
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(s, this.word) ||
|
|
StringBuilderComparator.CASE_INSENSITIVE_ORDER.endsWith(this.word, s)) countSorted.add(this.word);
|
|
}
|
|
for (final StringBuilder s: preSorted) {
|
|
if (!StringBuilderComparator.CASE_INSENSITIVE_ORDER.equals(s, this.word)) countSorted.add(s);
|
|
}
|
|
} catch (final ConcurrentModificationException e) {
|
|
}
|
|
}
|
|
|
|
// finished
|
|
ConcurrentLog.info("DidYouMean", "found " + preSorted.size() + " unsorted terms, returned " + countSorted.size() + " sorted suggestions; execution time: "
|
|
+ (System.currentTimeMillis() - startTime) + "ms");
|
|
|
|
return countSorted;
|
|
}
|
|
|
|
/**
|
|
* return a string that is a suggestion list for the list of given words
|
|
* @param head - the sequence of words before the last space in the sequence, fixed (not to be corrected); possibly empty
|
|
* @param tail - the word after the last space, possibly empty or misspelled
|
|
* @param timeout for operation
|
|
* @param preSortSelection - number of suggestions to be computed
|
|
* @return
|
|
*/
|
|
private static Collection<StringBuilder> getSuggestions(final String head, final String tail, final long timeout, final int preSortSelection, final Segment segment) {
|
|
final SortedSet<StringBuilder> result = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
|
|
int count = 30;
|
|
final SolrQuery solrQuery = new SolrQuery();
|
|
solrQuery.setParam("defType", "edismax");
|
|
solrQuery.setFacet(false);
|
|
String q = "", fq = "";
|
|
if (head.length() == 0 && tail.length() > 0) {
|
|
// head == "", tail != "" -> only one word was entered, no space at end
|
|
q = CollectionSchema.title.getSolrFieldName() + ":\"" + tail + "\"^1000.0 " + CollectionSchema.text_t.getSolrFieldName() + ":" + tail + "~";
|
|
fq = null;
|
|
}
|
|
if (head.length() > 0 && tail.length() == 0) {
|
|
// head != "", tail == "" -> only one word was entered and ends on space
|
|
q = CollectionSchema.title.getSolrFieldName() + ":\"" + head + " \"^1000.0 " + CollectionSchema.text_t.getSolrFieldName() + ":\"" + head + " \"";
|
|
fq = CollectionSchema.text_t.getSolrFieldName() + ":\"" + head + " \"";
|
|
}
|
|
if (head.length() > 0 && tail.length() > 0) {
|
|
// head != "", tail != "" -> several words were entered, last one is in tail, everything before in head.
|
|
q = CollectionSchema.text_t.getSolrFieldName() + ":(" + head + " " + tail + ")~"; // for a fuzzy search we cannot apply fuzzyness on the tail only
|
|
fq = CollectionSchema.text_t.getSolrFieldName() + ":\"" + head + "\"";
|
|
}
|
|
solrQuery.setQuery(q);
|
|
if (head.length() > 0 && fq != null) solrQuery.setFilterQueries(fq);
|
|
solrQuery.setStart(0);
|
|
solrQuery.setRows(count);
|
|
solrQuery.setHighlight(true);
|
|
//solrQuery.setHighlightFragsize(head.length() + tail.length() + 180);
|
|
solrQuery.setHighlightSimplePre("<b>");
|
|
solrQuery.setHighlightSimplePost("</b>");
|
|
solrQuery.setHighlightSnippets(5);
|
|
//solrQuery.addHighlightField(CollectionSchema.title.getSolrFieldName());
|
|
solrQuery.addHighlightField(CollectionSchema.text_t.getSolrFieldName());
|
|
solrQuery.setFields(); // no fields wanted! only snippets
|
|
OrderedScoreMap<String> snippets = new OrderedScoreMap<String>(null);
|
|
try {
|
|
QueryResponse response = segment.fulltext().getDefaultConnector().getResponseByParams(solrQuery);
|
|
|
|
/*
|
|
SolrQuery query = new SolrQuery();
|
|
query.setRequestHandler("/suggest");
|
|
//query.setQueryType(suggestHandler);
|
|
query.setQuery((head + " " + tail).trim());
|
|
Map<String,String> params = new HashMap<String,String>();
|
|
params.put(CommonParams.ROWS,Integer.toString(count));
|
|
params.put(SpellingParams.SPELLCHECK_PREFIX + "field",dictionary);
|
|
params.put(SpellingParams.SPELLCHECK_PREFIX + "dictionary",dictionary);
|
|
params.put(SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR,Boolean.toString(onlyMorePopular));
|
|
params.put(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES,Integer.toString(1));
|
|
params.put(SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS,Boolean.toString(collate));
|
|
params.put(SpellingParams.SPELLCHECK_COLLATE,Boolean.toString(collate));
|
|
query.add(new MapSolrParams(params));
|
|
response = segment.fulltext().getDefaultConnector().getResponseByParams(query);
|
|
|
|
SpellCheckResponse spellCheckResponse = response.getSpellCheckResponse();
|
|
if (spellCheckResponse != null) {
|
|
Map<String,Suggestion> suggestionMapInternal = spellCheckResponse.getSuggestionMap();
|
|
if (suggestionMapInternal != null) {
|
|
Map<String, Suggestion> suggestionMap = spellCheckResponse.getSuggestionMap();
|
|
}
|
|
if (spellCheckResponse.getCollatedResult() != null) {
|
|
String collatedResult = spellCheckResponse.getCollatedResult().trim();
|
|
}
|
|
List<Suggestion> suggestions=spellCheckResponse.getSuggestions();
|
|
if (suggestions.size() != 0) {
|
|
StringBuffer sb=new StringBuffer();
|
|
for (Suggestion suggestion : suggestions) {
|
|
sb.append(suggestion.getSuggestions().get(0)).append(" ");
|
|
}
|
|
String spellCheckProposal = sb.toString().trim();
|
|
}
|
|
}
|
|
*/
|
|
|
|
Map<String, Map<String, List<String>>> rawsnippets = response.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets
|
|
if (rawsnippets != null) {
|
|
for (Map<String, List<String>> re: rawsnippets.values()) {
|
|
for (List<String> sl: re.values()) {
|
|
for (String s: sl) {
|
|
// the suggestion for the tail is in the snippet
|
|
s = s.replaceAll("</b> <b>", " ");
|
|
int snippetOpen = s.indexOf("<b>");
|
|
int snippetClose = s.indexOf("</b>");
|
|
if (snippetOpen >= 0 && snippetClose > snippetOpen) {
|
|
String snippet = s.substring(snippetOpen + 3, snippetClose);
|
|
String afterSnippet = s.substring(snippetClose + 4).trim();
|
|
s = snippet + (afterSnippet.length() > 0 ? " " + afterSnippet : "");
|
|
for (int i = 0; i < s.length(); i++) {char c = s.charAt(i); if (c < 'A') s = s.replace(c, ' ');} // remove funny symbols
|
|
s = s.replaceAll("<b>", " ").replaceAll("</b>", " ").replaceAll(" ", " ").trim(); // wipe superfluous whitespace
|
|
String[] sx = CommonPattern.SPACES.split(s);
|
|
StringBuilder sb = new StringBuilder(s.length());
|
|
for (String x: sx) if (x.length() > 1 && sb.length() < 28) sb.append(x).append(' '); else break;
|
|
s = sb.toString().trim();
|
|
if (s.length() > 0) snippets.inc(s, count--);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} catch (SolrException e) {
|
|
e.printStackTrace();
|
|
} catch (IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
// delete all snippets which occur double-times, i.e. one that is a substring of another: remove longer snippet
|
|
Iterator<String> si = snippets.keys(false);
|
|
while (si.hasNext()) {
|
|
String testsnippet = si.next().toLowerCase();
|
|
if (testsnippet.length() > head.length() + tail.length() + 1) {
|
|
Iterator<String> sin = snippets.keys(false);
|
|
while (sin.hasNext()) {
|
|
String snippetx = sin.next();
|
|
if (snippetx.length() != testsnippet.length() && snippetx.toLowerCase().startsWith(testsnippet)) {
|
|
snippets.delete(snippetx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
si = snippets.keys(false);
|
|
while (si.hasNext() && result.size() < preSortSelection) {
|
|
result.add(new StringBuilder(si.next()));
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* This method triggers the producer and consumer threads of the DidYouMean object.
|
|
* @param word a String with a single word
|
|
* @param timeout execution time in ms.
|
|
* @return a Set<String> with word variations contained in term index.
|
|
*/
|
|
private Collection<StringBuilder> getSuggestions(final long timeout, boolean askIndex) {
|
|
final long startTime = System.currentTimeMillis();
|
|
this.timeLimit = startTime + timeout;
|
|
|
|
Thread[] producers = null;
|
|
if (this.more) {
|
|
// create and start producers
|
|
// the CPU load to create the guessed words is very low, but the testing
|
|
// against the library may be CPU intensive. Since it is possible to test
|
|
// words in the library concurrently, it is a good idea to start separate threads
|
|
producers = new Thread[4];
|
|
producers[0] = new ChangingOneLetter();
|
|
producers[1] = new AddingOneLetter();
|
|
producers[2] = new DeletingOneLetter();
|
|
producers[3] = new ReversingTwoConsecutiveLetters();
|
|
for (final Thread t: producers) {
|
|
t.start();
|
|
}
|
|
}
|
|
|
|
test(this.word);
|
|
if (askIndex) this.resultSet.addAll(getSuggestions("", this.word.toString(), timeout, 10, this.segment));
|
|
|
|
if (this.more) {
|
|
// finish the producer
|
|
for (final Thread t: producers) {
|
|
long wait = this.timeLimit - System.currentTimeMillis();
|
|
if (wait > 0) try {
|
|
t.join(wait);
|
|
} catch (final InterruptedException e) {}
|
|
}
|
|
}
|
|
|
|
// we don't want the given word in the result
|
|
this.resultSet.remove(this.word);
|
|
return this.resultSet;
|
|
}
|
|
|
|
private void test(final StringBuilder s) {
|
|
final Set<StringBuilder> libr = LibraryProvider.dymLib.recommend(s);
|
|
libr.addAll(LibraryProvider.geoLoc.recommend(s));
|
|
for (final StringBuilder t: libr) {
|
|
if (t.length() >= MinimumOutputWordLength) this.resultSet.add(t);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* DidYouMean's producer thread that changes one letter (e.g. bat/cat) for a given term
|
|
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
|
|
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
|
|
*/
|
|
public class ChangingOneLetter extends Thread {
|
|
@Override
|
|
public void run() {
|
|
char m;
|
|
for (int i = 0; i < DidYouMean.this.wordLen; i++) {
|
|
m = DidYouMean.this.word.charAt(i);
|
|
for (final char c: DidYouMean.this.alphabet) {
|
|
if (m != c) {
|
|
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i + 1));
|
|
test(ts);
|
|
}
|
|
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term
|
|
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
|
|
* <b>Note:</b> the loop runs (len) tests.
|
|
*/
|
|
private class DeletingOneLetter extends Thread {
|
|
@Override
|
|
public void run() {
|
|
for (int i = 0; i < DidYouMean.this.wordLen; i++) {
|
|
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.substring(i + 1));
|
|
test(ts);
|
|
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term
|
|
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
|
|
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
|
|
*/
|
|
private class AddingOneLetter extends Thread {
|
|
@Override
|
|
public void run() {
|
|
for (int i = 0; i <= DidYouMean.this.wordLen; i++) {
|
|
for (final char c: DidYouMean.this.alphabet) {
|
|
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i));
|
|
test(ts);
|
|
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term
|
|
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
|
|
* <b>Note:</b> the loop runs (len-1) tests.
|
|
*/
|
|
private class ReversingTwoConsecutiveLetters extends Thread {
|
|
@Override
|
|
public void run() {
|
|
for (int i = 0; i < DidYouMean.this.wordLen - 1; i++) {
|
|
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.charAt(i + 1)).append(DidYouMean.this.word.charAt(i)).append(DidYouMean.this.word.substring(i + 2));
|
|
test(ts);
|
|
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* wordLengthComparator is used by DidYouMean to order terms by the term length
|
|
* This is the default order if the indexSizeComparator is not used
|
|
*/
|
|
private static class wordLengthComparator implements Comparator<StringBuilder> {
|
|
@Override
|
|
public int compare(final StringBuilder o1, final StringBuilder o2) {
|
|
final int i1 = o1.length();
|
|
final int i2 = o2.length();
|
|
if (i1 == i2) {
|
|
return StringBuilderComparator.CASE_INSENSITIVE_ORDER.compare(o1, o2);
|
|
}
|
|
return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first
|
|
}
|
|
}
|
|
|
|
/**
|
|
* headMatchingComparator is used to sort results in such a way that words that match with the given words are sorted first
|
|
*/
|
|
private static class headMatchingComparator implements Comparator<StringBuilder> {
|
|
private final StringBuilder head;
|
|
private final Comparator<StringBuilder> secondaryComparator;
|
|
public headMatchingComparator(final StringBuilder head, final Comparator<StringBuilder> secondaryComparator) {
|
|
this.head = head;
|
|
this.secondaryComparator = secondaryComparator;
|
|
}
|
|
|
|
@Override
|
|
public int compare(final StringBuilder o1, final StringBuilder o2) {
|
|
final boolean o1m = StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(o1, this.head);
|
|
final boolean o2m = StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(o2, this.head);
|
|
if ((o1m && o2m) || (!o1m && !o2m)) {
|
|
return this.secondaryComparator.compare(o1, o2);
|
|
}
|
|
return o1m ? -1 : 1;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|