autocomplete and did-you-mean can now understand _all_ languages and can generate suggestions in all languages and character types

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7242 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent b7acd92ce4
commit a59c885ee0

@ -14,7 +14,7 @@
<script type="text/javascript" src="/yacy/ui/js/jquery.autocomplete.js"></script>
<script type="text/javascript">
$(document).ready(function() {
$('#search').autocomplete('/suggest.json', {parse: opensearch});
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false});
function opensearch(data) {
var parsed = [];
data = eval('({"suggest":' + data + '})');

@ -37,7 +37,7 @@ import de.anomic.server.serverSwitch;
*/
public class suggest {
private static final int meanMax = 10;
private static final int meanMax = 30;
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
@ -59,8 +59,8 @@ public class suggest {
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex());
Iterator<String> meanIt = didYouMean.getSuggestions(querystring, 300, 10).iterator();
DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), querystring);
Iterator<String> meanIt = didYouMean.getSuggestions(300, 10).iterator();
int meanCount = 0;
String suggestion;
StringBuilder suggestions = new StringBuilder(120);

@ -43,6 +43,6 @@
}
.ac_over {
background-color: #0A246A;
background-color: #444444;
color: white;
}

@ -42,7 +42,7 @@
<script type="text/javascript" src="/yacy/ui/js/jquery.autocomplete.js"></script>
<script type="text/javascript">
$(document).ready(function() {
$('#search').autocomplete('/suggest.json', {parse: opensearch});
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false});
function opensearch(data) {
var parsed = [];
data = eval('({"suggest":' + data + '})');

@ -545,8 +545,8 @@ public class yacysearch {
}
prop.put("meanCount", meanMax);
if (meanMax > 0) {
DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex());
Iterator<String> meanIt = didYouMean.getSuggestions(querystring, 300, 10).iterator();
DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), querystring);
Iterator<String> meanIt = didYouMean.getSuggestions(100, 10).iterator();
int meanCount = 0;
String suggestion;
while(meanCount<meanMax && meanIt.hasNext()) {

@ -26,83 +26,103 @@ import net.yacy.kelondro.rwi.IndexCell;
* the above mentioned four categories. Consumer threads check then the generated word variations against a term index.
* Only words contained in the term index are return by the getSuggestion method.<p/>
* @author apfelmaennchen
* @author orbiter (extensions for multi-language support)
*/
public class DidYouMean {
protected static final char[] ALPHABET = {
private static final char[] ALPHABET_LATIN = {
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};
'q','r','s','t','u','v','w','x','y','z',
'\u00df',
'\u00e0','\u00e1','\u00e2','\u00e3','\u00e4','\u00e5','\u00e6','\u00e7',
'\u00e8','\u00e9','\u00ea','\u00eb','\u00ec','\u00ed','\u00ee','\u00ef',
'\u00f0','\u00f1','\u00f2','\u00f3','\u00f4','\u00f5','\u00f6',
'\u00f8','\u00f9','\u00fa','\u00fb','\u00fc','\u00fd','\u00fe','\u00ff'};
private static final char[] ALPHABET_KANJI = new char[512];
static {
// this is very experimental: a very small subset of Kanji
for (char a = '\u3400'; a <= '\u34ff'; a++) ALPHABET_KANJI[0xff & (a - '\u3400')] = a;
for (char a = '\u4e00'; a <= '\u4eff'; a++) ALPHABET_KANJI[0xff & (a - '\u4e00') + 256] = a;
}
private static final char[][] ALPHABETS = {ALPHABET_LATIN, ALPHABET_KANJI};
private static final String POISON_STRING = "\n";
public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors();
protected static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();
private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();
protected final IndexCell<WordReference> index;
protected String word;
protected int wordLen;
protected LinkedBlockingQueue<String> guessGen, guessLib;
protected long timeLimit;
protected boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
protected final SortedSet<String> resultSet;
private final IndexCell<WordReference> index;
private char[] alphabet;
private final String word;
private final int wordLen;
private final LinkedBlockingQueue<String> guessGen, guessLib;
private long timeLimit;
private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
private final SortedSet<String> resultSet;
/**
* @param index a termIndex - most likely retrieved from a switchboard object.
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
*/
public DidYouMean(final IndexCell<WordReference> index) {
public DidYouMean(final IndexCell<WordReference> index, String word0) {
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(WORD_LENGTH_COMPARATOR));
this.word = "";
this.wordLen = 0;
this.word = word0.toLowerCase();
this.wordLen = word.length();
this.index = index;
this.guessGen = new LinkedBlockingQueue<String>();
this.guessLib = new LinkedBlockingQueue<String>();
this.createGen = true;
// identify language
if (this.word.length() == 0) {
this.alphabet = ALPHABET_LATIN;
} else {
char testchar = this.word.charAt(0);
this.alphabet = null;
alphatest: for (char[] alpha: ALPHABETS) {
if (isAlphabet(alpha, testchar)) {
this.alphabet = alpha;
break alphatest;
}
}
if (this.alphabet == null) {
// generate generic alphabet using simply a character block of 256 characters
char firstchar = (char) ((0xff & (testchar / 256)) * 256);
char lastchar = (char) (firstchar + 255);
this.alphabet = new char[256];
for (char a = firstchar; a <= lastchar; a++) {
this.alphabet[0xff & (a - firstchar)] = a;
}
}
}
}
private static final boolean isAlphabet(char[] alpha, char testchar) {
for (char a: alpha) if (a == testchar) return true;
return false;
}
public void reset() {
this.resultSet.clear();
this.guessGen.clear();
this.guessLib.clear();
}
/**
* get a single suggestion
* @param word
* @param timeout
* @return
*/
public String getSuggestion(final String word, long timeout) {
Set<String> s = getSuggestions(word, timeout);
return (s == null || s.isEmpty()) ? null : s.iterator().next();
}
/**
* get a single suggestion with additional sort
* @param word
* @param timeout
* @return
*/
public String getSuggestion(final String word, long timeout, int preSortSelection) {
Set<String> s = getSuggestions(word, timeout, preSortSelection);
return (s == null || s.isEmpty()) ? null : s.iterator().next();
}
/**
* get suggestions for a given word. The result is first ordered using a term size ordering,
* and a subset of the result is sorted again with a IO-intensive order based on the index size
* @param word
* @param word0
* @param timeout
* @param preSortSelection the number of words that participate in the IO-intensive sort
* @return
*/
public SortedSet<String> getSuggestions(final String word, long timeout, int preSortSelection) {
if (word.indexOf(' ') > 0) return getSuggestions(word.split(" "), timeout, preSortSelection, this.index);
long startTime = System.currentTimeMillis();
SortedSet<String> preSorted = getSuggestions(word, timeout);
long timelimit = 2 * System.currentTimeMillis() - startTime + timeout;
public SortedSet<String> getSuggestions(long timeout, int preSortSelection) {
if (this.word.indexOf(' ') > 0) return getSuggestions(this.word.split(" "), timeout, preSortSelection, this.index);
long timelimit = System.currentTimeMillis() + timeout;
SortedSet<String> preSorted = getSuggestions(timeout);
if (System.currentTimeMillis() > timelimit) return preSorted;
SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
int wc = index.count(Word.word2hash(word)); // all counts must be greater than this
int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this
int c0;
for (final String s: preSorted) {
if (System.currentTimeMillis() > timelimit) break;
@ -122,10 +142,10 @@ public class DidYouMean {
* @return
*/
@SuppressWarnings("unchecked")
public static SortedSet<String> getSuggestions(final String[] words, long timeout, int preSortSelection, final IndexCell<WordReference> index) {
private static SortedSet<String> getSuggestions(final String[] words, long timeout, int preSortSelection, final IndexCell<WordReference> index) {
final SortedSet<String>[] s = new SortedSet[words.length];
for (int i = 0; i < words.length; i++) {
s[i] = new DidYouMean(index).getSuggestions(words[i], timeout / words.length, preSortSelection);
s[i] = new DidYouMean(index, words[i]).getSuggestions(timeout / words.length, preSortSelection);
}
// make all permutations
final SortedSet<String> result = new TreeSet<String>();
@ -148,12 +168,10 @@ public class DidYouMean {
* @param timeout execution time in ms.
* @return a Set&lt;String&gt; with word variations contained in term index.
*/
public SortedSet<String> getSuggestions(final String word, long timeout) {
private SortedSet<String> getSuggestions(long timeout) {
long startTime = System.currentTimeMillis();
this.timeLimit = startTime + timeout;
this.word = word.toLowerCase();
this.wordLen = word.length();
// create one consumer thread that checks the guessLib queue
// for occurrences in the index. If the producers are started next, their
// results can be consumers directly
@ -162,9 +180,9 @@ public class DidYouMean {
consumers[0].start();
// get a single recommendation for the word without altering the word
Set<String> libr = LibraryProvider.dymLib.recommend(word);
Set<String> libr = LibraryProvider.dymLib.recommend(this.word);
for (final String t: libr) {
if (!t.equals(word)) try {
if (!t.equals(this.word)) try {
createGen = false;
guessLib.put(t);
} catch (InterruptedException e) {}
@ -209,7 +227,7 @@ public class DidYouMean {
try { c.join(); } catch (InterruptedException e) {}
// we don't want the given word in the result
this.resultSet.remove(word.toLowerCase());
this.resultSet.remove(this.word);
// finished
Log.logInfo("DidYouMean", "found "+this.resultSet.size()+" terms; execution time: "
@ -219,7 +237,7 @@ public class DidYouMean {
}
public void test(final String s) throws InterruptedException {
private void test(final String s) throws InterruptedException {
Set<String> libr = LibraryProvider.dymLib.recommend(s);
libr.addAll(LibraryProvider.geoLoc.recommend(s));
if (!libr.isEmpty()) createGen = false;
@ -241,7 +259,7 @@ public class DidYouMean {
@Override
public void run() {
for (int i = 0; i < wordLen; i++) try {
for (char c: ALPHABET) {
for (char c: alphabet) {
test(word.substring(0, i) + c + word.substring(i + 1));
if (System.currentTimeMillis() > timeLimit) return;
}
@ -254,7 +272,7 @@ public class DidYouMean {
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (len) tests.
*/
protected class DeletingOneLetter extends Thread {
private class DeletingOneLetter extends Thread {
@Override
public void run() {
@ -271,12 +289,12 @@ public class DidYouMean {
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
*/
protected class AddingOneLetter extends Thread {
private class AddingOneLetter extends Thread {
@Override
public void run() {
for (int i = 0; i <= wordLen; i++) try {
for (final char c: ALPHABET) {
for (final char c: alphabet) {
test(word.substring(0, i) + c + word.substring(i));
if (System.currentTimeMillis() > timeLimit) return;
}
@ -289,7 +307,7 @@ public class DidYouMean {
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (len-1) tests.
*/
protected class ReversingTwoConsecutiveLetters extends Thread {
private class ReversingTwoConsecutiveLetters extends Thread {
@Override
public void run() {
@ -306,7 +324,7 @@ public class DidYouMean {
* and checks if it is contained in YaCy's RWI index.
* <b>Note:</b> this causes no or moderate i/o as it uses the efficient index.has() method.
*/
class Consumer extends Thread {
private class Consumer extends Thread {
@Override
public void run() {
@ -324,7 +342,7 @@ public class DidYouMean {
* indexSizeComparator is used by DidYouMean to order terms by index.count()<p/>
* <b>Warning:</b> this causes heavy i/o
*/
protected class indexSizeComparator implements Comparator<String> {
private class indexSizeComparator implements Comparator<String> {
public int compare(final String o1, final String o2) {
final int i1 = index.count(Word.word2hash(o1));
@ -338,7 +356,7 @@ public class DidYouMean {
* wordLengthComparator is used by DidYouMean to order terms by the term length<p/>
* This is the default order if the indexSizeComparator is not used
*/
protected static class wordLengthComparator implements Comparator<String> {
private static class wordLengthComparator implements Comparator<String> {
public int compare(final String o1, final String o2) {
final int i1 = o1.length();

Loading…
Cancel
Save