Fix for DidYouMean; also added more Asian alphabets

pull/1/head
Michael Peter Christen 11 years ago
parent 90c8577840
commit d328cc4a83

@ -44,18 +44,28 @@ public class DidYouMean {
'\u00e8','\u00e9','\u00ea','\u00eb','\u00ec','\u00ed','\u00ee','\u00ef',
'\u00f0','\u00f1','\u00f2','\u00f3','\u00f4','\u00f5','\u00f6',
'\u00f8','\u00f9','\u00fa','\u00fb','\u00fc','\u00fd','\u00fe','\u00ff'};
// Alphabets made of contiguous Unicode ranges. Every array is sized to hold its
// range(s) exactly; the contents are assigned in the static initializer.
private static final char[] ALPHABET_KANJI = new char[512]; // U+3400-U+34FF followed by U+4E00-U+4EFF (2 x 256)
private static final char[] ALPHABET_HIRAGANA = new char[96]; // U+3040-U+309F
private static final char[] ALPHABET_KATAKANA = new char[96]; // U+30A0-U+30FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1 = new char[5376]; // U+4E00-U+62FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2 = new char[5376]; // U+6300-U+77FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3 = new char[5376]; // U+7800-U+8CFF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4 = new char[4864]; // U+8D00-U+9FFF
static {
    // Fill every alphabet with its Unicode range. The array index is the plain offset
    // of the character from the start of its range. No 0xff mask is applied: masking
    // would wrap the index into 0..255 and leave every slot above 255 as '\0' in the
    // arrays that are longer than 256 entries.

    // this is very experimental: a very small subset of Kanji
    for (char a = '\u3400'; a <= '\u34ff'; a++) {
        ALPHABET_KANJI[a - '\u3400'] = a;
    }
    // The second Kanji range occupies the upper half of the array. The offset is
    // parenthesized explicitly because '+' binds tighter than '&' / '-' grouping mistakes
    // here would silently overwrite the lower half.
    for (char a = '\u4e00'; a <= '\u4eff'; a++) {
        ALPHABET_KANJI[256 + (a - '\u4e00')] = a;
    }

    for (char a = '\u3040'; a <= '\u309F'; a++) {
        ALPHABET_HIRAGANA[a - '\u3040'] = a;
    }
    for (char a = '\u30A0'; a <= '\u30FF'; a++) {
        ALPHABET_KATAKANA[a - '\u30A0'] = a;
    }

    for (char a = '\u4E00'; a <= '\u62FF'; a++) {
        ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1[a - '\u4E00'] = a;
    }
    for (char a = '\u6300'; a <= '\u77FF'; a++) {
        ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2[a - '\u6300'] = a;
    }
    for (char a = '\u7800'; a <= '\u8CFF'; a++) {
        ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3[a - '\u7800'] = a;
    }
    for (char a = '\u8D00'; a <= '\u9FFF'; a++) {
        ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4[a - '\u8D00'] = a;
    }
}
// All known alphabets. They are probed in this order and the first one that
// matches the test character is used.
private static final char[][] ALPHABETS = {
    ALPHABET_LATIN, ALPHABET_KANJI, ALPHABET_HIRAGANA, ALPHABET_KATAKANA,
    ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2,
    ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4};
// NOTE(review): presumably an end-of-input marker for a work queue; confirm against its users
private static final StringBuilder POISON_STRING = new StringBuilder("\n");
// number of processors reported by the runtime when this class is initialized
public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors();
private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();
@ -92,11 +102,16 @@ public class DidYouMean {
alphatest: for (final char[] alpha: ALPHABETS) {
if (isAlphabet(alpha, testchar)) {
this.alphabet = new char[alpha.length];
System.arraycopy(ALPHABET_LATIN, 0, this.alphabet, 0, alpha.length);
System.arraycopy(alpha, 0, this.alphabet, 0, alpha.length);
alphafound = true;
break alphatest;
}
}
if (!alphafound && testchar < 'A') {
this.alphabet = new char[ALPHABET_LATIN.length];
System.arraycopy(ALPHABET_LATIN, 0, this.alphabet, 0, ALPHABET_LATIN.length);
alphafound = true;
}
if (!alphafound) {
// generate generic alphabet using simply a character block of 256 characters
final int firstchar = (0xff & (testchar / 256)) * 256;

@ -424,6 +424,7 @@ public class Segment {
try {
return (int) this.fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.text_t.getSolrFieldName() + ":\"" + word + "\"");
} catch (final Throwable e) {
ConcurrentLog.warn("Segment", "problem with word guess for word: " + word);
ConcurrentLog.logException(e);
return 0;
}

Loading…
Cancel
Save