redesign of WordCache to be prepared to hold multiple

independent dictionaries. Such dictionaries can then be also used as
simplified vocabularies.
pull/1/head
Michael Peter Christen 13 years ago
parent 972d411d99
commit 91940fdf56

@ -33,6 +33,7 @@ import java.util.Set;
import java.util.SortedMap; import java.util.SortedMap;
import java.util.SortedSet; import java.util.SortedSet;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
@ -53,8 +54,110 @@ public class WordCache {
// dictionaries // dictionaries
private final File dictionaryPath; private final File dictionaryPath;
private TreeSet<StringBuilder> dict; // the word dictionary final Map<String, Dictionary> dictionaries;
private TreeSet<StringBuilder> tcid; // the dictionary of reverse words
public static class Dictionary {
private TreeSet<StringBuilder> dict; // the word dictionary
private TreeSet<StringBuilder> tcid; // the dictionary of reverse words
public Dictionary(final File file) throws IOException {
this.dict = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
this.tcid = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
InputStream is = new FileInputStream(file);
if (file.getName().endsWith(".gz")) {
is = new GZIPInputStream(is);
}
final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
String l;
StringBuilder sb;
try {
while ((l = reader.readLine()) != null) {
if (l.length() == 0 || l.charAt(0) == '#') continue;
l = l.trim().toLowerCase();
if (l.length() < 4) continue;
sb = new StringBuilder(l);
this.dict.add(sb);
this.tcid.add(reverse(sb));
}
} catch (final IOException e) {
// finish
}
}
/**
* read the dictionary and construct a set of recommendations to a given string
* @param s input value that is used to match recommendations
* @return set that contains all words that start or end with the input value
*/
public Set<StringBuilder> recommend(StringBuilder string) {
final Set<StringBuilder> ret = new HashSet<StringBuilder>();
SortedSet<StringBuilder> t = this.dict.tailSet(string);
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(r); else break;
}
string = reverse(string);
t = this.tcid.tailSet(string);
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(reverse(r)); else break;
}
return ret;
}
/**
* check if the library contains the given word
* @param s the given word
* @return true if the library contains the word
*/
public boolean contains(final StringBuilder s) {
return this.dict.contains(s);
// if the above case is true then it is also true for this.tcid and vice versa
// that means it does not need to be tested as well
}
/**
* check if the library supports the given word
* A word is supported, if the library contains a word
* that starts or ends with the given word
* @param s the given word
* @return true if the library supports the word
*/
public boolean supports(StringBuilder string) {
SortedSet<StringBuilder> t = this.dict.tailSet(string);
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break;
}
string = reverse(string);
t = this.tcid.tailSet(string);
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break;
}
return false;
}
/**
* the size of the dictionary
* @return the number of words in the dictionary
*/
public int size() {
return this.dict.size();
}
/**
* a property that is used during the construction of recommendation:
* if the dictionary is too small, then the non-existence of constructed words
* is not relevant for the construction of artificially constructed words
* If this property returns true, all other words must be in the dictionary
* @param minimumWords
* @return
*/
public boolean isRelevant(final int minimumWords) {
return this.dict.size() >= minimumWords;
}
}
/** /**
* create a new dictionary * create a new dictionary
@ -65,6 +168,7 @@ public class WordCache {
*/ */
public WordCache(final File dictionaryPath) { public WordCache(final File dictionaryPath) {
this.dictionaryPath = dictionaryPath; this.dictionaryPath = dictionaryPath;
this.dictionaries = new ConcurrentHashMap<String, Dictionary>();
reload(); reload();
} }
@ -86,40 +190,18 @@ public class WordCache {
* scan the input directory and load all dictionaries (again) * scan the input directory and load all dictionaries (again)
*/ */
public void reload() { public void reload() {
this.dict = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
this.tcid = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
if (this.dictionaryPath == null || !this.dictionaryPath.exists()) return; if (this.dictionaryPath == null || !this.dictionaryPath.exists()) return;
final String[] files = this.dictionaryPath.list(); final String[] files = this.dictionaryPath.list();
for (final String f: files) { for (final String f: files) {
if (f.endsWith(".words")) try { if (f.endsWith(".words")) try {
inputStream(new File(this.dictionaryPath, f)); Dictionary dict = new Dictionary(new File(this.dictionaryPath, f));
this.dictionaries.put(f.substring(0, f.length() - 6), dict);
} catch (final IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
} }
} }
} }
private void inputStream(final File file) throws IOException {
InputStream is = new FileInputStream(file);
if (file.getName().endsWith(".gz")) {
is = new GZIPInputStream(is);
}
final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
String l;
StringBuilder sb;
try {
while ((l = reader.readLine()) != null) {
if (l.length() == 0 || l.charAt(0) == '#') continue;
l = l.trim().toLowerCase();
if (l.length() < 4) continue;
sb = new StringBuilder(l);
this.dict.add(sb);
this.tcid.add(reverse(sb));
}
} catch (final IOException e) {
// finish
}
}
private static StringBuilder reverse(final StringBuilder s) { private static StringBuilder reverse(final StringBuilder s) {
final StringBuilder sb = new StringBuilder(s.length()); final StringBuilder sb = new StringBuilder(s.length());
@ -135,10 +217,9 @@ public class WordCache {
* @return set that contains all words that start or end with the input value * @return set that contains all words that start or end with the input value
*/ */
public Set<StringBuilder> recommend(StringBuilder string) { public Set<StringBuilder> recommend(StringBuilder string) {
final Set<StringBuilder> ret = new HashSet<StringBuilder>(); Set<StringBuilder> ret = new HashSet<StringBuilder>();
SortedSet<StringBuilder> t = this.dict.tailSet(string); for (Dictionary dict: this.dictionaries.values()) {
for (final StringBuilder r: t) { ret.addAll(dict.recommend(string));
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(r); else break;
} }
final SortedMap<StringBuilder, AtomicInteger> u = commonWords.tailMap(string); final SortedMap<StringBuilder, AtomicInteger> u = commonWords.tailMap(string);
StringBuilder vv; StringBuilder vv;
@ -148,11 +229,6 @@ public class WordCache {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(vv, string) && vv.length() > string.length()) ret.add(vv); else break; if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(vv, string) && vv.length() > string.length()) ret.add(vv); else break;
} }
} catch (final ConcurrentModificationException e) {} } catch (final ConcurrentModificationException e) {}
string = reverse(string);
t = this.tcid.tailSet(string);
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(reverse(r)); else break;
}
return ret; return ret;
} }
@ -162,9 +238,10 @@ public class WordCache {
* @return true if the library contains the word * @return true if the library contains the word
*/ */
public boolean contains(final StringBuilder s) { public boolean contains(final StringBuilder s) {
return this.dict.contains(s); for (Dictionary dict: this.dictionaries.values()) {
// if the above case is true then it is also true for this.tcid and vice versa if (dict.contains(s)) return true;
// that means it does not need to be tested as well }
return false;
} }
/** /**
@ -175,27 +252,24 @@ public class WordCache {
* @return true if the library supports the word * @return true if the library supports the word
*/ */
public boolean supports(StringBuilder string) { public boolean supports(StringBuilder string) {
SortedSet<StringBuilder> t = this.dict.tailSet(string); for (Dictionary dict: this.dictionaries.values()) {
for (final StringBuilder r: t) { if (dict.supports(string)) return true;
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break;
}
string = reverse(string);
t = this.tcid.tailSet(string);
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break;
} }
return false; return false;
} }
/** /**
* the size of the dictionay * the size of the dictionary
* @return the number of words in the dictionary * @return the number of words in the dictionary
*/ */
public int size() { public int size() {
return this.dict.size(); int size = 0;
for (Dictionary dict: this.dictionaries.values()) {
size += dict.size();
}
return size;
} }
/** /**
* a property that is used during the construction of recommendation: * a property that is used during the construction of recommendation:
* if the dictionary is too small, then the non-existence of constructed words * if the dictionary is too small, then the non-existence of constructed words
@ -205,7 +279,10 @@ public class WordCache {
* @return * @return
*/ */
public boolean isRelevant(final int minimumWords) { public boolean isRelevant(final int minimumWords) {
return this.dict.size() >= minimumWords; for (Dictionary dict: this.dictionaries.values()) {
if (dict.isRelevant(minimumWords)) return true;
}
return false;
} }
} }

Loading…
Cancel
Save