diff --git a/source/net/yacy/document/WordCache.java b/source/net/yacy/document/WordCache.java index f12c0ea1f..37539735e 100644 --- a/source/net/yacy/document/WordCache.java +++ b/source/net/yacy/document/WordCache.java @@ -33,6 +33,7 @@ import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.zip.GZIPInputStream; @@ -53,9 +54,111 @@ public class WordCache { // dictionaries private final File dictionaryPath; - private TreeSet dict; // the word dictionary - private TreeSet tcid; // the dictionary of reverse words + final Map dictionaries; + public static class Dictionary { + + private TreeSet dict; // the word dictionary + private TreeSet tcid; // the dictionary of reverse words + + public Dictionary(final File file) throws IOException { + this.dict = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); + this.tcid = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); + + InputStream is = new FileInputStream(file); + if (file.getName().endsWith(".gz")) { + is = new GZIPInputStream(is); + } + final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); + String l; + StringBuilder sb; + try { + while ((l = reader.readLine()) != null) { + if (l.length() == 0 || l.charAt(0) == '#') continue; + l = l.trim().toLowerCase(); + if (l.length() < 4) continue; + sb = new StringBuilder(l); + this.dict.add(sb); + this.tcid.add(reverse(sb)); + } + } catch (final IOException e) { + // finish + } + } + + /** + * read the dictionary and construct a set of recommendations to a given string + * @param s input value that is used to match recommendations + * @return set that contains all words that start or end with the input value + */ + public Set recommend(StringBuilder string) { + final Set ret = new HashSet(); + SortedSet t = this.dict.tailSet(string); + for (final StringBuilder r: t) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(r); else break; + } + string = reverse(string); + t = this.tcid.tailSet(string); + for (final StringBuilder r: t) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(reverse(r)); else break; + } + return ret; + } + + /** + * check if the library contains the given word + * @param s the given word + * @return true if the library contains the word + */ + public boolean contains(final StringBuilder s) { + return this.dict.contains(s); + // if the above case is true then it is also true for this.tcid and vice versa + // that means it does not need to be tested as well + } + + /** + * check if the library supports the given word + * A word is supported, if the library contains a word + * that starts or ends with the given word + * @param s the given word + * @return true if the library supports the word + */ + public boolean supports(StringBuilder string) { + SortedSet t = this.dict.tailSet(string); + for (final StringBuilder r: t) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break; + } + string = reverse(string); + t = this.tcid.tailSet(string); + for (final StringBuilder r: t) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break; + } + return false; + } + + /** + * the size of the dictionary + * @return the number of words in the dictionary + */ + public int size() { + return this.dict.size(); + } + + + /** + * a property that is used during the construction of recommendation: + * if the dictionary is too small, then the non-existence of constructed words + * is not relevant for the construction of artificially constructed words + * If this property returns true, all other words must be in the dictionary + * @param minimumWords + * @return + */ + public boolean isRelevant(final int minimumWords) { + return this.dict.size() >= minimumWords; + } + + } + /** * create a new dictionary * This loads all files that ends with '.words' @@ -65,6 +168,7 @@ public class WordCache { */ public WordCache(final File dictionaryPath) { this.dictionaryPath = dictionaryPath; + this.dictionaries = new ConcurrentHashMap(); reload(); } @@ -86,40 +190,18 @@ public class WordCache { * scan the input directory and load all dictionaries (again) */ public void reload() { - this.dict = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); - this.tcid = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); if (this.dictionaryPath == null || !this.dictionaryPath.exists()) return; final String[] files = this.dictionaryPath.list(); for (final String f: files) { if (f.endsWith(".words")) try { - inputStream(new File(this.dictionaryPath, f)); + Dictionary dict = new Dictionary(new File(this.dictionaryPath, f)); + this.dictionaries.put(f.substring(0, f.length() - 6), dict); } catch (final IOException e) { Log.logException(e); } } } - private void inputStream(final File file) throws IOException { - InputStream is = new FileInputStream(file); - if (file.getName().endsWith(".gz")) { - is = new GZIPInputStream(is); - } - final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); - String l; - StringBuilder sb; - try { - while ((l = reader.readLine()) != null) { - if (l.length() == 0 || l.charAt(0) == '#') continue; - l = l.trim().toLowerCase(); - if (l.length() < 4) continue; - sb = new StringBuilder(l); - this.dict.add(sb); - this.tcid.add(reverse(sb)); - } - } catch (final IOException e) { - // finish - } - } private static StringBuilder reverse(final StringBuilder s) { final StringBuilder sb = new StringBuilder(s.length()); @@ -135,10 +217,9 @@ public class WordCache { * @return set that contains all words that start or end with the input value */ public Set recommend(StringBuilder string) { - final Set ret = new HashSet(); - SortedSet t = this.dict.tailSet(string); - for (final StringBuilder r: t) { - if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(r); else break; + Set ret = new HashSet(); + for (Dictionary dict: this.dictionaries.values()) { + ret.addAll(dict.recommend(string)); } final SortedMap u = commonWords.tailMap(string); StringBuilder vv; @@ -148,11 +229,6 @@ public class WordCache { if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(vv, string) && vv.length() > string.length()) ret.add(vv); else break; } } catch (final ConcurrentModificationException e) {} - string = reverse(string); - t = this.tcid.tailSet(string); - for (final StringBuilder r: t) { - if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(reverse(r)); else break; - } return ret; } @@ -162,9 +238,10 @@ public class WordCache { * @return true if the library contains the word */ public boolean contains(final StringBuilder s) { - return this.dict.contains(s); - // if the above case is true then it is also true for this.tcid and vice versa - // that means it does not need to be tested as well + for (Dictionary dict: this.dictionaries.values()) { + if (dict.contains(s)) return true; + } + return false; } /** @@ -175,27 +252,24 @@ public class WordCache { * @return true if the library supports the word */ public boolean supports(StringBuilder string) { - SortedSet t = this.dict.tailSet(string); - for (final StringBuilder r: t) { - if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break; - } - string = reverse(string); - t = this.tcid.tailSet(string); - for (final StringBuilder r: t) { - if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break; + for (Dictionary dict: this.dictionaries.values()) { + if (dict.supports(string)) return true; } return false; } /** - * the size of the dictionay + * the size of the dictionary * @return the number of words in the dictionary */ public int size() { - return this.dict.size(); + int size = 0; + for (Dictionary dict: this.dictionaries.values()) { + size += dict.size(); + } + return size; } - /** * a property that is used during the construction of recommendation: * if the dictionary is too small, then the non-existence of constructed words @@ -205,7 +279,10 @@ public class WordCache { * @return */ public boolean isRelevant(final int minimumWords) { - return this.dict.size() >= minimumWords; + for (Dictionary dict: this.dictionaries.values()) { + if (dict.isRelevant(minimumWords)) return true; + } + return false; } }