Dictionaries from words caches can now be used as autotagging vocabulary

pull/1/head
Michael Christen 13 years ago
parent 91940fdf56
commit eaec14ecc4

@ -28,6 +28,7 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import net.yacy.document.WordCache.Dictionary;
import net.yacy.document.geolocalization.Localization; import net.yacy.document.geolocalization.Localization;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -88,11 +89,15 @@ public class Autotagging {
} }
} }
/* public void addDictionaries(Map<String, Dictionary> dictionaries) {
public void addDidYouMean(WordCache wordCache) { for (Map.Entry<String, Dictionary> entry: dictionaries.entrySet()) {
Vocabulary voc = new Vocabulary(entry.getKey(), entry.getValue());
this.vocabularies.put(entry.getKey(), voc);
for (String t: voc.tags()) {
this.allTags.put(t, PRESENT);
}
}
} }
*/
public void addLocalization(Localization localization) { public void addLocalization(Localization localization) {
Vocabulary voc = new Vocabulary("Locale", localization); Vocabulary voc = new Vocabulary("Locale", localization);
@ -173,6 +178,17 @@ public class Autotagging {
} }
} }
/**
 * Creates a vocabulary with the given name and fills it with every word
 * of the given dictionary.
 * For each word, the lower-cased form is used as the tag and the word as
 * delivered by the dictionary is used as the print name; both directions
 * (tag to print name, print name to tag) are registered.
 *
 * @param name       the vocabulary name, handed to the single-argument constructor
 * @param dictionary the dictionary whose {@code getWords()} entries are added
 */
public Vocabulary(String name, Dictionary dictionary) {
    this(name);
    for (final StringBuilder entry : dictionary.getWords()) {
        final String printName = entry.toString();
        // lower-case once and reuse it for both mappings
        final String tag = printName.toLowerCase();
        this.tag2print.put(tag, printName);
        this.print2tag.put(printName, tag);
    }
}
public String getName() { public String getName() {
return this.navigatorName; return this.navigatorName;
} }

@ -112,6 +112,7 @@ public class LibraryProvider
integrateGeonames(); integrateGeonames();
initAutotagging(tagPrefix); initAutotagging(tagPrefix);
autotagging.addLocalization(geoLoc); autotagging.addLocalization(geoLoc);
autotagging.addDictionaries(dymLib.getDictionaries());
} }
public static void integrateOpenGeoDB() { public static void integrateOpenGeoDB() {

@ -58,9 +58,9 @@ public class WordCache {
public static class Dictionary { public static class Dictionary {
private TreeSet<StringBuilder> dict; // the word dictionary private final TreeSet<StringBuilder> dict; // the word dictionary
private TreeSet<StringBuilder> tcid; // the dictionary of reverse words private final TreeSet<StringBuilder> tcid; // the dictionary of reverse words
public Dictionary(final File file) throws IOException { public Dictionary(final File file) throws IOException {
this.dict = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER); this.dict = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
this.tcid = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER); this.tcid = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
@ -74,9 +74,13 @@ public class WordCache {
StringBuilder sb; StringBuilder sb;
try { try {
while ((l = reader.readLine()) != null) { while ((l = reader.readLine()) != null) {
if (l.length() == 0 || l.charAt(0) == '#') continue; if (l.length() == 0 || l.charAt(0) == '#') {
continue;
}
l = l.trim().toLowerCase(); l = l.trim().toLowerCase();
if (l.length() < 4) continue; if (l.length() < 4) {
continue;
}
sb = new StringBuilder(l); sb = new StringBuilder(l);
this.dict.add(sb); this.dict.add(sb);
this.tcid.add(reverse(sb)); this.tcid.add(reverse(sb));
@ -85,7 +89,7 @@ public class WordCache {
// finish // finish
} }
} }
/** /**
* read the dictionary and construct a set of recommendations to a given string * read the dictionary and construct a set of recommendations to a given string
* @param s input value that is used to match recommendations * @param s input value that is used to match recommendations
@ -95,12 +99,20 @@ public class WordCache {
final Set<StringBuilder> ret = new HashSet<StringBuilder>(); final Set<StringBuilder> ret = new HashSet<StringBuilder>();
SortedSet<StringBuilder> t = this.dict.tailSet(string); SortedSet<StringBuilder> t = this.dict.tailSet(string);
for (final StringBuilder r: t) { for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(r); else break; if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) {
ret.add(r);
} else {
break;
}
} }
string = reverse(string); string = reverse(string);
t = this.tcid.tailSet(string); t = this.tcid.tailSet(string);
for (final StringBuilder r: t) { for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(reverse(r)); else break; if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) {
ret.add(reverse(r));
} else {
break;
}
} }
return ret; return ret;
} }
@ -126,12 +138,20 @@ public class WordCache {
public boolean supports(StringBuilder string) { public boolean supports(StringBuilder string) {
SortedSet<StringBuilder> t = this.dict.tailSet(string); SortedSet<StringBuilder> t = this.dict.tailSet(string);
for (final StringBuilder r: t) { for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break; if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) {
return true;
} else {
break;
}
} }
string = reverse(string); string = reverse(string);
t = this.tcid.tailSet(string); t = this.tcid.tailSet(string);
for (final StringBuilder r: t) { for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break; if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) {
return true;
} else {
break;
}
} }
return false; return false;
} }
@ -157,8 +177,16 @@ public class WordCache {
return this.dict.size() >= minimumWords; return this.dict.size() >= minimumWords;
} }
/**
 * Returns the words of this dictionary.
 * The result is a read-only view of the internal word set: it cannot be
 * used to add or remove words, which keeps the word set and the set of
 * reversed words ({@code tcid}) consistent with each other. Iteration
 * follows the order of the underlying sorted set.
 *
 * @return an unmodifiable view of all words in this dictionary
 */
public Set<StringBuilder> getWords() {
    // do not hand out the mutable TreeSet itself; callers only need to read it
    return java.util.Collections.unmodifiableSet(this.dict);
}
} }
/**
 * Returns the dictionaries held by this word cache.
 * Keys are the names under which {@code reload()} registers a dictionary,
 * i.e. the file name without its ".words" suffix. The result is a
 * read-only view of the internal map: entries cannot be added or removed
 * through it, but dictionaries registered later remain visible.
 *
 * @return an unmodifiable view of the name-to-dictionary mapping
 */
public Map<String, Dictionary> getDictionaries() {
    // expose a view instead of the internal map so callers cannot modify the cache
    return java.util.Collections.unmodifiableMap(this.dictionaries);
}
/** /**
* create a new dictionary * create a new dictionary
* This loads all files that end with '.words' * This loads all files that end with '.words'
@ -177,9 +205,15 @@ public class WordCache {
* @param word * @param word
*/ */
public static void learn(final StringBuilder word) { public static void learn(final StringBuilder word) {
if (word == null) return; if (word == null) {
if (word.length() < commonWordsMinLength) return; return;
if (MemoryControl.shortStatus()) commonWords.clear(); }
if (word.length() < commonWordsMinLength) {
return;
}
if (MemoryControl.shortStatus()) {
commonWords.clear();
}
commonWords.inc(word); commonWords.inc(word);
if (!(commonWords.sizeSmaller(commonWordsMaxSize))) { if (!(commonWords.sizeSmaller(commonWordsMaxSize))) {
commonWords.shrinkToMaxSize(commonWordsMaxSize / 2); commonWords.shrinkToMaxSize(commonWordsMaxSize / 2);
@ -190,14 +224,18 @@ public class WordCache {
* scan the input directory and load all dictionaries (again) * scan the input directory and load all dictionaries (again)
*/ */
public void reload() { public void reload() {
if (this.dictionaryPath == null || !this.dictionaryPath.exists()) return; if (this.dictionaryPath == null || !this.dictionaryPath.exists()) {
return;
}
final String[] files = this.dictionaryPath.list(); final String[] files = this.dictionaryPath.list();
for (final String f: files) { for (final String f: files) {
if (f.endsWith(".words")) try { if (f.endsWith(".words")) {
Dictionary dict = new Dictionary(new File(this.dictionaryPath, f)); try {
this.dictionaries.put(f.substring(0, f.length() - 6), dict); Dictionary dict = new Dictionary(new File(this.dictionaryPath, f));
} catch (final IOException e) { this.dictionaries.put(f.substring(0, f.length() - 6), dict);
Log.logException(e); } catch (final IOException e) {
Log.logException(e);
}
} }
} }
} }
@ -226,7 +264,11 @@ public class WordCache {
try { try {
for (final Map.Entry<StringBuilder, AtomicInteger> v: u.entrySet()) { for (final Map.Entry<StringBuilder, AtomicInteger> v: u.entrySet()) {
vv = v.getKey(); vv = v.getKey();
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(vv, string) && vv.length() > string.length()) ret.add(vv); else break; if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(vv, string) && vv.length() > string.length()) {
ret.add(vv);
} else {
break;
}
} }
} catch (final ConcurrentModificationException e) {} } catch (final ConcurrentModificationException e) {}
return ret; return ret;
@ -239,7 +281,9 @@ public class WordCache {
*/ */
public boolean contains(final StringBuilder s) { public boolean contains(final StringBuilder s) {
for (Dictionary dict: this.dictionaries.values()) { for (Dictionary dict: this.dictionaries.values()) {
if (dict.contains(s)) return true; if (dict.contains(s)) {
return true;
}
} }
return false; return false;
} }
@ -253,7 +297,9 @@ public class WordCache {
*/ */
public boolean supports(StringBuilder string) { public boolean supports(StringBuilder string) {
for (Dictionary dict: this.dictionaries.values()) { for (Dictionary dict: this.dictionaries.values()) {
if (dict.supports(string)) return true; if (dict.supports(string)) {
return true;
}
} }
return false; return false;
} }
@ -280,7 +326,9 @@ public class WordCache {
*/ */
public boolean isRelevant(final int minimumWords) { public boolean isRelevant(final int minimumWords) {
for (Dictionary dict: this.dictionaries.values()) { for (Dictionary dict: this.dictionaries.values()) {
if (dict.isRelevant(minimumWords)) return true; if (dict.isRelevant(minimumWords)) {
return true;
}
} }
return false; return false;
} }

Loading…
Cancel
Save