moved Vocabulary to cora package (added in git 964406ad17)
pull/1/head
Michael Peter Christen 13 years ago
parent 0284a4d88f
commit 514700291a

@ -23,19 +23,17 @@ package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.SimpleVocabulary;
import net.yacy.document.WordCache.Dictionary;
import net.yacy.document.geolocalization.Localization;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
/**
* Autotagging provides a set of tag/print-name properties which can be used to
@ -48,11 +46,11 @@ public class Autotagging {
public final char prefixChar;
private final File autotaggingPath;
private final Map<String, Vocabulary> vocabularies;
private final Map<String, SimpleVocabulary> vocabularies;
private final Map<String, Object> allTags;
public Autotagging(final File autotaggingPath, char prefixChar) {
this.vocabularies = new ConcurrentHashMap<String, Vocabulary>();
this.vocabularies = new ConcurrentHashMap<String, SimpleVocabulary>();
this.autotaggingPath = autotaggingPath;
this.prefixChar = prefixChar;
this.allTags = new ConcurrentHashMap<String, Object>();
@ -81,7 +79,7 @@ public class Autotagging {
File ff = new File(this.autotaggingPath, f);
String vocName = ff.getName();
vocName = vocName.substring(0, vocName.length() - 11);
Vocabulary voc = new Vocabulary(vocName, ff);
SimpleVocabulary voc = new SimpleVocabulary(vocName, ff);
this.vocabularies.put(vocName, voc);
for (String t: voc.tags()) {
this.allTags.put(t, PRESENT);
@ -93,7 +91,7 @@ public class Autotagging {
}
}
public Collection<Vocabulary> getVocabularies() {
public Collection<SimpleVocabulary> getVocabularies() {
return this.vocabularies.values();
}
@ -103,7 +101,7 @@ public class Autotagging {
public void addDictionaries(Map<String, Dictionary> dictionaries) {
for (Map.Entry<String, Dictionary> entry: dictionaries.entrySet()) {
Vocabulary voc = new Vocabulary(entry.getKey(), entry.getValue());
SimpleVocabulary voc = new SimpleVocabulary(entry.getKey(), entry.getValue());
this.vocabularies.put(entry.getKey(), voc);
for (String t: voc.tags()) {
this.allTags.put(t, PRESENT);
@ -112,7 +110,7 @@ public class Autotagging {
}
public void addLocalization(Localization localization) {
Vocabulary voc = new Vocabulary("Locale", localization);
SimpleVocabulary voc = new SimpleVocabulary("Locale", localization);
this.vocabularies.put("Locale", voc);
for (String t: voc.tags()) {
this.allTags.put(t, PRESENT);
@ -139,209 +137,38 @@ public class Autotagging {
public String getPrintTagFromWord(String word) {
if (this.vocabularies.isEmpty()) return null;
Metatag tag;
word = normalizeWord(word);
for (Map.Entry<String, Vocabulary> v: this.vocabularies.entrySet()) {
tag = v.getValue().getMetatag(word);
SimpleVocabulary.Metatag tag;
word = SimpleVocabulary.normalizeWord(word);
for (Map.Entry<String, SimpleVocabulary> v: this.vocabularies.entrySet()) {
tag = v.getValue().getMetatag(this.prefixChar, word);
if (tag != null) return tag.toString();
}
return null;
}
/**
 * A named vocabulary that maps normalized tags to print names and back.
 * Both directions are kept in concurrent maps; a vocabulary can be filled
 * from a property-like file, from a Localization or from a Dictionary.
 */
public class Vocabulary {

    final String navigatorName;
    final Map<String, String> tag2print, print2tag;

    /**
     * Creates an empty vocabulary.
     *
     * @param name the name reported by {@link #getName()} and used when building metatags
     */
    public Vocabulary(String name) {
        this.navigatorName = name;
        this.tag2print = new ConcurrentHashMap<String, String>();
        this.print2tag = new ConcurrentHashMap<String, String>();
    }

    /**
     * Creates a vocabulary from a file. Empty lines and lines starting with '#'
     * are ignored. A line is split at the first ':' if there is one, otherwise
     * at the first '=', otherwise at the first tab; the left part is the print
     * name, the right part a comma-separated list of tags. A line without any
     * separator is used as print name and as tag at once.
     *
     * @param name     the vocabulary name
     * @param propFile the file to read
     * @throws IOException declared for reading the file
     */
    public Vocabulary(String name, File propFile) throws IOException {
        this(name);
        for (String rawLine : FileUtils.getListArray(propFile)) {
            final String line = rawLine.trim();
            if (line.length() == 0 || line.charAt(0) == '#') {
                continue;
            }

            // separator precedence: ':' first, then '=', then tab
            int sep = line.indexOf(':');
            if (sep < 0) sep = line.indexOf('=');
            if (sep < 0) sep = line.indexOf('\t');

            if (sep < 0) {
                // no separator: the line itself is the term
                final String printName = normalizeKey(line);
                link(normalizeWord(line), printName);
                continue;
            }

            final String printName = normalizeKey(line.substring(0, sep));
            for (String candidate : line.substring(sep + 1).split(",")) {
                final String tag = normalizeWord(candidate);
                if (tag.length() > 0) {
                    link(tag, printName);
                }
            }
            // registered last, so the normalized print name is the tag that
            // print2tag finally holds for this print name
            link(normalizeWord(printName), printName);
        }
    }

    /**
     * Creates a vocabulary from the location names of a localization; the
     * lower-cased name is the tag, the name itself the print name.
     */
    public Vocabulary(String name, Localization localization) {
        this(name);
        for (String location : localization.locationNames()) {
            link(location.toLowerCase(), location);
        }
    }

    /**
     * Creates a vocabulary from the words of a dictionary; the lower-cased
     * word is the tag, the word itself the print name.
     */
    public Vocabulary(String name, Dictionary dictionary) {
        this(name);
        for (StringBuilder entry : dictionary.getWords()) {
            final String printName = entry.toString();
            link(printName.toLowerCase(), printName);
        }
    }

    /** Stores the pair in both directions, overwriting earlier entries with the same key. */
    private void link(String tag, String printName) {
        this.tag2print.put(tag, printName);
        this.print2tag.put(printName, tag);
    }

    /** Trims a print name and replaces '+' and '/', which are bad in a query attribute. */
    private String normalizeKey(String k) {
        return k.trim()
                .replaceAll(" \\+", ", ") // remove symbols that are bad in a query attribute
                .replaceAll(" /", ", ")
                .replaceAll("\\+", ",")
                .replaceAll("/", ",")
                // NOTE(review): as written this swaps one space for one space;
                // presumably meant to collapse double spaces - confirm
                .replaceAll(" ", " ");
    }

    /** @return the name this vocabulary was created with */
    public String getName() {
        return this.navigatorName;
    }

    /**
     * Looks up the print name of a tag.
     *
     * @param word the tag to look up
     * @return a metatag of this vocabulary's name and the print name, or null if the tag is unknown
     */
    public Metatag getMetatag(final String word) {
        final String printname = this.tag2print.get(word);
        return printname == null ? null : metatag(this.navigatorName, printname);
    }

    /** @return the key set of the tag-to-print-name map */
    public Set<String> tags() {
        return this.tag2print.keySet();
    }

    @Override
    public String toString() {
        return this.print2tag.toString();
    }
}
// Precompiled single-character patterns. The first four match the German
// lower-case umlauts and the sharp s; normalizeWord replaces them with
// "ae", "oe", "ue" and "ss" for better matching.
private final static Pattern PATTERN_AE = Pattern.compile("\u00E4"); // a-umlaut
private final static Pattern PATTERN_OE = Pattern.compile("\u00F6"); // o-umlaut
private final static Pattern PATTERN_UE = Pattern.compile("\u00FC"); // u-umlaut
private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF"); // sharp s
private final static Pattern PATTERN_UL = Pattern.compile("_"); // a single underscore
private final static Pattern PATTERN_SP = Pattern.compile(" "); // a single space
/**
 * Normalizes a word for tag matching: trims it, lower-cases it and replaces
 * the German umlauts and the sharp s with "ae", "oe", "ue" and "ss".
 *
 * @param word the word to normalize
 * @return the normalized word
 */
private static final String normalizeWord(String word) {
    final String lowered = word.trim().toLowerCase();
    final String withoutAe = PATTERN_AE.matcher(lowered).replaceAll("ae");
    final String withoutOe = PATTERN_OE.matcher(withoutAe).replaceAll("oe");
    final String withoutUe = PATTERN_UE.matcher(withoutOe).replaceAll("ue");
    return PATTERN_SZ.matcher(withoutUe).replaceAll("ss");
}
/**
 * A tag that couples a vocabulary name with a print name. Its string form is
 * the prefix character of the enclosing Autotagging instance, the vocabulary
 * name, a colon and the print name with spaces encoded as underscores.
 */
public class Metatag {

    private final String vocName;
    private final String print;

    /**
     * @param vocName the vocabulary name
     * @param print   the print name in its readable (decoded) form
     */
    public Metatag(String vocName, String print) {
        this.vocName = vocName;
        this.print = print;
    }

    /**
     * Parses the string form produced by {@link #toString()}.
     *
     * @param metatag prefix character + vocabulary name + ':' + encoded print name
     * @throws RuntimeException if no colon follows the prefix character
     */
    public Metatag(String metatag) throws RuntimeException {
        assert metatag.charAt(0) == Autotagging.this.prefixChar;
        int p = metatag.indexOf(':');
        // p == 0 leaves no room for the prefix character; report it as a bad
        // metatag instead of letting substring(1, 0) fail with an index error
        if (p < 1) throw new RuntimeException("bad metatag: metatag = " + metatag);
        this.vocName = metatag.substring(1, p);
        this.print = decodeMaskname(metatag.substring(p + 1));
    }

    /** @return the vocabulary name */
    public String getVocabularyName() {
        return this.vocName;
    }

    /** @return the print name in decoded form */
    public String getPrintName() {
        return this.print;
    }

    @Override
    public String toString() {
        return Autotagging.this.prefixChar + this.vocName + ":" + encodePrintname(this.print);
    }

    /**
     * Two metatags are equal if vocabulary name and print name are equal.
     * Null and objects of other types are never equal to a metatag.
     */
    @Override
    public boolean equals(Object m) {
        if (this == m) return true;
        // guard the cast: equals must answer false, not throw, for null or foreign types
        if (!(m instanceof Metatag)) return false;
        Metatag m0 = (Metatag) m;
        return this.vocName.equals(m0.vocName) && this.print.equals(m0.print);
    }

    @Override
    public int hashCode() {
        return this.vocName.hashCode() + this.print.hashCode();
    }
}
/**
 * Encodes a print name by replacing every space with an underscore.
 *
 * @param printname the print name to encode; must not be null
 * @return the print name with each ' ' replaced by '_'
 */
public static final String encodePrintname(String printname) {
    // a one-character substitution does not need the regex engine
    return printname.replace(' ', '_');
}
/**
 * Decodes an encoded print name by replacing every underscore with a space.
 *
 * @param maskname the encoded name; must not be null
 * @return the name with each '_' replaced by ' '
 */
public static final String decodeMaskname(String maskname) {
    // a one-character substitution does not need the regex engine
    return maskname.replace('_', ' ');
}
/**
 * Creates a metatag from its two parts.
 *
 * @param vocName the vocabulary name
 * @param print   the print name
 * @return a new Metatag for the given parts
 */
public Metatag metatag(String vocName, String print) {
    final Metatag created = new Metatag(vocName, print);
    return created;
}
/**
 * Creates a metatag by parsing its string form.
 *
 * @param metatag the string form to parse
 * @return a new Metatag built from the string
 * @throws RuntimeException if the Metatag constructor rejects the string
 */
public Metatag metatag(String metatag) throws RuntimeException {
    final Metatag parsed = new Metatag(metatag);
    return parsed;
}
public static boolean metatagAppearIn(final Metatag metatag, final String[] tags) {
public static boolean metatagAppearIn(final SimpleVocabulary.Metatag metatag, final String[] tags) {
String tag = metatag.toString();
for (String s: tags) {
if (tag.equals(s)) return true;
}
return false;
}
public String cleanTagFromAutotagging(final String tagString) {
if (tagString == null || tagString.length() == 0) return "";
String[] tags = PATTERN_SP.split(tagString);
StringBuilder sb = new StringBuilder(tagString.length());
for (String tag : tags) {
if (tag.length() > 0 && tag.charAt(0) != this.prefixChar) {
sb.append(tag).append(' ');
}
}
if (sb.length() == 0) return "";
return sb.substring(0, sb.length() - 1);
/**
 * Creates a metatag from its two parts, using this instance's prefix character.
 *
 * @param vocName the vocabulary name
 * @param print   the print name
 * @return a new SimpleVocabulary.Metatag for the given parts
 */
public SimpleVocabulary.Metatag metatag(String vocName, String print) {
    final SimpleVocabulary.Metatag created = new SimpleVocabulary.Metatag(this.prefixChar, vocName, print);
    return created;
}
/**
 * Creates a metatag from its string form, using this instance's prefix character.
 *
 * @param metatag the string form handed to the SimpleVocabulary.Metatag constructor
 * @return a new SimpleVocabulary.Metatag built from the string
 */
public SimpleVocabulary.Metatag metatag(String metatag) {
    final SimpleVocabulary.Metatag parsed = new SimpleVocabulary.Metatag(this.prefixChar, metatag);
    return parsed;
}
/**
 * Delegates to SimpleVocabulary.Metatag.cleanTagFromAutotagging, passing this
 * instance's prefix character along with the tag string.
 *
 * @param tagString the tag string to clean
 * @return whatever the delegate returns for the given tag string
 */
public String cleanTagFromAutotagging(String tagString) {
    final String cleaned = SimpleVocabulary.Metatag.cleanTagFromAutotagging(this.prefixChar, tagString);
    return cleaned;
}
public static void main(String[] args) {
Autotagging a = new Autotagging(new File("DATA/DICTIONARIES/" + LibraryProvider.path_to_autotagging_dictionaries), '$');
for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) {
for (Map.Entry<String, SimpleVocabulary> entry: a.vocabularies.entrySet()) {
System.out.println(entry);
}
Set<String> tags = a.getPrintTagsFromText("In die Tueren und Fluchttueren muessen noch Schloesser eingebaut werden");

Loading…
Cancel
Save