|
|
|
@ -23,19 +23,17 @@ package net.yacy.document;
|
|
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.Collection;
|
|
|
|
|
import java.util.HashSet;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.Set;
|
|
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
|
|
|
|
|
|
import net.yacy.cora.document.UTF8;
|
|
|
|
|
import net.yacy.cora.lod.SimpleVocabulary;
|
|
|
|
|
import net.yacy.document.WordCache.Dictionary;
|
|
|
|
|
import net.yacy.document.geolocalization.Localization;
|
|
|
|
|
import net.yacy.kelondro.logging.Log;
|
|
|
|
|
import net.yacy.kelondro.util.FileUtils;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Autotagging provides a set of tag/print-name properties which can be used to
|
|
|
|
@ -48,11 +46,11 @@ public class Autotagging {
|
|
|
|
|
|
|
|
|
|
public final char prefixChar;
|
|
|
|
|
private final File autotaggingPath;
|
|
|
|
|
private final Map<String, Vocabulary> vocabularies;
|
|
|
|
|
private final Map<String, SimpleVocabulary> vocabularies;
|
|
|
|
|
private final Map<String, Object> allTags;
|
|
|
|
|
|
|
|
|
|
public Autotagging(final File autotaggingPath, char prefixChar) {
|
|
|
|
|
this.vocabularies = new ConcurrentHashMap<String, Vocabulary>();
|
|
|
|
|
this.vocabularies = new ConcurrentHashMap<String, SimpleVocabulary>();
|
|
|
|
|
this.autotaggingPath = autotaggingPath;
|
|
|
|
|
this.prefixChar = prefixChar;
|
|
|
|
|
this.allTags = new ConcurrentHashMap<String, Object>();
|
|
|
|
@ -81,7 +79,7 @@ public class Autotagging {
|
|
|
|
|
File ff = new File(this.autotaggingPath, f);
|
|
|
|
|
String vocName = ff.getName();
|
|
|
|
|
vocName = vocName.substring(0, vocName.length() - 11);
|
|
|
|
|
Vocabulary voc = new Vocabulary(vocName, ff);
|
|
|
|
|
SimpleVocabulary voc = new SimpleVocabulary(vocName, ff);
|
|
|
|
|
this.vocabularies.put(vocName, voc);
|
|
|
|
|
for (String t: voc.tags()) {
|
|
|
|
|
this.allTags.put(t, PRESENT);
|
|
|
|
@ -93,7 +91,7 @@ public class Autotagging {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Collection<Vocabulary> getVocabularies() {
|
|
|
|
|
public Collection<SimpleVocabulary> getVocabularies() {
|
|
|
|
|
return this.vocabularies.values();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -103,7 +101,7 @@ public class Autotagging {
|
|
|
|
|
|
|
|
|
|
public void addDictionaries(Map<String, Dictionary> dictionaries) {
|
|
|
|
|
for (Map.Entry<String, Dictionary> entry: dictionaries.entrySet()) {
|
|
|
|
|
Vocabulary voc = new Vocabulary(entry.getKey(), entry.getValue());
|
|
|
|
|
SimpleVocabulary voc = new SimpleVocabulary(entry.getKey(), entry.getValue());
|
|
|
|
|
this.vocabularies.put(entry.getKey(), voc);
|
|
|
|
|
for (String t: voc.tags()) {
|
|
|
|
|
this.allTags.put(t, PRESENT);
|
|
|
|
@ -112,7 +110,7 @@ public class Autotagging {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void addLocalization(Localization localization) {
|
|
|
|
|
Vocabulary voc = new Vocabulary("Locale", localization);
|
|
|
|
|
SimpleVocabulary voc = new SimpleVocabulary("Locale", localization);
|
|
|
|
|
this.vocabularies.put("Locale", voc);
|
|
|
|
|
for (String t: voc.tags()) {
|
|
|
|
|
this.allTags.put(t, PRESENT);
|
|
|
|
@ -139,209 +137,38 @@ public class Autotagging {
|
|
|
|
|
|
|
|
|
|
public String getPrintTagFromWord(String word) {
|
|
|
|
|
if (this.vocabularies.isEmpty()) return null;
|
|
|
|
|
Metatag tag;
|
|
|
|
|
word = normalizeWord(word);
|
|
|
|
|
for (Map.Entry<String, Vocabulary> v: this.vocabularies.entrySet()) {
|
|
|
|
|
tag = v.getValue().getMetatag(word);
|
|
|
|
|
SimpleVocabulary.Metatag tag;
|
|
|
|
|
word = SimpleVocabulary.normalizeWord(word);
|
|
|
|
|
for (Map.Entry<String, SimpleVocabulary> v: this.vocabularies.entrySet()) {
|
|
|
|
|
tag = v.getValue().getMetatag(this.prefixChar, word);
|
|
|
|
|
if (tag != null) return tag.toString();
|
|
|
|
|
}
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public class Vocabulary {
|
|
|
|
|
|
|
|
|
|
final String navigatorName;
|
|
|
|
|
final Map<String, String> tag2print, print2tag;
|
|
|
|
|
|
|
|
|
|
public Vocabulary(String name) {
|
|
|
|
|
this.navigatorName = name;
|
|
|
|
|
this.tag2print = new ConcurrentHashMap<String, String>();
|
|
|
|
|
this.print2tag = new ConcurrentHashMap<String, String>();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Vocabulary(String name, File propFile) throws IOException {
|
|
|
|
|
this(name);
|
|
|
|
|
ArrayList<String> list = FileUtils.getListArray(propFile);
|
|
|
|
|
String k, kn, v;
|
|
|
|
|
String[] tags;
|
|
|
|
|
int p;
|
|
|
|
|
vocloop: for (String line: list) {
|
|
|
|
|
line = line.trim();
|
|
|
|
|
if (line.length() == 0 || line.charAt(0) == '#') {
|
|
|
|
|
continue vocloop;
|
|
|
|
|
}
|
|
|
|
|
p = line.indexOf(':');
|
|
|
|
|
if (p < 0) {
|
|
|
|
|
p = line.indexOf('=');
|
|
|
|
|
}
|
|
|
|
|
if (p < 0) {
|
|
|
|
|
p = line.indexOf('\t');
|
|
|
|
|
}
|
|
|
|
|
if (p < 0) {
|
|
|
|
|
k = normalizeKey(line);
|
|
|
|
|
v = normalizeWord(line);
|
|
|
|
|
this.tag2print.put(v, k);
|
|
|
|
|
this.print2tag.put(k, v);
|
|
|
|
|
continue vocloop;
|
|
|
|
|
}
|
|
|
|
|
k = normalizeKey(line.substring(0, p));
|
|
|
|
|
v = line.substring(p + 1);
|
|
|
|
|
tags = v.split(",");
|
|
|
|
|
tagloop: for (String t: tags) {
|
|
|
|
|
t = normalizeWord(t);
|
|
|
|
|
if (t.length() == 0) {
|
|
|
|
|
continue tagloop;
|
|
|
|
|
}
|
|
|
|
|
this.tag2print.put(t, k);
|
|
|
|
|
this.print2tag.put(k, t);
|
|
|
|
|
}
|
|
|
|
|
kn = normalizeWord(k);
|
|
|
|
|
this.tag2print.put(kn, k);
|
|
|
|
|
this.print2tag.put(k, kn);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private final String normalizeKey(String k) {
|
|
|
|
|
k = k.trim();
|
|
|
|
|
k = k.replaceAll(" \\+", ", "); // remove symbols that are bad in a query attribute
|
|
|
|
|
k = k.replaceAll(" /", ", ");
|
|
|
|
|
k = k.replaceAll("\\+", ",");
|
|
|
|
|
k = k.replaceAll("/", ",");
|
|
|
|
|
k = k.replaceAll(" ", " ");
|
|
|
|
|
return k;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Vocabulary(String name, Localization localization) {
|
|
|
|
|
this(name);
|
|
|
|
|
Set<String> locNames = localization.locationNames();
|
|
|
|
|
for (String loc: locNames) {
|
|
|
|
|
this.tag2print.put(loc.toLowerCase(), loc);
|
|
|
|
|
this.print2tag.put(loc, loc.toLowerCase());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Vocabulary(String name, Dictionary dictionary) {
|
|
|
|
|
this(name);
|
|
|
|
|
Set<StringBuilder> words = dictionary.getWords();
|
|
|
|
|
String s;
|
|
|
|
|
for (StringBuilder word: words) {
|
|
|
|
|
s = word.toString();
|
|
|
|
|
this.tag2print.put(s.toLowerCase(), s);
|
|
|
|
|
this.print2tag.put(s, s.toLowerCase());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getName() {
|
|
|
|
|
return this.navigatorName;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Metatag getMetatag(final String word) {
|
|
|
|
|
String printname = this.tag2print.get(word);
|
|
|
|
|
if (printname == null) return null;
|
|
|
|
|
return metatag(this.navigatorName, printname);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Set<String> tags() {
|
|
|
|
|
return this.tag2print.keySet();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String toString() {
|
|
|
|
|
return this.print2tag.toString();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Pre-compiled patterns shared by the normalization and encoding helpers of this class.
// PATTERN_AE matches the lower-case a-umlaut (U+00E4); normalizeWord replaces it with "ae"
// (German umlaut transliteration for better matching).
private final static Pattern PATTERN_AE = Pattern.compile("\u00E4"); // german umlaut hack for better matching
|
|
|
|
|
// matches the lower-case o-umlaut (U+00F6); normalizeWord replaces it with "oe"
private final static Pattern PATTERN_OE = Pattern.compile("\u00F6");
|
|
|
|
|
// matches the lower-case u-umlaut (U+00FC); normalizeWord replaces it with "ue"
private final static Pattern PATTERN_UE = Pattern.compile("\u00FC");
|
|
|
|
|
// matches the sharp s (U+00DF); normalizeWord replaces it with "ss"
private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF");
|
|
|
|
|
// matches an underscore; decodeMaskname replaces it with a space
private final static Pattern PATTERN_UL = Pattern.compile("_");
|
|
|
|
|
// matches a single space; encodePrintname replaces it with an underscore
private final static Pattern PATTERN_SP = Pattern.compile(" ");
|
|
|
|
|
|
|
|
|
|
private static final String normalizeWord(String word) {
|
|
|
|
|
word = word.trim().toLowerCase();
|
|
|
|
|
word = PATTERN_AE.matcher(word).replaceAll("ae");
|
|
|
|
|
word = PATTERN_OE.matcher(word).replaceAll("oe");
|
|
|
|
|
word = PATTERN_UE.matcher(word).replaceAll("ue");
|
|
|
|
|
word = PATTERN_SZ.matcher(word).replaceAll("ss");
|
|
|
|
|
return word;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public class Metatag {
|
|
|
|
|
private final String vocName;
|
|
|
|
|
private final String print;
|
|
|
|
|
public Metatag(String vocName, String print) {
|
|
|
|
|
this.vocName = vocName;
|
|
|
|
|
this.print = print;
|
|
|
|
|
}
|
|
|
|
|
public Metatag(String metatag) throws RuntimeException {
|
|
|
|
|
assert metatag.charAt(0) == Autotagging.this.prefixChar;
|
|
|
|
|
int p = metatag.indexOf(':');
|
|
|
|
|
if (p < 0) throw new RuntimeException("bad metatag: metatag = " + metatag);
|
|
|
|
|
this.vocName = metatag.substring(1, p);
|
|
|
|
|
this.print = decodeMaskname(metatag.substring(p + 1));
|
|
|
|
|
}
|
|
|
|
|
public String getVocabularyName() {
|
|
|
|
|
return this.vocName;
|
|
|
|
|
}
|
|
|
|
|
public String getPrintName() {
|
|
|
|
|
return this.print;
|
|
|
|
|
}
|
|
|
|
|
@Override
|
|
|
|
|
public String toString() {
|
|
|
|
|
return Autotagging.this.prefixChar + this.vocName + ":" + encodePrintname(this.print);
|
|
|
|
|
}
|
|
|
|
|
@Override
|
|
|
|
|
public boolean equals(Object m) {
|
|
|
|
|
Metatag m0 = (Metatag) m;
|
|
|
|
|
return this.vocName.equals(m0.vocName) && this.print.equals(m0.print);
|
|
|
|
|
}
|
|
|
|
|
@Override
|
|
|
|
|
public int hashCode() {
|
|
|
|
|
return this.vocName.hashCode() + this.print.hashCode();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static final String encodePrintname(String printname) {
|
|
|
|
|
return PATTERN_SP.matcher(printname).replaceAll("_");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static final String decodeMaskname(String maskname) {
|
|
|
|
|
return PATTERN_UL.matcher(maskname).replaceAll(" ");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Metatag metatag(String vocName, String print) {
|
|
|
|
|
return new Metatag(vocName, print);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Metatag metatag(String metatag) throws RuntimeException {
|
|
|
|
|
return new Metatag(metatag);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static boolean metatagAppearIn(final Metatag metatag, final String[] tags) {
|
|
|
|
|
public static boolean metatagAppearIn(final SimpleVocabulary.Metatag metatag, final String[] tags) {
|
|
|
|
|
String tag = metatag.toString();
|
|
|
|
|
for (String s: tags) {
|
|
|
|
|
if (tag.equals(s)) return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String cleanTagFromAutotagging(final String tagString) {
|
|
|
|
|
if (tagString == null || tagString.length() == 0) return "";
|
|
|
|
|
String[] tags = PATTERN_SP.split(tagString);
|
|
|
|
|
StringBuilder sb = new StringBuilder(tagString.length());
|
|
|
|
|
for (String tag : tags) {
|
|
|
|
|
if (tag.length() > 0 && tag.charAt(0) != this.prefixChar) {
|
|
|
|
|
sb.append(tag).append(' ');
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (sb.length() == 0) return "";
|
|
|
|
|
return sb.substring(0, sb.length() - 1);
|
|
|
|
|
|
|
|
|
|
public SimpleVocabulary.Metatag metatag(String vocName, String print) {
|
|
|
|
|
return new SimpleVocabulary.Metatag(this.prefixChar, vocName, print);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public SimpleVocabulary.Metatag metatag(String metatag) {
|
|
|
|
|
return new SimpleVocabulary.Metatag(this.prefixChar, metatag);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String cleanTagFromAutotagging(String tagString) {
|
|
|
|
|
return SimpleVocabulary.Metatag.cleanTagFromAutotagging(this.prefixChar, tagString);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
|
|
Autotagging a = new Autotagging(new File("DATA/DICTIONARIES/" + LibraryProvider.path_to_autotagging_dictionaries), '$');
|
|
|
|
|
for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) {
|
|
|
|
|
for (Map.Entry<String, SimpleVocabulary> entry: a.vocabularies.entrySet()) {
|
|
|
|
|
System.out.println(entry);
|
|
|
|
|
}
|
|
|
|
|
Set<String> tags = a.getPrintTagsFromText("In die Tueren und Fluchttueren muessen noch Schloesser eingebaut werden");
|
|
|
|
|