From 514700291aa4835192b722793d32ad63928df232 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 4 Jun 2012 23:41:36 +0200 Subject: [PATCH] moved Vocabulary to cora package (added in git 964406ad17a461322ab3478e9e832c302a9e9dc5) --- source/net/yacy/document/Autotagging.java | 221 +++------------------- 1 file changed, 24 insertions(+), 197 deletions(-) diff --git a/source/net/yacy/document/Autotagging.java b/source/net/yacy/document/Autotagging.java index 4818d06e2..56648e4d5 100644 --- a/source/net/yacy/document/Autotagging.java +++ b/source/net/yacy/document/Autotagging.java @@ -23,19 +23,17 @@ package net.yacy.document; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import java.util.regex.Pattern; import net.yacy.cora.document.UTF8; +import net.yacy.cora.lod.SimpleVocabulary; import net.yacy.document.WordCache.Dictionary; import net.yacy.document.geolocalization.Localization; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; /** * Autotagging provides a set of tag/print-name properties which can be used to @@ -48,11 +46,11 @@ public class Autotagging { public final char prefixChar; private final File autotaggingPath; - private final Map vocabularies; + private final Map vocabularies; private final Map allTags; public Autotagging(final File autotaggingPath, char prefixChar) { - this.vocabularies = new ConcurrentHashMap(); + this.vocabularies = new ConcurrentHashMap(); this.autotaggingPath = autotaggingPath; this.prefixChar = prefixChar; this.allTags = new ConcurrentHashMap(); @@ -81,7 +79,7 @@ public class Autotagging { File ff = new File(this.autotaggingPath, f); String vocName = ff.getName(); vocName = vocName.substring(0, vocName.length() - 11); - Vocabulary voc = new Vocabulary(vocName, ff); + SimpleVocabulary voc = new SimpleVocabulary(vocName, ff); this.vocabularies.put(vocName, voc); for (String t: voc.tags()) { this.allTags.put(t, PRESENT); @@ -93,7 +91,7 @@ public class Autotagging { } } - public Collection getVocabularies() { + public Collection getVocabularies() { return this.vocabularies.values(); } @@ -103,7 +101,7 @@ public class Autotagging { public void addDictionaries(Map dictionaries) { for (Map.Entry entry: dictionaries.entrySet()) { - Vocabulary voc = new Vocabulary(entry.getKey(), entry.getValue()); + SimpleVocabulary voc = new SimpleVocabulary(entry.getKey(), entry.getValue()); this.vocabularies.put(entry.getKey(), voc); for (String t: voc.tags()) { this.allTags.put(t, PRESENT); @@ -112,7 +110,7 @@ public class Autotagging { } public void addLocalization(Localization localization) { - Vocabulary voc = new Vocabulary("Locale", localization); + SimpleVocabulary voc = new SimpleVocabulary("Locale", localization); this.vocabularies.put("Locale", voc); for (String t: voc.tags()) { this.allTags.put(t, PRESENT); @@ -139,209 +137,38 @@ public class Autotagging { public String getPrintTagFromWord(String word) { if (this.vocabularies.isEmpty()) return null; - Metatag tag; - word = normalizeWord(word); - for (Map.Entry v: this.vocabularies.entrySet()) { - tag = v.getValue().getMetatag(word); + SimpleVocabulary.Metatag tag; + word = SimpleVocabulary.normalizeWord(word); + for (Map.Entry v: this.vocabularies.entrySet()) { + tag = v.getValue().getMetatag(this.prefixChar, word); if (tag != null) return tag.toString(); } return null; } - public class Vocabulary { - - final String navigatorName; - final Map tag2print, print2tag; - - public Vocabulary(String name) { - this.navigatorName = name; - this.tag2print = new ConcurrentHashMap(); - this.print2tag = new ConcurrentHashMap(); - } - - public Vocabulary(String name, File propFile) throws IOException { - this(name); - ArrayList list = FileUtils.getListArray(propFile); - String k, kn, v; - String[] tags; - int p; - vocloop: for (String line: list) { - line = line.trim(); - if (line.length() == 0 || line.charAt(0) == '#') { - continue vocloop; - } - p = line.indexOf(':'); - if (p < 0) { - p = line.indexOf('='); - } - if (p < 0) { - p = line.indexOf('\t'); - } - if (p < 0) { - k = normalizeKey(line); - v = normalizeWord(line); - this.tag2print.put(v, k); - this.print2tag.put(k, v); - continue vocloop; - } - k = normalizeKey(line.substring(0, p)); - v = line.substring(p + 1); - tags = v.split(","); - tagloop: for (String t: tags) { - t = normalizeWord(t); - if (t.length() == 0) { - continue tagloop; - } - this.tag2print.put(t, k); - this.print2tag.put(k, t); - } - kn = normalizeWord(k); - this.tag2print.put(kn, k); - this.print2tag.put(k, kn); - } - } - - private final String normalizeKey(String k) { - k = k.trim(); - k = k.replaceAll(" \\+", ", "); // remove symbols that are bad in a query attribute - k = k.replaceAll(" /", ", "); - k = k.replaceAll("\\+", ","); - k = k.replaceAll("/", ","); - k = k.replaceAll(" ", " "); - return k; - } - - public Vocabulary(String name, Localization localization) { - this(name); - Set locNames = localization.locationNames(); - for (String loc: locNames) { - this.tag2print.put(loc.toLowerCase(), loc); - this.print2tag.put(loc, loc.toLowerCase()); - } - } - - public Vocabulary(String name, Dictionary dictionary) { - this(name); - Set words = dictionary.getWords(); - String s; - for (StringBuilder word: words) { - s = word.toString(); - this.tag2print.put(s.toLowerCase(), s); - this.print2tag.put(s, s.toLowerCase()); - } - } - - public String getName() { - return this.navigatorName; - } - - public Metatag getMetatag(final String word) { - String printname = this.tag2print.get(word); - if (printname == null) return null; - return metatag(this.navigatorName, printname); - } - - public Set tags() { - return this.tag2print.keySet(); - } - - @Override - public String toString() { - return this.print2tag.toString(); - } - } - - private final static Pattern PATTERN_AE = Pattern.compile("\u00E4"); // german umlaute hack for better matching - private final static Pattern PATTERN_OE = Pattern.compile("\u00F6"); - private final static Pattern PATTERN_UE = Pattern.compile("\u00FC"); - private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF"); - private final static Pattern PATTERN_UL = Pattern.compile("_"); - private final static Pattern PATTERN_SP = Pattern.compile(" "); - - private static final String normalizeWord(String word) { - word = word.trim().toLowerCase(); - word = PATTERN_AE.matcher(word).replaceAll("ae"); - word = PATTERN_OE.matcher(word).replaceAll("oe"); - word = PATTERN_UE.matcher(word).replaceAll("ue"); - word = PATTERN_SZ.matcher(word).replaceAll("ss"); - return word; - } - - public class Metatag { - private final String vocName; - private final String print; - public Metatag(String vocName, String print) { - this.vocName = vocName; - this.print = print; - } - public Metatag(String metatag) throws RuntimeException { - assert metatag.charAt(0) == Autotagging.this.prefixChar; - int p = metatag.indexOf(':'); - if (p < 0) throw new RuntimeException("bad metatag: metatag = " + metatag); - this.vocName = metatag.substring(1, p); - this.print = decodeMaskname(metatag.substring(p + 1)); - } - public String getVocabularyName() { - return this.vocName; - } - public String getPrintName() { - return this.print; - } - @Override - public String toString() { - return Autotagging.this.prefixChar + this.vocName + ":" + encodePrintname(this.print); - } - @Override - public boolean equals(Object m) { - Metatag m0 = (Metatag) m; - return this.vocName.equals(m0.vocName) && this.print.equals(m0.print); - } - @Override - public int hashCode() { - return this.vocName.hashCode() + this.print.hashCode(); - } - } - - public static final String encodePrintname(String printname) { - return PATTERN_SP.matcher(printname).replaceAll("_"); - } - - public static final String decodeMaskname(String maskname) { - return PATTERN_UL.matcher(maskname).replaceAll(" "); - } - - public Metatag metatag(String vocName, String print) { - return new Metatag(vocName, print); - } - - public Metatag metatag(String metatag) throws RuntimeException { - return new Metatag(metatag); - } - - public static boolean metatagAppearIn(final Metatag metatag, final String[] tags) { + public static boolean metatagAppearIn(final SimpleVocabulary.Metatag metatag, final String[] tags) { String tag = metatag.toString(); for (String s: tags) { if (tag.equals(s)) return true; } return false; } - - public String cleanTagFromAutotagging(final String tagString) { - if (tagString == null || tagString.length() == 0) return ""; - String[] tags = PATTERN_SP.split(tagString); - StringBuilder sb = new StringBuilder(tagString.length()); - for (String tag : tags) { - if (tag.length() > 0 && tag.charAt(0) != this.prefixChar) { - sb.append(tag).append(' '); - } - } - if (sb.length() == 0) return ""; - return sb.substring(0, sb.length() - 1); + + public SimpleVocabulary.Metatag metatag(String vocName, String print) { + return new SimpleVocabulary.Metatag(this.prefixChar, vocName, print); } + public SimpleVocabulary.Metatag metatag(String metatag) { + return new SimpleVocabulary.Metatag(this.prefixChar, metatag); + } + + public String cleanTagFromAutotagging(String tagString) { + return SimpleVocabulary.Metatag.cleanTagFromAutotagging(this.prefixChar, tagString); + } + public static void main(String[] args) { Autotagging a = new Autotagging(new File("DATA/DICTIONARIES/" + LibraryProvider.path_to_autotagging_dictionaries), '$'); - for (Map.Entry entry: a.vocabularies.entrySet()) { + for (Map.Entry entry: a.vocabularies.entrySet()) { System.out.println(entry); } Set tags = a.getPrintTagsFromText("In die Tueren und Fluchttueren muessen noch Schloesser eingebaut werden");