diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java index 4350d5655..9568816e2 100644 --- a/htroot/DictionaryLoader_p.java +++ b/htroot/DictionaryLoader_p.java @@ -57,7 +57,9 @@ public class DictionaryLoader_p { prop.put(dictionary.nickname + "ActionDeactivated", 0); } - if (post == null) return prop; + if (post == null) { + return prop; + } // GEON0 if (post.containsKey("geon0Load")) { @@ -67,6 +69,7 @@ public class DictionaryLoader_p { final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file()); LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file())); + LibraryProvider.autotagging.addLocalization(LibraryProvider.geoLoc); prop.put("geon0Status", LibraryProvider.Dictionary.GEON0.file().exists() ? 1 : 0); prop.put("geon0ActionLoaded", 1); } catch (final MalformedURLException e) { @@ -96,6 +99,7 @@ public class DictionaryLoader_p { if (post.containsKey("geon0Activate")) { LibraryProvider.Dictionary.GEON0.fileDisabled().renameTo(LibraryProvider.Dictionary.GEON0.file()); LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file())); + LibraryProvider.autotagging.addLocalization(LibraryProvider.geoLoc); prop.put("geon0ActionActivated", 1); } @@ -108,6 +112,7 @@ public class DictionaryLoader_p { FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file()); LibraryProvider.geoLoc.removeLocalization(LibraryProvider.Dictionary.GEODB0.nickname); LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEODB1.nickname, new OpenGeoDBLocalization(LibraryProvider.Dictionary.GEODB1.file(), false)); + LibraryProvider.autotagging.addLocalization(LibraryProvider.geoLoc); prop.put("geo1Status", LibraryProvider.Dictionary.GEODB1.file().exists() ? 1 : 0); prop.put("geo1ActionLoaded", 1); } catch (final MalformedURLException e) { @@ -137,6 +142,7 @@ public class DictionaryLoader_p { if (post.containsKey("geo1Activate")) { LibraryProvider.Dictionary.GEODB1.fileDisabled().renameTo(LibraryProvider.Dictionary.GEODB1.file()); LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEODB1.nickname, new OpenGeoDBLocalization(LibraryProvider.Dictionary.GEODB1.file(), false)); + LibraryProvider.autotagging.addLocalization(LibraryProvider.geoLoc); prop.put("geo1ActionActivated", 1); } diff --git a/source/net/yacy/document/Autotagging.java b/source/net/yacy/document/Autotagging.java new file mode 100644 index 000000000..724101d17 --- /dev/null +++ b/source/net/yacy/document/Autotagging.java @@ -0,0 +1,238 @@ +/** + * Autotagging + * Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * first published 07.01.2012 on http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +import net.yacy.document.geolocalization.Localization; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.FileUtils; + +/** + * Autotagging provides a set of tag/print-name properties which can be used to + * - create tags from texts automatically + * - create navigation entries for given tags + */ +public class Autotagging { + + final static Object PRESENT = new Object(); + + final char prefixChar; + final File autotaggingPath; + final Map vocabularies; + final Map allTags; + + public Autotagging(final File autotaggingPath, char prefixChar) { + this.vocabularies = new ConcurrentHashMap(); + this.autotaggingPath = autotaggingPath; + this.prefixChar = prefixChar; + this.allTags = new ConcurrentHashMap(); + reload(); + } + + + /** + * scan the input directory and load all tag tables (again) + * a tag table is a property file where + * the key is the tag name + * the value is the visible name for the tag (shown in a navigator) + * properties without values are allowed (the value is then set to the key) + * also the value can be used as a tag + */ + public void reload() { + this.vocabularies.clear(); + this.allTags.clear(); + if (this.autotaggingPath == null || !this.autotaggingPath.exists()) { + return; + } + final String[] files = this.autotaggingPath.list(); + for (final String f: files) { + if (f.endsWith(".vocabulary")) { + try { + File ff = new File(this.autotaggingPath, f); + String vocName = ff.getName(); + vocName = vocName.substring(0, vocName.length() - 11); + Vocabulary voc = new Vocabulary(vocName, ff); + this.vocabularies.put(vocName, voc); + for (String t: voc.tags()) { + this.allTags.put(t, PRESENT); + } + } catch (final IOException e) { + Log.logException(e); + } + } + } + } + + /* + public void addDidYouMean(WordCache wordCache) { + + } + */ + + public void addLocalization(Localization localization) { + Vocabulary voc = new Vocabulary("Locale", localization); + this.vocabularies.put("Locale", voc); + for (String t: voc.tags()) { + this.allTags.put(t, PRESENT); + } + } + + /** + * produce a set of tags for a given text. + * The set contains the names of the tags with a prefix character at the front + * @param text + * @return + */ + public Set tags(String text) { + Set as = new HashSet(); + + return as; + } + + public static class Vocabulary { + + final String navigatorName; + final Map tag2print, print2tag; + + public Vocabulary(String name) { + this.navigatorName = name; + this.tag2print = new ConcurrentHashMap(); + this.print2tag = new ConcurrentHashMap(); + } + + public Vocabulary(String name, File propFile) throws IOException { + this(name); + ArrayList list = FileUtils.getListArray(propFile); + String k, v; + String[] tags; + int p; + vocloop: for (String line: list) { + line = line.trim(); + if (line.length() == 0 || line.charAt(0) == '#') { + continue vocloop; + } + p = line.indexOf(':'); + if (p < 0) { + p = line.indexOf('='); + } + if (p < 0) { + p = line.indexOf('\t'); + } + if (p < 0) { + this.tag2print.put(line, line); + this.print2tag.put(line, line); + continue vocloop; + } + k = line.substring(0, p).trim(); + v = line.substring(p + 1); + tags = v.split(","); + tagloop: for (String t: tags) { + t = t.trim().toLowerCase(); + if (t.length() == 0) { + continue tagloop; + } + this.tag2print.put(t, k); + this.print2tag.put(k, t); + } + this.tag2print.put(k.toLowerCase(), k); + this.print2tag.put(k, k.toLowerCase()); + } + } + + public Vocabulary(String name, Localization localization) { + this(name); + Set locNames = localization.locationNames(); + for (String loc: locNames) { + this.tag2print.put(loc.toLowerCase(), loc); + this.print2tag.put(loc, loc.toLowerCase()); + } + } + + public String getName() { + return this.navigatorName; + } + + public String getPrint(final String tag) { + return this.tag2print.get(tag); + } + + public String getTag(final String print) { + return this.print2tag.get(print); + } + + public Set tags() { + return this.tag2print.keySet(); + } + + @Override + public String toString() { + return this.print2tag.toString(); + } + } + + public class Metatag { + private final String vocName; + private final String print; + public Metatag(String vocName, String print) { + this.vocName = vocName; + this.print = print; + } + public Metatag(String metatag) { + assert metatag.charAt(0) == Autotagging.this.prefixChar; + int p = metatag.indexOf(':'); + assert p > 0; + this.vocName = metatag.substring(1, p); + this.print = metatag.substring(p + 1); + } + public String getVocabularyName() { + return this.vocName; + } + public String getPrintName() { + return this.print; + } + public String getMetatag() { + return Autotagging.this.prefixChar + this.vocName + ":" + this.print.replaceAll(" ", "_"); + } + } + + public Metatag metatag(String vocName, String print) { + return new Metatag(vocName, print); + } + + public Metatag metatag(String metatag) { + return new Metatag(metatag); + } + + public static void main(String[] args) { + Autotagging a = new Autotagging(new File("DATA/DICTIONARIES/" + LibraryProvider.path_to_autotagging_dictionaries), '$'); + for (Map.Entry entry: a.vocabularies.entrySet()) { + System.out.println(entry); + } + } + +} diff --git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java index de27f3f0f..e9b4aca6c 100644 --- a/source/net/yacy/document/LibraryProvider.java +++ b/source/net/yacy/document/LibraryProvider.java @@ -48,12 +48,15 @@ import net.yacy.kelondro.util.FileUtils; public class LibraryProvider { - private static final String path_to_source_dictionaries = "source"; - private static final String path_to_did_you_mean_dictionaries = "didyoumean"; + public static final char tagPrefix = '$'; + public static final String path_to_source_dictionaries = "source"; + public static final String path_to_did_you_mean_dictionaries = "didyoumean"; + public static final String path_to_autotagging_dictionaries = "autotagging"; public static final String disabledExtension = ".disabled"; public static WordCache dymLib = new WordCache(null); + public static Autotagging autotagging = new Autotagging(null, tagPrefix); public static OverarchingLocalization geoLoc = new OverarchingLocalization(); private static File dictSource = null; private static File dictRoot = null; @@ -91,7 +94,7 @@ public class LibraryProvider * initialize the LibraryProvider as static class. This assigns default paths, and initializes the * dictionary classes Additionally, if default dictionaries are given in the source path, they are * translated into the input format inside the DATA/DICTIONARIES directory - * + * * @param pathToSource * @param pathToDICTIONARIES */ @@ -107,6 +110,8 @@ public class LibraryProvider initDidYouMean(); integrateOpenGeoDB(); integrateGeonames(); + initAutotagging(tagPrefix); + autotagging.addLocalization(geoLoc); } public static void integrateOpenGeoDB() { @@ -141,6 +146,14 @@ public class LibraryProvider dymLib = new WordCache(dymDict); } + public static void initAutotagging(char prefix) { + final File autotaggingPath = new File(dictRoot, path_to_autotagging_dictionaries); + if ( !autotaggingPath.exists() ) { + autotaggingPath.mkdirs(); + } + autotagging = new Autotagging(autotaggingPath, prefix); + } + public static void removeDeReWo() { final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries); final File derewoInput = LibraryProvider.Dictionary.DRW0.file(); diff --git a/source/net/yacy/document/geolocalization/GeonamesLocalization.java b/source/net/yacy/document/geolocalization/GeonamesLocalization.java index 735816983..f23ea4f54 100644 --- a/source/net/yacy/document/geolocalization/GeonamesLocalization.java +++ b/source/net/yacy/document/geolocalization/GeonamesLocalization.java @@ -165,6 +165,20 @@ public class GeonamesLocalization implements Localization return a; } + /** + * produce a set of location names + * @return a set of names + */ + @Override + public Set locationNames() { + Set locations = new HashSet(); + Set l = this.name2ids.keySet(); + for (StringBuilder s: l) { + locations.add(s.toString()); + } + return locations; + } + @Override public Set recommend(final String s) { final Set a = new HashSet(); diff --git a/source/net/yacy/document/geolocalization/Localization.java b/source/net/yacy/document/geolocalization/Localization.java index 130cee536..3008c4d70 100644 --- a/source/net/yacy/document/geolocalization/Localization.java +++ b/source/net/yacy/document/geolocalization/Localization.java @@ -47,12 +47,19 @@ public interface Localization { */ public TreeSet find(String anyname, boolean locationexact); + /** + * produce a set of location names + * @return a set of names + */ + public Set locationNames(); + /** * recommend a set of names according to a given name * @param s a possibly partially matching name * @return a set of names that match with the given name using the local dictionary of names */ public Set recommend(String s); + /** * recommend a set of names according to a given name * @param s a possibly partially matching name @@ -70,6 +77,7 @@ public interface Localization { * hashCode that must be used to distinguish localization services in hash sets * @return the hash code, may be derived from the nickname */ + @Override public int hashCode(); /** @@ -77,5 +85,6 @@ public interface Localization { * @param other * @return true if both objects are localization services and have the same nickname */ + @Override public boolean equals(Object other); } diff --git a/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java b/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java index fbf2aeadb..f0f891e8e 100644 --- a/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java +++ b/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java @@ -195,7 +195,7 @@ public class OpenGeoDBLocalization implements Localization /** * check database tables against occurrences of this entity the anyname - String may be one of: - name of * a town, villa, region etc - zip code - telephone prefix - kfz sign - * + * * @param anyname * @return */ @@ -241,9 +241,23 @@ public class OpenGeoDBLocalization implements Localization return a; } + /** + * produce a set of location names + * @return a set of names + */ + @Override + public Set locationNames() { + Set locations = new HashSet(); + Set l = this.name2ids.keySet(); + for (StringBuilder s: l) { + locations.add(s.toString()); + } + return locations; + } + /** * read the dictionary and construct a set of recommendations to a given string - * + * * @param s input value that is used to match recommendations * @return a set that contains all words that start with the input value */ diff --git a/source/net/yacy/document/geolocalization/OverarchingLocalization.java b/source/net/yacy/document/geolocalization/OverarchingLocalization.java index fbf704fe9..93a8183ba 100644 --- a/source/net/yacy/document/geolocalization/OverarchingLocalization.java +++ b/source/net/yacy/document/geolocalization/OverarchingLocalization.java @@ -56,6 +56,11 @@ public class OverarchingLocalization implements Localization { this.services.remove(nickname); } + /** + * the number of locations that this localization stores + * @return the number of locations + */ + @Override public int locations() { int locations = 0; for (final Localization service: this.services.values()) { @@ -65,8 +70,12 @@ public class OverarchingLocalization implements Localization { } /** - * find (a set of) locations + * find a location by name + * @param anyname - a name of a location + * @param locationexact - if true, then only exact matched with the location are returned. if false also partially matching names + * @return a set of locations, ordered by population (if this information is given) */ + @Override public TreeSet find(final String anyname, final boolean locationexact) { final TreeSet locations = new TreeSet(); for (final Localization service: this.services.values()) { @@ -76,36 +85,80 @@ public class OverarchingLocalization implements Localization { } /** - * recommend location names + * produce a set of location names + * @return a set of names + */ + @Override + public Set locationNames() { + final Set locations = new HashSet(); + for (final Localization service: this.services.values()) { + locations.addAll(service.locationNames()); + } + return locations; + } + + /** + * recommend a set of names according to a given name + * @param s a possibly partially matching name + * @return a set of names that match with the given name using the local dictionary of names */ + @Override public Set recommend(final String s) { final Set recommendations = new HashSet(); - if (s.length() == 0) return recommendations; + if (s.length() == 0) { + return recommendations; + } for (final Localization service: this.services.values()) { recommendations.addAll(service.recommend(s)); } return recommendations; } + /** + * recommend a set of names according to a given name + * @param s a possibly partially matching name + * @return a set of names that match with the given name using the local dictionary of names + */ + @Override public Set recommend(final StringBuilder s) { final Set recommendations = new HashSet(); - if (s.length() == 0) return recommendations; + if (s.length() == 0) { + return recommendations; + } for (final Localization service: this.services.values()) { recommendations.addAll(service.recommend(s)); } return recommendations; } + /** + * return an nickname of the localization service + * @return the nickname + */ + @Override public String nickname() { return "oa"; } + /** + * hashCode that must be used to distinguish localization services in hash sets + * @return the hash code, may be derived from the nickname + */ + @Override public int hashCode() { return nickname().hashCode(); } + /** + * compare localization services; to be used for hash sets with localization services + * @param other + * @return true if both objects are localization services and have the same nickname + */ + @Override public boolean equals(final Object other) { - if (!(other instanceof Localization)) return false; + if (!(other instanceof Localization)) { + return false; + } return nickname().equals(((Localization) other).nickname()); }