added autotagging stub .. only reading and parsing of vocabularies at

this time
pull/1/head
Michael Christen 13 years ago
parent 2ee8cbeb2c
commit bd40a10230

@ -57,7 +57,9 @@ public class DictionaryLoader_p {
prop.put(dictionary.nickname + "ActionDeactivated", 0);
}
if (post == null) return prop;
if (post == null) {
return prop;
}
// GEON0
if (post.containsKey("geon0Load")) {
@ -67,6 +69,7 @@ public class DictionaryLoader_p {
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file()));
LibraryProvider.autotagging.addLocalization(LibraryProvider.geoLoc);
prop.put("geon0Status", LibraryProvider.Dictionary.GEON0.file().exists() ? 1 : 0);
prop.put("geon0ActionLoaded", 1);
} catch (final MalformedURLException e) {
@ -96,6 +99,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon0Activate")) {
LibraryProvider.Dictionary.GEON0.fileDisabled().renameTo(LibraryProvider.Dictionary.GEON0.file());
LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file()));
LibraryProvider.autotagging.addLocalization(LibraryProvider.geoLoc);
prop.put("geon0ActionActivated", 1);
}
@ -108,6 +112,7 @@ public class DictionaryLoader_p {
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
LibraryProvider.geoLoc.removeLocalization(LibraryProvider.Dictionary.GEODB0.nickname);
LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEODB1.nickname, new OpenGeoDBLocalization(LibraryProvider.Dictionary.GEODB1.file(), false));
LibraryProvider.autotagging.addLocalization(LibraryProvider.geoLoc);
prop.put("geo1Status", LibraryProvider.Dictionary.GEODB1.file().exists() ? 1 : 0);
prop.put("geo1ActionLoaded", 1);
} catch (final MalformedURLException e) {
@ -137,6 +142,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geo1Activate")) {
LibraryProvider.Dictionary.GEODB1.fileDisabled().renameTo(LibraryProvider.Dictionary.GEODB1.file());
LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEODB1.nickname, new OpenGeoDBLocalization(LibraryProvider.Dictionary.GEODB1.file(), false));
LibraryProvider.autotagging.addLocalization(LibraryProvider.geoLoc);
prop.put("geo1ActionActivated", 1);
}

@ -0,0 +1,238 @@
/**
* Autotagging
* Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 07.01.2012 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.document.geolocalization.Localization;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
/**
 * Autotagging provides a set of tag/print-name properties which can be used to
 * - create tags from texts automatically
 * - create navigation entries for given tags
 *
 * Vocabularies are read from "*.vocabulary" files inside a directory and can
 * additionally be derived from a {@link Localization}.
 */
public class Autotagging {

    /** placeholder value; {@link #allTags} only uses its key set */
    final static Object PRESENT = new Object();

    /** file name extension that marks a vocabulary file inside the autotagging path */
    private static final String VOCABULARY_SUFFIX = ".vocabulary";

    final char prefixChar;
    final File autotaggingPath;
    final Map<String, Vocabulary> vocabularies;
    final Map<String, Object> allTags;

    /**
     * create the autotagging object and load all vocabularies from the given path
     * @param autotaggingPath directory with the vocabulary files; may be null, then no vocabulary is loaded
     * @param prefixChar the character that is put in front of every metatag
     */
    public Autotagging(final File autotaggingPath, char prefixChar) {
        this.vocabularies = new ConcurrentHashMap<String, Vocabulary>();
        this.autotaggingPath = autotaggingPath;
        this.prefixChar = prefixChar;
        this.allTags = new ConcurrentHashMap<String, Object>();
        reload();
    }

    /**
     * scan the input directory and load all tag tables (again).
     * All vocabularies and tags that are currently known are dropped first.
     * A tag table is a file with the extension ".vocabulary"; the file name without
     * the extension becomes the vocabulary name. See
     * {@link Vocabulary#Vocabulary(String, File)} for the line format.
     * Files that cannot be read are logged and skipped.
     */
    public void reload() {
        this.vocabularies.clear();
        this.allTags.clear();
        if (this.autotaggingPath == null || !this.autotaggingPath.exists()) {
            return;
        }
        final String[] files = this.autotaggingPath.list();
        if (files == null) {
            // File.list() returns null if the path is not a directory or an I/O error occurs
            return;
        }
        for (final String f: files) {
            if (!f.endsWith(VOCABULARY_SUFFIX)) {
                continue;
            }
            try {
                final String vocName = f.substring(0, f.length() - VOCABULARY_SUFFIX.length());
                final Vocabulary voc = new Vocabulary(vocName, new File(this.autotaggingPath, f));
                this.vocabularies.put(vocName, voc);
                registerTags(voc);
            } catch (final IOException e) {
                Log.logException(e);
            }
        }
    }

    /*
    public void addDidYouMean(WordCache wordCache) {
    }
    */

    /**
     * create a vocabulary with the name "Locale" from the location names of the given
     * localization. An already existing "Locale" vocabulary is replaced; tags that were
     * registered by the replaced vocabulary stay in the set of all tags.
     * @param localization the source of the location names
     */
    public void addLocalization(Localization localization) {
        final Vocabulary voc = new Vocabulary("Locale", localization);
        this.vocabularies.put("Locale", voc);
        registerTags(voc);
    }

    /**
     * add all tags of a vocabulary to the set of all known tags
     * @param voc the vocabulary whose tags are registered
     */
    private void registerTags(final Vocabulary voc) {
        for (final String t: voc.tags()) {
            this.allTags.put(t, PRESENT);
        }
    }

    /**
     * produce a set of tags for a given text.
     * The set contains the names of the tags with a prefix character at the front
     * This is a stub: the text is not evaluated yet and the result is always empty.
     * @param text
     * @return an empty, modifiable set
     */
    public Set<String> tags(String text) {
        Set<String> as = new HashSet<String>();
        return as;
    }

    /**
     * A Vocabulary is a named, bidirectional mapping between tags (always lower case)
     * and print names (the visible name of a tag).
     */
    public static class Vocabulary {

        final String navigatorName;
        final Map<String, String> tag2print, print2tag;

        /**
         * create an empty vocabulary
         * @param name the name of the vocabulary
         */
        public Vocabulary(String name) {
            this.navigatorName = name;
            this.tag2print = new ConcurrentHashMap<String, String>();
            this.print2tag = new ConcurrentHashMap<String, String>();
        }

        /**
         * read a vocabulary from a file. Every line has the form
         *   printname SEPARATOR tag, tag, ...
         * where the separator is the first ':' of the line, or if there is none the
         * first '=', or if there is none the first tab. All tags are converted to
         * lower case and point to the print name; the lower-case print name is a tag
         * as well and is the tag that is returned for the print name.
         * A line without separator is a print name without additional tags.
         * Empty lines, lines starting with '#' and lines with an empty print name are ignored.
         * @param name the name of the vocabulary
         * @param propFile the file to read
         * @throws IOException if reading the file fails
         */
        public Vocabulary(String name, File propFile) throws IOException {
            this(name);
            final ArrayList<String> list = FileUtils.getListArray(propFile);
            for (String line: list) {
                line = line.trim();
                if (line.length() == 0 || line.charAt(0) == '#') {
                    continue;
                }
                final int p = separatorIndex(line);
                final String k = (p < 0) ? line : line.substring(0, p).trim();
                if (k.length() == 0) {
                    // a separator at the start of the line: there is no print name to register
                    continue;
                }
                if (p >= 0) {
                    for (String t: line.substring(p + 1).split(",")) {
                        t = t.trim().toLowerCase(Locale.ROOT);
                        if (t.length() > 0) {
                            this.tag2print.put(t, k);
                        }
                    }
                }
                // the print name itself is always usable as tag; lower case like all other tags
                final String tag = k.toLowerCase(Locale.ROOT);
                this.tag2print.put(tag, k);
                this.print2tag.put(k, tag);
            }
        }

        /**
         * create a vocabulary from the location names of a localization.
         * Every location name is a print name, its lower-case form is the tag.
         * @param name the name of the vocabulary
         * @param localization the source of the location names
         */
        public Vocabulary(String name, Localization localization) {
            this(name);
            final Set<String> locNames = localization.locationNames();
            for (final String loc: locNames) {
                final String tag = loc.toLowerCase(Locale.ROOT);
                this.tag2print.put(tag, loc);
                this.print2tag.put(loc, tag);
            }
        }

        /**
         * find the position of the key/value separator in a vocabulary line
         * @param line a trimmed, non-empty line
         * @return the index of the first ':', else of the first '=', else of the first tab, or -1 if there is none
         */
        private static int separatorIndex(final String line) {
            int p = line.indexOf(':');
            if (p < 0) {
                p = line.indexOf('=');
            }
            if (p < 0) {
                p = line.indexOf('\t');
            }
            return p;
        }

        /**
         * @return the name of the vocabulary
         */
        public String getName() {
            return this.navigatorName;
        }

        /**
         * @param tag a tag in lower case
         * @return the print name for the tag or null if the tag is unknown
         */
        public String getPrint(final String tag) {
            return this.tag2print.get(tag);
        }

        /**
         * @param print a print name
         * @return the tag for the print name or null if the print name is unknown
         */
        public String getTag(final String print) {
            return this.print2tag.get(print);
        }

        /**
         * @return the tags of this vocabulary; this is the key set of the internal map, not a copy
         */
        public Set<String> tags() {
            return this.tag2print.keySet();
        }

        @Override
        public String toString() {
            return this.print2tag.toString();
        }
    }

    /**
     * A Metatag is the combination of a vocabulary name and a print name.
     * The string form is: prefix character, vocabulary name, ':', print name.
     */
    public class Metatag {

        private final String vocName;
        private final String print;

        /**
         * @param vocName the name of the vocabulary
         * @param print the print name of the tag
         */
        public Metatag(String vocName, String print) {
            this.vocName = vocName;
            this.print = print;
        }

        /**
         * parse the string form of a metatag. The text between the prefix character and
         * the first ':' is the vocabulary name, all text behind the ':' is taken
         * unchanged as print name (underscores are not translated back into spaces).
         * The format is only checked with assertions.
         * @param metatag the string form, starting with the prefix character
         */
        public Metatag(String metatag) {
            assert metatag.charAt(0) == Autotagging.this.prefixChar;
            int p = metatag.indexOf(':');
            assert p > 0;
            this.vocName = metatag.substring(1, p);
            this.print = metatag.substring(p + 1);
        }

        /**
         * @return the name of the vocabulary
         */
        public String getVocabularyName() {
            return this.vocName;
        }

        /**
         * @return the print name
         */
        public String getPrintName() {
            return this.print;
        }

        /**
         * @return the string form of the metatag; every space in the print name is replaced by '_'
         */
        public String getMetatag() {
            return Autotagging.this.prefixChar + this.vocName + ":" + this.print.replace(' ', '_');
        }
    }

    /**
     * @param vocName the name of the vocabulary
     * @param print the print name of the tag
     * @return a new metatag bound to the prefix character of this object
     */
    public Metatag metatag(String vocName, String print) {
        return new Metatag(vocName, print);
    }

    /**
     * @param metatag the string form of a metatag
     * @return the parsed metatag
     */
    public Metatag metatag(String metatag) {
        return new Metatag(metatag);
    }

    /**
     * load the vocabularies from the DATA/DICTIONARIES folder and print them
     * @param args not used
     */
    public static void main(String[] args) {
        Autotagging a = new Autotagging(new File("DATA/DICTIONARIES/" + LibraryProvider.path_to_autotagging_dictionaries), '$');
        for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) {
            System.out.println(entry);
        }
    }
}

@ -48,12 +48,15 @@ import net.yacy.kelondro.util.FileUtils;
public class LibraryProvider
{
private static final String path_to_source_dictionaries = "source";
private static final String path_to_did_you_mean_dictionaries = "didyoumean";
public static final char tagPrefix = '$';
public static final String path_to_source_dictionaries = "source";
public static final String path_to_did_you_mean_dictionaries = "didyoumean";
public static final String path_to_autotagging_dictionaries = "autotagging";
public static final String disabledExtension = ".disabled";
public static WordCache dymLib = new WordCache(null);
public static Autotagging autotagging = new Autotagging(null, tagPrefix);
public static OverarchingLocalization geoLoc = new OverarchingLocalization();
private static File dictSource = null;
private static File dictRoot = null;
@ -91,7 +94,7 @@ public class LibraryProvider
* initialize the LibraryProvider as static class. This assigns default paths, and initializes the
* dictionary classes Additionally, if default dictionaries are given in the source path, they are
* translated into the input format inside the DATA/DICTIONARIES directory
*
*
* @param pathToSource
* @param pathToDICTIONARIES
*/
@ -107,6 +110,8 @@ public class LibraryProvider
initDidYouMean();
integrateOpenGeoDB();
integrateGeonames();
initAutotagging(tagPrefix);
autotagging.addLocalization(geoLoc);
}
public static void integrateOpenGeoDB() {
@ -141,6 +146,14 @@ public class LibraryProvider
dymLib = new WordCache(dymDict);
}
/**
 * set up the autotagging library: the autotagging folder inside the dictionary root
 * is created if it does not exist, then the static autotagging object is replaced
 * by a new one which reads from that folder
 * @param prefix the prefix character that is handed over to the new Autotagging object
 */
public static void initAutotagging(char prefix) {
    final File vocabularyDir = new File(dictRoot, path_to_autotagging_dictionaries);
    final boolean dirPresent = vocabularyDir.exists();
    if ( !dirPresent ) {
        vocabularyDir.mkdirs();
    }
    autotagging = new Autotagging(vocabularyDir, prefix);
}
public static void removeDeReWo() {
final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries);
final File derewoInput = LibraryProvider.Dictionary.DRW0.file();

@ -165,6 +165,20 @@ public class GeonamesLocalization implements Localization
return a;
}
/**
 * collect the names of all locations: every key of the name2ids table
 * is converted to a String and added to a new set
 * @return a new set with the location names
 */
@Override
public Set<String> locationNames() {
    final Set<String> names = new HashSet<String>();
    for (final StringBuilder key: this.name2ids.keySet()) {
        names.add(key.toString());
    }
    return names;
}
@Override
public Set<String> recommend(final String s) {
final Set<String> a = new HashSet<String>();

@ -47,12 +47,19 @@ public interface Localization {
*/
public TreeSet<Location> find(String anyname, boolean locationexact);
/**
* produce a set of location names
* @return a set of names
*/
public Set<String> locationNames();
/**
* recommend a set of names according to a given name
* @param s a possibly partially matching name
* @return a set of names that match with the given name using the local dictionary of names
*/
public Set<String> recommend(String s);
/**
* recommend a set of names according to a given name
* @param s a possibly partially matching name
@ -70,6 +77,7 @@ public interface Localization {
* hashCode that must be used to distinguish localization services in hash sets
* @return the hash code, may be derived from the nickname
*/
@Override
public int hashCode();
/**
@ -77,5 +85,6 @@ public interface Localization {
* @param other
* @return true if both objects are localization services and have the same nickname
*/
@Override
public boolean equals(Object other);
}

@ -195,7 +195,7 @@ public class OpenGeoDBLocalization implements Localization
/**
* check database tables against occurrences of this entity the anyname - String may be one of: - name of
* a town, villa, region etc - zip code - telephone prefix - kfz sign
*
*
* @param anyname
* @return
*/
@ -241,9 +241,23 @@ public class OpenGeoDBLocalization implements Localization
return a;
}
/**
 * collect the location names of this localization as strings;
 * the names are the keys of the name2ids table
 * @return a new set with one String for every key
 */
@Override
public Set<String> locationNames() {
    final Set<String> result = new HashSet<String>();
    for (final StringBuilder name: this.name2ids.keySet()) {
        final String asString = name.toString();
        result.add(asString);
    }
    return result;
}
/**
* read the dictionary and construct a set of recommendations to a given string
*
*
* @param s input value that is used to match recommendations
* @return a set that contains all words that start with the input value
*/

@ -56,6 +56,11 @@ public class OverarchingLocalization implements Localization {
this.services.remove(nickname);
}
/**
* the number of locations that this localization stores
* @return the number of locations
*/
@Override
public int locations() {
int locations = 0;
for (final Localization service: this.services.values()) {
@ -65,8 +70,12 @@ public class OverarchingLocalization implements Localization {
}
/**
* find (a set of) locations
* find a location by name
* @param anyname - a name of a location
* @param locationexact - if true, then only exact matches with the location are returned; if false, partially matching names are returned as well
* @return a set of locations, ordered by population (if this information is given)
*/
@Override
public TreeSet<Location> find(final String anyname, final boolean locationexact) {
final TreeSet<Location> locations = new TreeSet<Location>();
for (final Localization service: this.services.values()) {
@ -76,36 +85,80 @@ public class OverarchingLocalization implements Localization {
}
/**
 * produce a set of location names
 * @return a new set with the names from all registered services
 */
@Override
public Set<String> locationNames() {
    final Set<String> union = new HashSet<String>();
    for (final Localization delegate: this.services.values()) {
        final Set<String> names = delegate.locationNames();
        union.addAll(names);
    }
    return union;
}
/**
 * recommend a set of names according to a given name;
 * the recommendations of all registered services are merged
 * @param s a possibly partially matching name
 * @return a new set with the recommendations of all services; empty if s is empty
 */
@Override
public Set<String> recommend(final String s) {
    final Set<String> matches = new HashSet<String>();
    if (s.length() > 0) {
        for (final Localization delegate: this.services.values()) {
            matches.addAll(delegate.recommend(s));
        }
    }
    return matches;
}
/**
 * recommend a set of names according to a given name;
 * the recommendations of all registered services are merged
 * @param s a possibly partially matching name
 * @return a new set with the recommendations of all services; empty if s is empty
 */
@Override
public Set<StringBuilder> recommend(final StringBuilder s) {
    final Set<StringBuilder> matches = new HashSet<StringBuilder>();
    if (s.length() > 0) {
        for (final Localization delegate: this.services.values()) {
            matches.addAll(delegate.recommend(s));
        }
    }
    return matches;
}
/**
 * the nickname of this localization service
 * @return the constant nickname "oa"
 */
@Override
public String nickname() {
    final String name = "oa";
    return name;
}
/**
 * hashCode that must be used to distinguish localization services in hash sets
 * @return the hash code of the nickname
 */
@Override
public int hashCode() {
    final String name = nickname();
    return name.hashCode();
}
/**
 * compare localization services; to be used for hash sets with localization services
 * @param other the object to compare with
 * @return true if the other object is a localization service with the same nickname
 */
@Override
public boolean equals(final Object other) {
    if (other instanceof Localization) {
        final Localization that = (Localization) other;
        return nickname().equals(that.nickname());
    }
    return false;
}

Loading…
Cancel
Save