- added downloader option in DictionaryLoader - added generalization (interfaces and overarching localization) - more abstraction using the libraries git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6879 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
c9862e0ca9
commit
e43e61e502
@ -0,0 +1,169 @@
|
||||
/**
|
||||
* GeonamesLocalization.java
|
||||
* Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
* first published 16.05.2010 on http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.geolocalization;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.text.Collator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
public class GeonamesLocalization implements Localization {
|
||||
|
||||
/*
|
||||
The main 'geoname' table has the following fields :
|
||||
---------------------------------------------------
|
||||
geonameid : integer id of record in geonames database
|
||||
name : name of geographical point (utf8) varchar(200)
|
||||
asciiname : name of geographical point in plain ascii characters, varchar(200)
|
||||
alternatenames : alternatenames, comma separated varchar(5000)
|
||||
latitude : latitude in decimal degrees (wgs84)
|
||||
longitude : longitude in decimal degrees (wgs84)
|
||||
feature class : see http://www.geonames.org/export/codes.html, char(1)
|
||||
feature code : see http://www.geonames.org/export/codes.html, varchar(10)
|
||||
country code : ISO-3166 2-letter country code, 2 characters
|
||||
cc2 : alternate country codes, comma separated, ISO-3166 2-letter country code, 60 characters
|
||||
admin1 code : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
|
||||
admin2 code : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)
|
||||
admin3 code : code for third level administrative division, varchar(20)
|
||||
admin4 code : code for fourth level administrative division, varchar(20)
|
||||
population : bigint (8 byte int)
|
||||
elevation : in meters, integer
|
||||
gtopo30 : average elevation of 30'x30' (ca 900mx900m) area in meters, integer
|
||||
timezone : the timezone id (see file timeZone.txt)
|
||||
modification date : date of last modification in yyyy-MM-dd format
|
||||
*/
|
||||
|
||||
// use a collator to relax when distinguishing between lowercase und uppercase letters
|
||||
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
|
||||
static {
|
||||
insensitiveCollator.setStrength(Collator.SECONDARY);
|
||||
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
|
||||
}
|
||||
|
||||
private final HashMap<Integer, Location> id2loc;
|
||||
private final TreeMap<String, List<Integer>> name2ids;
|
||||
private final File file;
|
||||
|
||||
public GeonamesLocalization(final File file) {
|
||||
// this is a processing of the cities1000.zip file from http://download.geonames.org/export/dump/
|
||||
|
||||
this.file = file;
|
||||
this.id2loc = new HashMap<Integer, Location>();
|
||||
this.name2ids = new TreeMap<String, List<Integer>>(insensitiveCollator);
|
||||
|
||||
if (file == null || !file.exists()) return;
|
||||
BufferedReader reader;
|
||||
try {
|
||||
ZipFile zf = new ZipFile(file);
|
||||
ZipEntry ze = zf.getEntry("cities1000.txt");
|
||||
InputStream is = zf.getInputStream(ze);
|
||||
reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
return;
|
||||
}
|
||||
|
||||
// when an error occurs after this line, just accept it and work on
|
||||
try {
|
||||
String line;
|
||||
String[] fields;
|
||||
Set<String> locnames;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if (line.length() == 0) continue;
|
||||
fields = line.split("\t");
|
||||
int id = Integer.parseInt(fields[0]);
|
||||
locnames = new HashSet<String>();
|
||||
locnames.add(fields[1]);
|
||||
locnames.add(fields[2]);
|
||||
for (String s: fields[3].split(",")) locnames.add(s);
|
||||
Location c = new Location(Double.parseDouble(fields[5]), Double.parseDouble(fields[4]), fields[1]);
|
||||
this.id2loc.put(id, c);
|
||||
for (String name: locnames) {
|
||||
List<Integer> locs = this.name2ids.get(name);
|
||||
if (locs == null) locs = new ArrayList<Integer>(1);
|
||||
locs.add(id);
|
||||
this.name2ids.put(name, locs);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public Set<Location> find(String anyname, boolean locationexact) {
|
||||
HashSet<Integer> r = new HashSet<Integer>();
|
||||
List<Integer> c;
|
||||
if (locationexact) {
|
||||
c = this.name2ids.get(anyname); if (c != null) r.addAll(c);
|
||||
} else {
|
||||
SortedMap<String, List<Integer>> cities = this.name2ids.tailMap(anyname);
|
||||
for (Map.Entry<String, List<Integer>> e: cities.entrySet()) {
|
||||
if (e.getKey().toLowerCase().startsWith(anyname.toLowerCase())) r.addAll(e.getValue()); else break;
|
||||
}
|
||||
}
|
||||
HashSet<Location> a = new HashSet<Location>();
|
||||
for (Integer e: r) {
|
||||
Location w = this.id2loc.get(e);
|
||||
if (w != null) a.add(w);
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
public Set<String> recommend(String s) {
|
||||
Set<String> a = new HashSet<String>();
|
||||
s = s.trim().toLowerCase();
|
||||
SortedMap<String, List<Integer>> t = this.name2ids.tailMap(s);
|
||||
for (String r: t.keySet()) {
|
||||
if (r.startsWith(s)) a.add(r); else break;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
public String nickname() {
|
||||
return this.file.getName();
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.nickname().hashCode();
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (!(other instanceof Localization)) return false;
|
||||
return this.nickname().equals(((Localization) other).nickname());
|
||||
}
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
/**
|
||||
* Localization.java
|
||||
* Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
* first published 16.05.2010 on http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
package net.yacy.document.geolocalization;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* localization interface
|
||||
* @author Michael Peter Christen
|
||||
*
|
||||
*/
|
||||
public interface Localization {
|
||||
|
||||
/**
|
||||
* find a location by name
|
||||
* @param anyname - a name of a location
|
||||
* @param locationexact - if true, then only exact matched with the location are returned. if false also partially matching names
|
||||
* @return a set of locations
|
||||
*/
|
||||
public Set<Location> find(String anyname, boolean locationexact);
|
||||
|
||||
/**
|
||||
* recommend a set of names according to a given name
|
||||
* @param s a possibly partially matching name
|
||||
* @return a set of names that match with the given name using the local dictionary of names
|
||||
*/
|
||||
public Set<String> recommend(String s);
|
||||
|
||||
/**
|
||||
* return an nickname of the localization service
|
||||
* @return the nickname
|
||||
*/
|
||||
public String nickname();
|
||||
|
||||
/**
|
||||
* hashCode that must be used to distinuguish localization services in hash sets
|
||||
* @return the hash code, may be derived from the nickname
|
||||
*/
|
||||
public int hashCode();
|
||||
|
||||
/**
|
||||
* compare localization services; to be used for hash sets with localization services
|
||||
* @param other
|
||||
* @return true if both objects are localization services and have the same nickname
|
||||
*/
|
||||
public boolean equals(Object other);
|
||||
}
|
@ -0,0 +1,93 @@
|
||||
/**
|
||||
* OverarchingLocalization.java
|
||||
* Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
* first published 16.05.2010 on http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.geolocalization;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
public class OverarchingLocalization implements Localization {
|
||||
|
||||
private Map<String, Localization> services;
|
||||
|
||||
/**
|
||||
* create a new overarching localization object
|
||||
*/
|
||||
public OverarchingLocalization() {
|
||||
this.services = new HashMap<String, Localization>();
|
||||
}
|
||||
|
||||
/**
|
||||
* add a localization service
|
||||
* @param nickname the nickname of the service
|
||||
* @param service the service
|
||||
*/
|
||||
public void addLocalization(String nickname, Localization service) {
|
||||
this.services.put(nickname, service);
|
||||
}
|
||||
|
||||
/**
|
||||
* remove a localization service
|
||||
* @param nickname
|
||||
*/
|
||||
public void removeLocalization(String nickname) {
|
||||
this.services.remove(nickname);
|
||||
}
|
||||
|
||||
/**
|
||||
* find (a set of) locations
|
||||
*/
|
||||
public Set<Location> find(String anyname, boolean locationexact) {
|
||||
Set<Location> locations = new HashSet<Location>();
|
||||
for (Localization service: this.services.values()) {
|
||||
locations.addAll(service.find(anyname, locationexact));
|
||||
}
|
||||
return locations;
|
||||
}
|
||||
|
||||
/**
|
||||
* recommend location names
|
||||
*/
|
||||
public Set<String> recommend(String s) {
|
||||
Set<String> recommendations = new HashSet<String>();
|
||||
for (Localization service: this.services.values()) {
|
||||
recommendations.addAll(service.recommend(s));
|
||||
}
|
||||
return recommendations;
|
||||
}
|
||||
|
||||
public String nickname() {
|
||||
return "oa";
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.nickname().hashCode();
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (!(other instanceof Localization)) return false;
|
||||
return this.nickname().equals(((Localization) other).nickname());
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue