@ -25,20 +25,25 @@ package net.yacy.document;
import java.io.BufferedReader ;
import java.io.BufferedWriter ;
import java.io.File ;
import java.io.FileInputStream ;
import java.io.FileWriter ;
import java.io.IOException ;
import java.io.InputStream ;
import java.io.InputStreamReader ;
import java.io.PrintWriter ;
import java.net.MalformedURLException ;
import java.util.ArrayList ;
import java.util.List ;
import java.util.Set ;
import java.util.TreeSet ;
import java.util.zip.ZipException ;
import java.util.zip.ZipFile ;
import net.yacy.cora.document.MultiProtocolURI ;
import net.yacy.document.geolocalization.GeonamesLocalization ;
import net.yacy.document.geolocalization.OpenGeoDBLocalization ;
import net.yacy.document.geolocalization.OverarchingLocalization ;
import net.yacy.kelondro.logging.Log ;
import net.yacy.kelondro.util.FileUtils ;
public class LibraryProvider {
@ -53,28 +58,27 @@ public class LibraryProvider {
private static File dictRoot = null ;
/**
 * Catalogue of external dictionary sources known to the library provider.
 * Each constant carries a short nickname, the URL the dictionary is
 * published at, and the name of the local file it is stored under.
 *
 * NOTE(review): this span carries two forms of the constant list and of the
 * constructor side by side: a three-argument form that spells the file name
 * out explicitly, and a two-argument form that derives it from the URL.
 */
public static enum Dictionary {
// three-argument form: nickname, download URL, explicit local file name
GEODB0 ( "geo0" ,
"http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz" ,
"opengeodb-0.2.5a-UTF8-sql.gz" ) ,
GEODB1 ( "geo1" ,
"http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz" ,
"opengeodb-02621_2010-03-16.sql.gz" ) ,
GEON0 ( "geon0" ,
"http://download.geonames.org/export/dump/cities1000.zip" ,
"cities1000.zip" ) ;
// two-argument form: nickname and download URL only; also introduces DRW0
GEODB0 ( "geo0" , "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz" ) ,
GEODB1 ( "geo1" , "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz" ) ,
GEON0 ( "geon0" , "http://download.geonames.org/export/dump/cities1000.zip" ) ,
DRW0 ( "drw0" , "http://www.ids-mannheim.de/kl/derewo/derewo-v-100000t-2009-04-30-0.1.zip" ) ;
// public, non-final fields: nickname and url are copied from the constructor
// arguments; filename is either passed in or derived from the URL
public String nickname , url , filename ;
/**
 * @param nickname short identifier of the dictionary
 * @param url      location the dictionary is published at
 * (the three-argument form additionally takes the local file name)
 */
private Dictionary ( String nickname , String url , String filename ) {
private Dictionary ( final String nickname , final String url ) {
try {
// the file name is taken from the URL itself; in the three-argument
// constants above, the explicit name equals the last path segment of the URL
this . filename = new MultiProtocolURI ( url ) . getFileName ( ) ;
} catch ( final MalformedURLException e ) {
// NOTE(review): when assertions are disabled this branch does nothing and
// filename stays null; file() would then hand a null child name to
// new File(File, String), and fileDisabled() would build the name from
// the string "null" -- confirm that all URLs above are well-formed
assert false ;
}
this . nickname = nickname ;
this . url = url ;
this . filename = filename ;
}
/**
 * @return the file named {@code filename} below {@code dictSource}
 */
public File file ( ) {
return new File ( dictSource , filename) ;
return new File ( dictSource , this . filename) ;
}
/**
 * @return the file below {@code dictSource} whose name is {@code filename}
 *         with {@code disabledExtension} appended
 */
public File fileDisabled ( ) {
return new File ( dictSource , filename + disabledExtension ) ;
return new File ( dictSource , this . filename + disabledExtension ) ;
}
}
@ -100,8 +104,8 @@ public class LibraryProvider {
}
public static void integrateOpenGeoDB ( ) {
File geo1 = Dictionary . GEODB1 . file ( ) ;
File geo0 = Dictionary . GEODB0 . file ( ) ;
final File geo1 = Dictionary . GEODB1 . file ( ) ;
final File geo0 = Dictionary . GEODB0 . file ( ) ;
if ( geo1 . exists ( ) ) {
if ( geo0 . exists ( ) ) geo0 . renameTo ( Dictionary . GEODB0 . fileDisabled ( ) ) ;
geoLoc . addLocalization ( Dictionary . GEODB1 . nickname , new OpenGeoDBLocalization ( geo1 , false ) ) ;
@ -114,7 +118,7 @@ public class LibraryProvider {
}
public static void integrateGeonames ( ) {
File geon = Dictionary . GEON0 . file ( ) ;
final File geon = Dictionary . GEON0 . file ( ) ;
if ( geon . exists ( ) ) {
geoLoc . addLocalization ( Dictionary . GEON0 . nickname , new GeonamesLocalization ( geon ) ) ;
return ;
@ -127,19 +131,25 @@ public class LibraryProvider {
dymLib = new WordCache ( dymDict ) ;
}
/**
 * Removes the word list that belongs to the DeReWo dictionary.
 *
 * The target is the file named like the {@code DRW0} dictionary file with
 * the suffix {@code ".words"}, located in the did-you-mean dictionary
 * directory below {@code dictRoot}. The deletion itself is delegated to
 * {@code FileUtils.deletedelete}.
 */
public static void removeDeReWo() {
    // directory that holds the did-you-mean word lists
    final File didYouMeanDir = new File(dictRoot, path_to_did_you_mean_dictionaries);
    // the word list shares its base name with the downloaded DeReWo archive
    final String wordListName = LibraryProvider.Dictionary.DRW0.file().getName() + ".words";
    FileUtils.deletedelete(new File(didYouMeanDir, wordListName));
}
public static void integrateDeReWo ( ) {
// translate input files (once..)
final File dymDict = new File ( dictRoot , path_to_did_you_mean_dictionaries ) ;
if ( ! dymDict . exists ( ) ) dymDict . mkdirs ( ) ;
final File pathToSource = new File ( dictRoot , path_to_source_dictionaries ) ;
final File derewoInput = new File ( pathToSource , "derewo-v-30000g-2007-12-31-0.1.txt" ) ;
final File derewoOutput = new File ( dymDict , "derewo-v-30000g-2007-12-31-0.1.words" ) ;
final File derewoInput = LibraryProvider . Dictionary . DRW0 . file ( ) ;
final File derewoOutput = new File ( dymDict , derewoInput . getName ( ) + ".words" ) ;
if ( ! derewoOutput . exists ( ) & & derewoInput . exists ( ) ) {
// create the translation of the derewo file (which is easy in this case)
final ArrayList < String > derewo = loadDeReWo ( derewoInput , true ) ;
try {
writeWords ( derewoOutput , derewo ) ;
} catch ( IOException e ) {
} catch ( final IOException e ) {
Log . logException ( e ) ;
}
}
@ -184,28 +194,52 @@ public class LibraryProvider {
private static ArrayList < String > loadDeReWo ( final File file , final boolean toLowerCase ) {
final ArrayList < String > list = new ArrayList < String > ( ) ;
// get the zip file entry from the file
InputStream derewoTxtEntry ;
try {
final ZipFile zip = new ZipFile ( file ) ;
/ *
final Enumeration < ? extends ZipEntry > i = zip . entries ( ) ;
while ( i . hasMoreElements ( ) ) {
final ZipEntry e = i . nextElement ( ) ;
System . out . println ( "loadDeReWo: " + e . getName ( ) ) ;
}
* /
derewoTxtEntry = zip . getInputStream ( zip . getEntry ( "derewo-v-100000t-2009-04-30-0.1" ) ) ;
} catch ( final ZipException e ) {
Log . logException ( e ) ;
return list ;
} catch ( final IOException e ) {
Log . logException ( e ) ;
return list ;
}
BufferedReader reader = null ;
try {
reader = new BufferedReader ( new InputStreamReader ( new FileInputStream ( file ) , "UTF-8" ) ) ;
reader = new BufferedReader ( new InputStreamReader ( derewoTxtEntry , "UTF-8" ) ) ;
String line ;
// read until text starts
while ( ( line = reader . readLine ( ) ) ! = null ) {
if ( line . startsWith ( "-----" ) ) break ;
if ( line . startsWith ( " # -----") ) break ;
}
// read empty line
line = reader . readLine ( ) ;
// read lines
int p ;
int c ;
//int c;
String w ;
while ( ( line = reader . readLine ( ) ) ! = null ) {
line = line . trim ( ) ;
p = line . indexOf ( "\t" ) ;
p = line . indexOf ( " ") ;
if ( p > 0 ) {
c = Integer . parseInt ( line . substring ( p + 1 ) ) ;
if ( c < 1 ) continue ;
list . add ( ( toLowerCase ) ? line . substring ( 0 , p ) . trim ( ) . toLowerCase ( ) : line . substring ( 0 , p ) . trim ( ) ) ;
//c = Integer.parseInt(line.substring(p + 1));
//if (c < 1) continue;
w = ( toLowerCase ) ? line . substring ( 0 , p ) . trim ( ) . toLowerCase ( ) : line . substring ( 0 , p ) . trim ( ) ;
if ( w . length ( ) < 4 ) continue ;
list . add ( w ) ;
}
}
reader . close ( ) ;
@ -217,12 +251,12 @@ public class LibraryProvider {
return list ;
}
public static void main ( String [ ] args ) {
File here = new File ( "dummy" ) . getParentFile ( ) ;
public static void main ( final String [ ] args ) {
final File here = new File ( "dummy" ) . getParentFile ( ) ;
initialize ( new File ( here , "DATA/DICTIONARIES" ) ) ;
System . out . println ( "dymDict-size = " + dymLib . size ( ) ) ;
Set < String > r = dymLib . recommend ( "da" ) ;
for ( String s : r ) {
final Set < String > r = dymLib . recommend ( "da" ) ;
for ( final String s : r ) {
System . out . println ( "$ " + s ) ;
}
System . out . println ( "recommendations: " + r . size ( ) ) ;