diff --git a/htroot/DictionaryLoader_p.html b/htroot/DictionaryLoader_p.html index b4b0f92d6..13a22110b 100644 --- a/htroot/DictionaryLoader_p.html +++ b/htroot/DictionaryLoader_p.html @@ -94,6 +94,48 @@ +
+
+ Suggestions + Suggestion dictionaries will help YaCy to provide better suggestions during the input of search words + +

DeReWo - Korpusbasierte Grund-/Wortformenlisten (German) of 'Institut für Deutsche Sprache'

+

This file provides the 100000 most common German words for suggestions

+ +
+
+
#[drw0URL]#
+
+
#[drw0Storage]#
+
+
#(drw0Status)#
not loaded
::
loaded
::deactivated#(/drw0Status)#
+
Action
+
#(drw0Status)# + :: + + :: + + + #(/drw0Status)#
+ #(drw0ActionLoaded)#:: +
Result
loaded and activated dictionary file
:: +
Result
loading of dictionary file failed: #[error]#
+ #(/drw0ActionLoaded)# + #(drw0ActionRemoved)#:: +
Result
deactivated and removed dictionary file
:: +
Result
cannot remove dictionary file: #[error]#
+ #(/drw0ActionRemoved)# + #(drw0ActionDeactivated)#:: +
Result
deactivated dictionary file
:: +
Result
cannot deactivate dictionary file: #[error]#
+ #(/drw0ActionDeactivated)# + #(drw0ActionActivated)#:: +
Result
activated dictionary file
:: +
Result
cannot activate dictionary file: #[error]#
+ #(/drw0ActionActivated)# +
+
+
#%env/templates/footer.template%# diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java index c830bafdd..b64484979 100644 --- a/htroot/DictionaryLoader_p.java +++ b/htroot/DictionaryLoader_p.java @@ -140,6 +140,50 @@ public class DictionaryLoader_p { prop.put("geo1ActionActivated", 1); } + // DRW0 + if (post.containsKey("drw0Load")) { + // load from the net + try { + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); + final byte[] b = response.getContent(); + FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file()); + LibraryProvider.integrateDeReWo(); + LibraryProvider.initDidYouMean(); + prop.put("drw0Status", LibraryProvider.Dictionary.DRW0.file().exists() ? 1 : 0); + prop.put("drw0ActionLoaded", 1); + } catch (final MalformedURLException e) { + Log.logException(e); + prop.put("drw0ActionLoaded", 2); + prop.put("drw0ActionLoaded_error", e.getMessage()); + } catch (final IOException e) { + Log.logException(e); + prop.put("drw0ActionLoaded", 2); + prop.put("drw0ActionLoaded_error", e.getMessage()); + } + } + + if (post.containsKey("drw0Remove")) { + LibraryProvider.removeDeReWo(); + LibraryProvider.initDidYouMean(); + FileUtils.deletedelete(LibraryProvider.Dictionary.DRW0.file()); + FileUtils.deletedelete(LibraryProvider.Dictionary.DRW0.fileDisabled()); + prop.put("drw0ActionRemoved", 1); + } + + if (post.containsKey("drw0Deactivate")) { + LibraryProvider.removeDeReWo(); + LibraryProvider.initDidYouMean(); + LibraryProvider.Dictionary.DRW0.file().renameTo(LibraryProvider.Dictionary.DRW0.fileDisabled()); + prop.put("drw0ActionDeactivated", 1); + } + + if (post.containsKey("drw0Activate")) { + LibraryProvider.Dictionary.DRW0.fileDisabled().renameTo(LibraryProvider.Dictionary.DRW0.file()); + LibraryProvider.integrateDeReWo(); + LibraryProvider.initDidYouMean(); + prop.put("drw0ActionActivated", 1); + } + // check status 
again for (final LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) { prop.put(dictionary.nickname + "Status", dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0); diff --git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java index 221ba0ba3..e89c00e11 100644 --- a/source/net/yacy/document/LibraryProvider.java +++ b/source/net/yacy/document/LibraryProvider.java @@ -2,19 +2,19 @@ * LibraryProvider.java * Copyright 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany * first published 01.10.2009 on http://yacy.net - * + * * This file is part of YaCy Content Integration * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . 
@@ -25,56 +25,60 @@ package net.yacy.document; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; -import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; +import java.net.MalformedURLException; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.TreeSet; +import java.util.zip.ZipException; +import java.util.zip.ZipFile; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.geolocalization.GeonamesLocalization; import net.yacy.document.geolocalization.OpenGeoDBLocalization; import net.yacy.document.geolocalization.OverarchingLocalization; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.FileUtils; public class LibraryProvider { private static final String path_to_source_dictionaries = "source"; private static final String path_to_did_you_mean_dictionaries = "didyoumean"; - + public static final String disabledExtension = ".disabled"; - + public static WordCache dymLib = new WordCache(null); public static OverarchingLocalization geoLoc = new OverarchingLocalization(); private static File dictSource = null; private static File dictRoot = null; - + public static enum Dictionary { - GEODB0("geo0", - "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz", - "opengeodb-0.2.5a-UTF8-sql.gz"), - GEODB1("geo1", - "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz", - "opengeodb-02621_2010-03-16.sql.gz"), - GEON0("geon0", - "http://download.geonames.org/export/dump/cities1000.zip", - "cities1000.zip"); + GEODB0("geo0", "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz"), + GEODB1("geo1", "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz"), + GEON0("geon0", 
"http://download.geonames.org/export/dump/cities1000.zip"), + DRW0("drw0", "http://www.ids-mannheim.de/kl/derewo/derewo-v-100000t-2009-04-30-0.1.zip"); public String nickname, url, filename; - private Dictionary(String nickname, String url, String filename) { + private Dictionary(final String nickname, final String url) { + try { + this.filename = new MultiProtocolURI(url).getFileName(); + } catch (final MalformedURLException e) { + assert false; + } this.nickname = nickname; this.url = url; - this.filename = filename; } public File file() { - return new File(dictSource, filename); + return new File(dictSource, this.filename); } public File fileDisabled() { - return new File(dictSource, filename + disabledExtension); + return new File(dictSource, this.filename + disabledExtension); } } @@ -83,7 +87,7 @@ public class LibraryProvider { * This assigns default paths, and initializes the dictionary classes * Additionally, if default dictionaries are given in the source path, * they are translated into the input format inside the DATA/DICTIONARIES directory - * + * * @param pathToSource * @param pathToDICTIONARIES */ @@ -91,17 +95,17 @@ public class LibraryProvider { dictSource = new File(rootPath, path_to_source_dictionaries); if (!dictSource.exists()) dictSource.mkdirs(); dictRoot = rootPath; - + // initialize libraries integrateDeReWo(); initDidYouMean(); integrateOpenGeoDB(); integrateGeonames(); } - + public static void integrateOpenGeoDB() { - File geo1 = Dictionary.GEODB1.file(); - File geo0 = Dictionary.GEODB0.file(); + final File geo1 = Dictionary.GEODB1.file(); + final File geo0 = Dictionary.GEODB0.file(); if (geo1.exists()) { if (geo0.exists()) geo0.renameTo(Dictionary.GEODB0.fileDisabled()); geoLoc.addLocalization(Dictionary.GEODB1.nickname, new OpenGeoDBLocalization(geo1, false)); @@ -112,39 +116,45 @@ public class LibraryProvider { return; } } - + public static void integrateGeonames() { - File geon = Dictionary.GEON0.file(); + final File geon = 
Dictionary.GEON0.file(); if (geon.exists()) { geoLoc.addLocalization(Dictionary.GEON0.nickname, new GeonamesLocalization(geon)); return; } } - + public static void initDidYouMean() { final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries); if (!dymDict.exists()) dymDict.mkdirs(); dymLib = new WordCache(dymDict); } - + + public static void removeDeReWo() { + final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries); + final File derewoInput = LibraryProvider.Dictionary.DRW0.file(); + final File derewoOutput = new File(dymDict, derewoInput.getName() + ".words"); + FileUtils.deletedelete(derewoOutput); + } + public static void integrateDeReWo() { // translate input files (once..) final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries); if (!dymDict.exists()) dymDict.mkdirs(); - final File pathToSource = new File(dictRoot, path_to_source_dictionaries); - final File derewoInput = new File(pathToSource, "derewo-v-30000g-2007-12-31-0.1.txt"); - final File derewoOutput = new File(dymDict, "derewo-v-30000g-2007-12-31-0.1.words"); + final File derewoInput = LibraryProvider.Dictionary.DRW0.file(); + final File derewoOutput = new File(dymDict, derewoInput.getName() + ".words"); if (!derewoOutput.exists() && derewoInput.exists()) { // create the translation of the derewo file (which is easy in this case) final ArrayList derewo = loadDeReWo(derewoInput, true); try { writeWords(derewoOutput, derewo); - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); } } } - + /* private static ArrayList loadList(final File file, String comment, boolean toLowerCase) { final ArrayList list = new ArrayList(); @@ -168,44 +178,68 @@ public class LibraryProvider { return list; } */ - + private static Set sortUnique(final List list) { final Set s = new TreeSet(); for (final String t: list) s.add(t); return s; } - + private static void writeWords(final File f, final ArrayList list) throws IOException { final Set s = 
sortUnique(list); final PrintWriter w = new PrintWriter(new BufferedWriter(new FileWriter(f))); for (final String t: s) w.println(t); w.close(); } - + private static ArrayList loadDeReWo(final File file, final boolean toLowerCase) { final ArrayList list = new ArrayList(); + + // get the zip file entry from the file + InputStream derewoTxtEntry; + try { + final ZipFile zip = new ZipFile(file); + /* + final Enumeration i = zip.entries(); + while (i.hasMoreElements()) { + final ZipEntry e = i.nextElement(); + System.out.println("loadDeReWo: " + e.getName()); + } + */ + derewoTxtEntry = zip.getInputStream(zip.getEntry("derewo-v-100000t-2009-04-30-0.1")); + } catch (final ZipException e) { + Log.logException(e); + return list; + } catch (final IOException e) { + Log.logException(e); + return list; + } + BufferedReader reader = null; try { - reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); + reader = new BufferedReader(new InputStreamReader(derewoTxtEntry, "UTF-8")); String line; - + // read until text starts while ((line = reader.readLine()) != null) { - if (line.startsWith("-----")) break; + if (line.startsWith("# -----")) break; } // read empty line line = reader.readLine(); - + // read lines int p; - int c; + //int c; + String w; while ((line = reader.readLine()) != null) { line = line.trim(); - p = line.indexOf("\t"); + p = line.indexOf(" "); if (p > 0) { - c = Integer.parseInt(line.substring(p + 1)); - if (c < 1) continue; - list.add((toLowerCase) ? line.substring(0, p).trim().toLowerCase() : line.substring(0, p).trim()); + //c = Integer.parseInt(line.substring(p + 1)); + //if (c < 1) continue; + w = (toLowerCase) ? 
line.substring(0, p).trim().toLowerCase() : line.substring(0, p).trim(); + if (w.length() < 4) continue; + list.add(w); } } reader.close(); @@ -216,13 +250,13 @@ public class LibraryProvider { } return list; } - - public static void main(String[] args) { - File here = new File("dummy").getParentFile(); + + public static void main(final String[] args) { + final File here = new File("dummy").getParentFile(); initialize(new File(here, "DATA/DICTIONARIES")); System.out.println("dymDict-size = " + dymLib.size()); - Set r = dymLib.recommend("da"); - for (String s: r) { + final Set r = dymLib.recommend("da"); + for (final String s: r) { System.out.println("$ " + s); } System.out.println("recommendations: " + r.size()); diff --git a/source/net/yacy/document/WordCache.java b/source/net/yacy/document/WordCache.java index 2df8982a7..e45a1c1da 100644 --- a/source/net/yacy/document/WordCache.java +++ b/source/net/yacy/document/WordCache.java @@ -7,12 +7,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . 
@@ -45,12 +45,12 @@ import net.yacy.kelondro.util.MemoryControl; * */ public class WordCache { - + // common word cache private static final int commonWordsMaxSize = 100000; // maximum size of common word cache private static final int commonWordsMinLength = 5; // words must have that length at minimum - private OrderedScoreMap commonWords = new OrderedScoreMap(String.CASE_INSENSITIVE_ORDER); - + private static OrderedScoreMap commonWords = new OrderedScoreMap(String.CASE_INSENSITIVE_ORDER); + // dictionaries private final File dictionaryPath; private TreeSet dict; // the word dictionary @@ -67,12 +67,12 @@ public class WordCache { this.dictionaryPath = dictionaryPath; reload(); } - + /** * add a word to the generic dictionary * @param word */ - public void learn(String word) { + public static void learn(final String word) { if (word == null) return; if (word.length() < commonWordsMinLength) return; if (MemoryControl.shortStatus()) commonWords.clear(); @@ -81,24 +81,24 @@ public class WordCache { commonWords.shrinkToMaxSize(commonWordsMaxSize / 2); } } - + /** * scan the input directory and load all dictionaries (again) */ public void reload() { this.dict = new TreeSet(); this.tcid = new TreeSet(); - if (dictionaryPath == null || !dictionaryPath.exists()) return; - final String[] files = dictionaryPath.list(); + if (this.dictionaryPath == null || !this.dictionaryPath.exists()) return; + final String[] files = this.dictionaryPath.list(); for (final String f: files) { if (f.endsWith(".words")) try { - inputStream(new File(dictionaryPath, f)); - } catch (IOException e) { + inputStream(new File(this.dictionaryPath, f)); + } catch (final IOException e) { Log.logException(e); } } } - + private void inputStream(final File file) throws IOException { InputStream is = new FileInputStream(file); if (file.getName().endsWith(".gz")) { @@ -110,22 +110,23 @@ public class WordCache { while ((l = reader.readLine()) != null) { if (l.length() == 0 || l.charAt(0) == '#') continue; l = 
l.trim().toLowerCase(); + if (l.length() < 4) continue; this.dict.add(l); this.tcid.add(reverse(l)); } - } catch (IOException e) { + } catch (final IOException e) { // finish } } - + private static String reverse(final String s) { - StringBuilder sb = new StringBuilder(s.length()); + final StringBuilder sb = new StringBuilder(s.length()); for (int i = s.length() - 1; i >= 0; i--) { sb.append(s.charAt(i)); } return sb.toString(); } - + /** * read the dictionary and construct a set of recommendations to a given string * @param s input value that is used to match recommendations @@ -138,14 +139,14 @@ public class WordCache { for (final String r: t) { if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break; } - SortedMap u = this.commonWords.tailMap(string); + final SortedMap u = commonWords.tailMap(string); String vv; try { for (final Map.Entry v: u.entrySet()) { vv = v.getKey(); if (vv.startsWith(string) && vv.length() > string.length()) ret.add(vv); else break; } - } catch (ConcurrentModificationException e) {} + } catch (final ConcurrentModificationException e) {} string = reverse(string); t = this.tcid.tailSet(string); for (final String r: t) { @@ -153,7 +154,7 @@ public class WordCache { } return ret; } - + /** * check if the library contains the given word * @param s the given word @@ -164,7 +165,7 @@ public class WordCache { // if the above case is true then it is also true for this.tcid and vice versa // that means it does not need to be tested as well } - + /** * check if the library supports the given word * A word is supported, if the library contains a word @@ -185,7 +186,7 @@ public class WordCache { } return false; } - + /** * the size of the dictionay * @return the number of words in the dictionary @@ -193,7 +194,7 @@ public class WordCache { public int size() { return this.dict.size(); } - + /** * a property that is used during the construction of recommendation: @@ -206,5 +207,5 @@ public class WordCache { public boolean 
isRelevant(final int minimumWords) { return this.dict.size() >= minimumWords; } - + }