diff --git a/langstats/da.lng b/langstats/da.lng new file mode 100644 index 000000000..538808b02 --- /dev/null +++ b/langstats/da.lng @@ -0,0 +1,29 @@ +y 0.864537466956212 +g 3.84157778337673 +e 14.5280687230801 +w 0.407923310417641 +ø 0.66857430803009 +a 6.5529680420081 +u 1.99592476614396 +k 3.31867608889529 +é 0.158670170008038 +s 5.99487296388279 +c 0.872935888053046 +å 0.720164609053498 +i 6.71533751654689 +m 3.18580106939895 +o 5.08944318468128 +d 5.55245613824601 +b 1.93183681467884 +z 0.0918827260475031 +æ 0.655576751570705 +j 0.586489739928893 +t 6.78602422744524 +v 2.02042016100973 +r 8.68806664347103 +x 0.0790851319951849 +h 1.88434574300032 +f 2.39105048250929 +n 7.4213047946986 +l 5.00085983835039 +p 1.83535495326879 diff --git a/langstats/de.lng b/langstats/de.lng new file mode 100644 index 000000000..e55068456 --- /dev/null +++ b/langstats/de.lng @@ -0,0 +1,29 @@ +e 15.0454339050204 +a 6.37621568754647 +u 3.72113911607228 +k 1.72703729023308 +ö 0.26077714183063 +o 3.51445352069101 +d 4.46714337752806 +z 1.09540371555032 +j 0.327643075633355 +t 6.21244404968039 +n 9.08069321011372 +p 1.28761832525786 +y 0.346704856762192 +g 2.85268037584643 +w 1.39650001746498 +ü 0.468760135927465 +ß 0.128941472347942 +s 6.2645395981058 +c 2.95936647022719 +i 8.17341230832182 +m 2.69958732740855 +b 2.41585620830235 +ä 0.436225368136886 +v 1.0019910080289 +r 7.73519094216097 +x 0.122155079066472 +h 4.05966038093622 +f 1.59789621808274 +l 3.98660685325922 diff --git a/langstats/en.lng b/langstats/en.lng new file mode 100644 index 000000000..005ad7f91 --- /dev/null +++ b/langstats/en.lng @@ -0,0 +1,26 @@ +a 8.69161966252553 +k 0.907674180694932 +z 0.168080439775654 +j 0.285209319358806 +p 2.35362373068916 +y 1.58228477941557 +w 1.59124110835566 +c 3.59944908625542 +m 2.70381619224661 +b 1.65463201429828 +v 1.03186860866415 +x 0.255951978154519 +l 4.34700400845477 +e 11.6362615884945 +u 2.72252496825479 +o 7.16964083130655 +d 3.78454655101724 +t 8.19514049494664 +n 
7.31274306481596 +g 2.03308666940001 +s 6.65176598903745 +i 7.96287303110036 +q 0.120810925925189 +r 6.70550396267798 +h 4.22022219656953 +f 2.00084388521569 diff --git a/langstats/fr.lng b/langstats/fr.lng new file mode 100644 index 000000000..020e5e961 --- /dev/null +++ b/langstats/fr.lng @@ -0,0 +1,31 @@ +e 13.3154384090329 +a 8.04623708216382 +u 4.83551687292677 +k 0.470564078846645 +é 2.27666895253891 +o 5.83355926256698 +d 4.04459843072212 +ê 0.0811351747894871 +z 0.207721995406992 +j 0.359825529471804 +t 6.45612720081653 +n 7.08926065322787 +p 2.86285563919367 +y 0.532362369226844 +g 1.49830951773412 +w 0.338694501148252 +ç 0.0542230160755295 +s 7.23199476907374 +c 3.58390214340393 +i 7.9445689270222 +m 3.09719076932891 +q 0.619478023730544 +b 1.31321367057923 +v 1.23068305052309 +r 7.16012933784129 +x 0.411855702985455 +è 0.318460544781832 +h 1.36913115590712 +f 1.32058959556009 +l 5.38871124649145 +à 0.286464978310794 diff --git a/langstats/nl.lng b/langstats/nl.lng new file mode 100644 index 000000000..7ab70bae5 --- /dev/null +++ b/langstats/nl.lng @@ -0,0 +1,28 @@ +e 16.3410628627746 +a 8.06098061970175 +u 2.16762673559572 +k 2.33964949737568 +é 0.0744665645125339 +o 6.12804850622071 +d 4.82913171985778 +z 0.837324041505364 +j 1.12649420410236 +t 6.31816314474801 +n 9.22705705138393 +p 1.86976047754558 +y 0.38192851409717 +g 2.88130629347423 +ë 0.125943451390326 +w 1.46584183701518 +s 4.90249877804866 +c 2.04528166853083 +i 7.05403273930006 +m 2.55985062706845 +q 0.0503773805561303 +b 1.95512215015458 +v 2.33355223504646 +r 6.80954251541058 +x 0.13773815556815 +h 2.40212144747009 +f 1.00874707256442 +l 4.1423400892999 diff --git a/source/de/anomic/language/identification/Identificator.java b/source/de/anomic/language/identification/Identificator.java new file mode 100644 index 000000000..6ff530085 --- /dev/null +++ b/source/de/anomic/language/identification/Identificator.java @@ -0,0 +1,218 @@ +// Identificator.java +// ----------------------- +// (C) by 
Marc Nause; marc.nause@audioattack.de +// first published on http://www.yacy.net +// Braunschweig, Germany, 2008 +// +// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $ +// $LastChangedRevision: 4824 $ +// $LastChangedBy: low012 $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. 
+// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +package de.anomic.language.identification; + +import java.io.File; +import java.io.FilenameFilter; +import java.util.Iterator; +import java.util.List; +import java.util.Vector; + +/** + * This class can try to identify the language a text is written in. + * It has been implemented as a Singleton since it has to access several + * files on instanciation which should be avoided since it is very slow. + */ +public final class Identificator { + + private List languages = new Vector(); + private String languageDir = "langstats"; // directory that contains language files + + private static Identificator instance; + + private Identificator() { + addAllLanguagesInDirectory(languageDir); + } + + /** + * method to get an instance of this class, should be used insted of new Identificator() + * @return an instance of the class Identificator + */ + public synchronized static Identificator getInstance() { + if (instance == null) { + instance = new Identificator(); + } + return instance; + } + + /** + * This method tries to return the language a text is written in. The method will only + * use the first 100000 characters of the text which should be enough. Using more + * characters probably only slows down the process without gaining much accuracy. 
+ * @param text the text that is to be analyzed + * @return the language or "unknown" if the method was not able to find out the language + */ + public String getLanguage(String text) { + // only test the first 100000 characters of a text + return getLanguage(text, 100000); + } + + /** + * This method tries to return the language a text is written in. The method will + * use the number characters defined in the parameter limit. + * @param text the text that is to be analyzed + * @param limit the number of characters that are supposed to be considered + * @return the language or "unknown" if the method was not able to find out the language + */ + public String getLanguage(String text, int limit) { + + String ret = null; + + LanguageStatistics testStat = new LanguageStatistics("text"); + char[] letter = new char[1]; + float letters = 0; + int upperLimit = text.length(); + if (upperLimit > limit) { + upperLimit = limit; + } + + // count number of characters in text + for (int i = 0; i < upperLimit; i++) { + text.getChars(i, i + 1, letter, 0); + // only count if character is a letter + if ((letter[0]+"").matches("\\p{L}")) { + letter[0] = Character.toLowerCase(letter[0]); + testStat.put(letter[0], testStat.get(letter[0]) + 1); + letters++; + } + } + + //calculate percentage + Iterator iter = testStat.keySet().iterator(); + Character character; + Character maxChar = null; + float value = 0; + float max = 0; + while (iter.hasNext()) { + character = (Character) iter.next(); + value = testStat.get(character); + if (value > max) { + maxChar = character; + max = value; + } + testStat.put(character, (value / letters) * 100); + } + + // create list with relevant languages + List relevantLanguages = new Vector (); + for (int i = 0; i < languages.size(); i++) { + + // only languages that contain the most common character in the text will be tested + if (languages.get(i).contains(maxChar)) { + relevantLanguages.add(i); + } + } + + if (relevantLanguages.size() > 0) { + + // 
compare characters in text with characters in statistics + float[] offsetList = new float[relevantLanguages.size()]; + int[] votesList = new int[relevantLanguages.size()]; + + iter = testStat.keySet().iterator(); + float minimum; + float offset = 0; + float valueCharacter; + int bestLanguage = -1; + + while (iter.hasNext()) { + minimum = 100.1f; + character = (Character) iter.next(); + valueCharacter = testStat.get(character); + for (int i = 0; i < relevantLanguages.size(); i++) { + value = languages.get(relevantLanguages.get(i)).get(character); + offset = Math.abs(value - valueCharacter); + offsetList[i] = offsetList[i] + offset; + if (offset < minimum) { + minimum = offset; + bestLanguage = i; + } + } + votesList[bestLanguage] = ++votesList[bestLanguage]; + } + + // Now we can count how many votes each language got and how far it was away from the stats. + // If 2 languages have the same amount of votes, the one with the smaller offset wins. + int maxVotes = 0; + float minOffset = 100.1f; + for (int i = 0; i < votesList.length; i++) { + if ((votesList[i] == maxVotes && offsetList[i] < minOffset) || (votesList[i] > maxVotes)) { + maxVotes = votesList[i]; + minOffset = offsetList[i]; + bestLanguage = i; + } + } + + // Only return name of language of offset is smaller than 20%. This + // prevents a language beeing reported that has won the voting, but + // is still not the right language. + if (offset < 20) { + ret = languages.get(relevantLanguages.get(bestLanguage)).getName(); + } + + } + + return ret; + + } + + /** + * Reads all language files from a directory. 
+ * @param directory the directory that contains the language files + */ + private void addAllLanguagesInDirectory(String directory) { + + File folder = new File(directory); + FilenameFilter filter = new LanguageFilenameFilter(); + File[] allLanguageFiles = folder.listFiles(filter); + + for (int i = 0; i < allLanguageFiles.length; i++) { + if(allLanguageFiles[i].isFile()) { + languages.add(new LanguageStatistics(allLanguageFiles[i])); + } + } + } + + +} diff --git a/source/de/anomic/language/identification/LanguageFilenameFilter.java b/source/de/anomic/language/identification/LanguageFilenameFilter.java new file mode 100644 index 000000000..a8a31e507 --- /dev/null +++ b/source/de/anomic/language/identification/LanguageFilenameFilter.java @@ -0,0 +1,60 @@ +// LanguageFilenameFilter.java +// ----------------------- +// (C) by Marc Nause; marc.nause@audioattack.de +// first published on http://www.yacy.net +// Braunschweig, Germany, 2008 +// +// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $ +// $LastChangedRevision: 4824 $ +// $LastChangedBy: low012 $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
/**
 * Filename filter that only lets language files pass, i.e. files whose
 * name consists of at least one character followed by ".lng".
 */
class LanguageFilenameFilter implements FilenameFilter {

    /** Extension (without the leading dot) a language file has to have. */
    private String fileExtension = "lng";

    /**
     * Tells if a file is a language file, judging by its name only.
     * @param dir the directory in which the file was found (not looked at)
     * @param name the name of the file
     * @return true if the name ends with the language file extension and has
     *         at least one character in front of the dot, else false
     */
    public boolean accept(File dir, String name) {
        final String languageFileName = ".+\\." + fileExtension;
        return name.matches(languageFileName);
    }
}
The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +package de.anomic.language.identification; + +import de.anomic.server.logging.serverLog; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * This class can store statistical data of a language. + */ +public class LanguageStatistics { + + private static serverLog logger = new serverLog("LANGUAGESTATISTICS"); + + /** This variable holds the name of the language. */ + private String langName = null; + + /** This map holds the character statistics of the language. */ + private Map stats = new HashMap(); + + LanguageStatistics(File file) { + loadStatisticsFromFile(file); + } + + /** + * This class provides means to store statistics about how often + * a letter occurs in a text in a language. 
+ * @param name name of the language + */ + LanguageStatistics(final String name) { + this.langName = name; + } + + /** + * This class provides means to store statistics about how often + * a letter occurs in a text in a language. + * @param name name of the language + * @param statistics statistics about occurence of characters + */ + LanguageStatistics(final String name, final Map statistics) { + this.langName = name; + this.stats = statistics; + } + + /** + * This method can be used to add a character and its number + * of average occuences in text in a language in percent. + * @param letter the letter + * @param percent percentage of occurence + */ + public final void put(final char letter, final float percent) { + stats.put(letter, percent); + } + + /** + * Gets the percantage of occurences of a letter in an average + * text in a language in percent. + * @param letter the letter + * @return the percentage + */ + public final float get(final char letter) { + if (stats.containsKey(letter)) { + return stats.get(letter); + } else { + return 0; + } + } + + /** + * This method allows to add the statistics a whole which might + * be more convenient than adding them character by cahracter. 
+ * @param statistics the statistics + */ + public final void setStatistics(final Map statistics) { + this.stats = statistics; + } + + public final boolean loadStatisticsFromFile(File file) { + boolean ret = true; + BufferedReader reader = null; + String line; + String splitLine[] = new String[2]; + try { + reader = new BufferedReader(new FileReader(file)); + while(reader.ready()) { + line = reader.readLine().trim(); + if (line.matches("^\\p{L}\\p{Z}+\\p{N}*\\p{P}{0,1}\\p{N}+$")) { + splitLine = line.split("\\p{Z}+"); + this.put(splitLine[0].charAt(0), Float.parseFloat(splitLine[1])); + } + } + + if (!stats.isEmpty() && langName == null) { + langName = file.getName().toLowerCase(); + langName = langName.substring(0, langName.lastIndexOf(".")); + } + + } catch (FileNotFoundException ex) { + ret = false; + logger.logWarning("ERROR: file '" + file.getName() + "' not found", ex); + } catch (IOException ex) { + logger.logWarning("ERROR: problems reading file '" + file.getName() + "'", ex); + } finally { + try { + reader.close(); + } catch (IOException ex) { + logger.logWarning("ERROR: IO trouble ", ex); + } + } + return ret; + } + + /** + * This method tells if a language contains a character or not + * @param character the character in question + * @return true if language contains character, else false + */ + public boolean contains(Character character) { + if (stats.containsKey(character)) { + return true; + } else { + return false; + } + } + + /** + * This method is needed to crteate an iterator over a language + * @return all characters of the language + */ + public Set keySet() { + return stats.keySet(); + } + + /** + * This method tells the name of the language. + * @return the name of the language + */ + public String getName() { + return langName; + } + +}