You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
193 lines
6.7 KiB
193 lines
6.7 KiB
// LanguageStatistics.java
|
|
// -----------------------
|
|
// (C) by Marc Nause; marc.nause@audioattack.de
|
|
// first published on http://www.yacy.net
|
|
// Braunschweig, Germany, 2008
|
|
//
|
|
// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $
|
|
// $LastChangedRevision: 4824 $
|
|
// $LastChangedBy: low012 $
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
//
|
|
// Using this software in any meaning (reading, learning, copying, compiling,
|
|
// running) means that you agree that the Author(s) is (are) not responsible
|
|
// for cost, loss of data or any harm that may be caused directly or indirectly
|
|
// by usage of this softare or this documentation. The usage of this software
|
|
// is on your own risk. The installation and usage (starting/running) of this
|
|
// software may allow other people or application to access your computer and
|
|
// any attached devices and is highly dependent on the configuration of the
|
|
// software which must be done by the user of the software; the author(s) is
|
|
// (are) also not responsible for proper configuration and usage of the
|
|
// software, even if provoked by documentation provided together with
|
|
// the software.
|
|
//
|
|
// Any changes to this file according to the GPL as documented in the file
|
|
// gpl.txt aside this file in the shipment you received can be done to the
|
|
// lines that follows this copyright notice here, but changes must not be
|
|
// done inside the copyright notive above. A re-distribution must contain
|
|
// the intact and unchanged copyright notice.
|
|
// Contributions and changes to the program code must be marked as such.
|
|
|
|
package de.anomic.language.identification;
|
|
|
|
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import de.anomic.server.logging.serverLog;
|
|
|
|
/**
|
|
* This class can store statistical data of a language.
|
|
*/
|
|
public class LanguageStatistics {
|
|
|
|
private static serverLog logger = new serverLog("LANGUAGESTATISTICS");
|
|
|
|
/** This variable holds the name of the language. */
|
|
private String langName = null;
|
|
|
|
/** This map holds the character statistics of the language. */
|
|
private Map<Character, Float> stats = new HashMap<Character, Float>();
|
|
|
|
LanguageStatistics(File file) {
|
|
loadStatisticsFromFile(file);
|
|
}
|
|
|
|
/**
|
|
* This class provides means to store statistics about how often
|
|
* a letter occurs in a text in a language.
|
|
* @param name name of the language
|
|
*/
|
|
LanguageStatistics(final String name) {
|
|
this.langName = name;
|
|
}
|
|
|
|
/**
|
|
* This class provides means to store statistics about how often
|
|
* a letter occurs in a text in a language.
|
|
* @param name name of the language
|
|
* @param statistics statistics about occurence of characters
|
|
*/
|
|
LanguageStatistics(final String name, final Map<Character, Float> statistics) {
|
|
this.langName = name;
|
|
this.stats = statistics;
|
|
}
|
|
|
|
/**
|
|
* This method can be used to add a character and its number
|
|
* of average occuences in text in a language in percent.
|
|
* @param letter the letter
|
|
* @param percent percentage of occurence
|
|
*/
|
|
public final void put(final char letter, final float percent) {
|
|
stats.put(letter, percent);
|
|
}
|
|
|
|
/**
|
|
* Gets the percantage of occurences of a letter in an average
|
|
* text in a language in percent.
|
|
* @param letter the letter
|
|
* @return the percentage
|
|
*/
|
|
public final float get(final char letter) {
|
|
if (stats.containsKey(letter)) {
|
|
return stats.get(letter);
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* This method allows to add the statistics a whole which might
|
|
* be more convenient than adding them character by cahracter.
|
|
* @param statistics the statistics
|
|
*/
|
|
public final void setStatistics(final Map<Character, Float> statistics) {
|
|
this.stats = statistics;
|
|
}
|
|
|
|
public final boolean loadStatisticsFromFile(File file) {
|
|
boolean ret = true;
|
|
BufferedReader reader = null;
|
|
String line;
|
|
String splitLine[] = new String[2];
|
|
try {
|
|
reader = new BufferedReader(new FileReader(file));
|
|
while(reader.ready()) {
|
|
line = reader.readLine().trim();
|
|
if (line.matches("^\\p{L}\\p{Z}+\\p{N}*\\p{P}{0,1}\\p{N}+$")) {
|
|
splitLine = line.split("\\p{Z}+");
|
|
this.put(splitLine[0].charAt(0), Float.parseFloat(splitLine[1]));
|
|
}
|
|
}
|
|
|
|
if (!stats.isEmpty() && langName == null) {
|
|
langName = file.getName().toLowerCase();
|
|
langName = langName.substring(0, langName.lastIndexOf("."));
|
|
}
|
|
|
|
} catch (FileNotFoundException ex) {
|
|
ret = false;
|
|
logger.logWarning("ERROR: file '" + file.getName() + "' not found", ex);
|
|
} catch (IOException ex) {
|
|
logger.logWarning("ERROR: problems reading file '" + file.getName() + "'", ex);
|
|
} finally {
|
|
try { if(reader != null) {
|
|
reader.close();
|
|
}
|
|
} catch (IOException ex) {
|
|
logger.logWarning("ERROR: IO trouble ", ex);
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* This method tells if a language contains a character or not
|
|
* @param character the character in question
|
|
* @return true if language contains character, else false
|
|
*/
|
|
public boolean contains(Character character) {
|
|
if (stats.containsKey(character)) {
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* This method is needed to crteate an iterator over a language
|
|
* @return all characters of the language
|
|
*/
|
|
public Set<Character> keySet() {
|
|
return stats.keySet();
|
|
}
|
|
|
|
/**
|
|
* This method tells the name of the language.
|
|
* @return the name of the language
|
|
*/
|
|
public String getName() {
|
|
return langName;
|
|
}
|
|
|
|
}
|