diff --git a/source/de/anomic/http/httpdProxyCacheEntry.java b/source/de/anomic/http/httpdProxyCacheEntry.java index 8a8edd744..cf5dda4a7 100755 --- a/source/de/anomic/http/httpdProxyCacheEntry.java +++ b/source/de/anomic/http/httpdProxyCacheEntry.java @@ -58,7 +58,6 @@ public class httpdProxyCacheEntry implements indexDocumentMetadata { private byte[] cacheArray; // or the cache as byte-array private final yacyURL url; private final String name; // the name of the link, read as anchor from an -tag - private final String language; private final CrawlProfile.entry profile; private final String initiator; private httpRequestHeader requestHeader; @@ -166,7 +165,6 @@ public class httpdProxyCacheEntry implements indexDocumentMetadata { // in case of proxy usage, the initiator hash is null, // which distinguishes local crawling from proxy indexing this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator); - this.language = yacyURL.language(url); // to be defined later: this.cacheArray = null; @@ -205,7 +203,9 @@ public class httpdProxyCacheEntry implements indexDocumentMetadata { } public String language() { - return this.language; + // please avoid this method if a condenser document is available, because the condenser has a built-in language detection + // this here is only a guess using the TLD + return this.url().language(); } public CrawlProfile.entry profile() { diff --git a/source/de/anomic/language/identification/Identificator.java b/source/de/anomic/language/identification/Identificator.java index 95612efb8..3f5fe217e 100644 --- a/source/de/anomic/language/identification/Identificator.java +++ b/source/de/anomic/language/identification/Identificator.java @@ -24,8 +24,10 @@ package de.anomic.language.identification; +import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Vector; /** @@ -33,12 +35,18 @@ import java.util.Vector; */ public final class Identificator { - 
private final LanguageStatisticsHolder languages; + private static final LanguageStatisticsHolder languages = LanguageStatisticsHolder.getInstance(); + + private HashMap letter; + private int letters; + private String language; public Identificator() { - languages = LanguageStatisticsHolder.getInstance(); + letter = new HashMap(); + letters = 0; + language = null; } - + /** * This method tries to return the language a text is written in. The method will only * use the first 100000 characters of the text which should be enough. Using more @@ -46,7 +54,7 @@ public final class Identificator { * @param text the text that is to be analyzed * @return the language or "unknown" if the method was not able to find out the language */ - public String getLanguage(final String text) { + public static String getLanguage(final String text) { // only test the first 100000 characters of a text return getLanguage(text, 100000); } @@ -56,47 +64,62 @@ public final class Identificator { * use the number characters defined in the parameter limit. 
* @param text the text that is to be analyzed * @param limit the number of characters that are supposed to be considered - * @return the language or "unknown" if the method was not able to find out the language + * @return the language or null if the method was not able to find out the language */ - public String getLanguage(final String text, final int limit) { - - String ret = null; + public static String getLanguage(final String text, final int limit) { - final LanguageStatistics testStat = new LanguageStatistics("text"); - final char[] letter = new char[1]; - float letters = 0; int upperLimit = text.length(); if (upperLimit > limit) { upperLimit = limit; } + Identificator id = new Identificator(); + // count number of characters in text - for (int i = 0; i < upperLimit; i++) { - text.getChars(i, i + 1, letter, 0); - // only count if character is a letter - if ((letter[0]+"").matches("\\p{L}")) { - letter[0] = Character.toLowerCase(letter[0]); - testStat.put(letter[0], testStat.get(letter[0]) + 1); - letters++; - } + for (int i = 0; i < upperLimit; i++) id.inc(text.charAt(i)); + + return id.getLanguage(); + } + + public void inc(final char c) { + if (!Character.isLetter(c)) return; + Character cc = Character.toLowerCase(c); + Integer i = letter.get(cc); + if (i == null) { + letter.put(cc, 1); + } else { + letter.put(cc, i.intValue() + 1); } + letters++; + } + + public void add(String word) { + if (word == null) return; + for (int i = 0; i < word.length(); i++) inc(word.charAt(i)); + } + + public String getLanguage() { + + if (language != null) return language; // don't compute that twice + if (letters == 0) return null; // not enough information available + + final LanguageStatistics testStat = new LanguageStatistics("text"); // calculate percentage - Iterator iter = testStat.keySet().iterator(); Character character; Character maxChar = null; - float value = 0; - float max = 0; - while (iter.hasNext()) { - character = iter.next(); - value = 
testStat.get(character); - if (value > max) { + int count = 0; + int max = 0; + for (Map.Entry e: letter.entrySet()) { + character = e.getKey(); + count = e.getValue().intValue(); + if (count > max) { maxChar = character; - max = value; + max = count; } - testStat.put(character, (value / letters) * 100); + testStat.put(character, ((float) 100) * ((float) count) / ((float) letters)); } - + // create list with relevant languages final List relevantLanguages = new Vector (); for (int i = 0; i < languages.size(); i++) { @@ -107,56 +130,56 @@ public final class Identificator { } } - if (relevantLanguages.size() > 0) { + if (relevantLanguages.size() == 0) return null; - // compare characters in text with characters in statistics - final float[] offsetList = new float[relevantLanguages.size()]; - final int[] votesList = new int[relevantLanguages.size()]; - - iter = testStat.keySet().iterator(); - float minimum; - float offset = 0; - float valueCharacter; - int bestLanguage = -1; + // compare characters in text with characters in statistics + final float[] offsetList = new float[relevantLanguages.size()]; + final int[] votesList = new int[relevantLanguages.size()]; - while (iter.hasNext()) { - minimum = 100.1f; - character = iter.next(); - valueCharacter = testStat.get(character); - for (int i = 0; i < relevantLanguages.size(); i++) { - value = languages.get(relevantLanguages.get(i)).get(character); - offset = Math.abs(value - valueCharacter); - offsetList[i] = offsetList[i] + offset; - if (offset < minimum) { - minimum = offset; - bestLanguage = i; - } - } - votesList[bestLanguage] = ++votesList[bestLanguage]; - } - - // Now we can count how many votes each language got and how far it was away from the stats. - // If 2 languages have the same amount of votes, the one with the smaller offset wins. 
- int maxVotes = 0; - float minOffset = 100.1f; - for (int i = 0; i < votesList.length; i++) { - if ((votesList[i] == maxVotes && offsetList[i] < minOffset) || (votesList[i] > maxVotes)) { - maxVotes = votesList[i]; - minOffset = offsetList[i]; + Iterator iter = testStat.keySet().iterator(); + float minimum; + float offset = 0; + float valueCharacter; + int bestLanguage = -1; + float value; + + while (iter.hasNext()) { + minimum = 100.1f; + character = iter.next(); + valueCharacter = testStat.get(character); + for (int i = 0; i < relevantLanguages.size(); i++) { + value = languages.get(relevantLanguages.get(i)).get(character); + offset = Math.abs(value - valueCharacter); + offsetList[i] = offsetList[i] + offset; + if (offset < minimum) { + minimum = offset; bestLanguage = i; } } - - // Only return name of language of offset is smaller than 20%. This - // prevents a language beeing reported that has won the voting, but - // is still not the right language. - if (offset < 20) { - ret = languages.get(relevantLanguages.get(bestLanguage)).getName(); + votesList[bestLanguage] = ++votesList[bestLanguage]; + } + + // Now we can count how many votes each language got and how far it was away from the stats. + // If 2 languages have the same amount of votes, the one with the smaller offset wins. + int maxVotes = 0; + float minOffset = 100.1f; + for (int i = 0; i < votesList.length; i++) { + if ((votesList[i] == maxVotes && offsetList[i] < minOffset) || (votesList[i] > maxVotes)) { + maxVotes = votesList[i]; + minOffset = offsetList[i]; + bestLanguage = i; } - } - - return ret; + + // Only return name of language if offset is smaller than 20%. This + // prevents a language being reported that has won the voting, but + // is still not the right language. 
+ if (offset < 20) { + language = languages.get(relevantLanguages.get(bestLanguage)).getName(); + return language; + } + + return null; } diff --git a/source/de/anomic/language/identification/LanguageStatistics.java b/source/de/anomic/language/identification/LanguageStatistics.java index 544cd97ea..ec8920ed2 100644 --- a/source/de/anomic/language/identification/LanguageStatistics.java +++ b/source/de/anomic/language/identification/LanguageStatistics.java @@ -89,8 +89,9 @@ public class LanguageStatistics { * @return the percentage */ public final float get(final char letter) { - if (stats.containsKey(letter)) { - return stats.get(letter); + Float f = stats.get(letter); + if (f != null) { + return f.floatValue(); } return 0; } diff --git a/source/de/anomic/language/identification/LanguageStatisticsHolder.java b/source/de/anomic/language/identification/LanguageStatisticsHolder.java index 5d8a0c6c5..7af41bb7c 100644 --- a/source/de/anomic/language/identification/LanguageStatisticsHolder.java +++ b/source/de/anomic/language/identification/LanguageStatisticsHolder.java @@ -28,6 +28,8 @@ import java.io.File; import java.io.FilenameFilter; import java.util.Vector; +import de.anomic.server.logging.serverLog; + /** * This class loads and provides several language statistics to the system. 
* It has been implemented as a Singleton since it has to access several @@ -64,6 +66,10 @@ public class LanguageStatisticsHolder extends Vector { private void addAllLanguagesInDirectory(final String directory) { final File folder = new File(directory); + if (!folder.exists()) { + serverLog.logSevere("LanguageStatistics", "the language statistics folder " + directory + " cannot be found"); + return; + } final FilenameFilter filter = new LanguageFilenameFilter(); final File[] allLanguageFiles = folder.listFiles(filter); diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 73d069182..057560230 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -52,6 +52,7 @@ import de.anomic.index.indexRWIEntry; import de.anomic.index.indexWord; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroMSetTools; +import de.anomic.language.identification.Identificator; import de.anomic.yacy.yacyURL; public final class plasmaCondenser { @@ -110,6 +111,7 @@ public final class plasmaCondenser { public int RESULT_NUMB_SENTENCES = -1; public int RESULT_DIFF_SENTENCES = -1; public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4); + Identificator languageIdentificator; public plasmaCondenser(final plasmaParserDocument document, final boolean indexText, final boolean indexMedia) throws UnsupportedEncodingException { // if addMedia == true, then all the media links are also parsed and added to the words @@ -119,6 +121,8 @@ public final class plasmaCondenser { this.words = new TreeMap(); this.RESULT_FLAGS = new kelondroBitfield(4); + this.languageIdentificator = new Identificator(); + //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia)); insertTextToWords(document.dc_source().toNormalform(false, true), 0, 
indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS); @@ -229,6 +233,7 @@ public final class plasmaCondenser { int pip = 0; while (wordenum.hasMoreElements()) { word = (new String(wordenum.nextElement())).toLowerCase(); + languageIdentificator.add(word); wprop = words.get(word); if (wprop == null) wprop = new indexWord(0, pip, phrase); if (wprop.flags == null) wprop.flags = flagstemplate.clone(); @@ -264,6 +269,10 @@ public final class plasmaCondenser { // returns the words as word/indexWord relation map return words; } + + public String language() { + return this.languageIdentificator.getLanguage(); + } public String intString(final int number, final int length) { String s = Integer.toString(number); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 22df773bf..205e56ad8 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -818,6 +818,13 @@ public final class plasmaWordIndex implements indexRI { final String dc_title = document.dc_title(); final yacyURL referrerURL = entry.referrerURL(); final Date docDate = entry.getModificationDate(); + String language = condenser.language(); + if (language == null) { + System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " FAILED, taking TLD"); + language = entry.url().language(); + } else { + System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " SUCCESS: " + language); + } // create a new loaded URL db entry final long ldate = System.currentTimeMillis(); @@ -836,7 +843,7 @@ public final class plasmaWordIndex implements indexRI { condenser.RESULT_NUMB_WORDS, // word count httpdProxyCacheEntry.docType(document.dc_format()), // doctype condenser.RESULT_FLAGS, // flags - yacyURL.language(entry.url()), // language + language, // language document.inboundLinks(), // inbound links document.outboundLinks(), // outbound links document.getAudiolinks().size(), // laudio @@ 
-856,7 +863,7 @@ public final class plasmaWordIndex implements indexRI { docDate, // document mod date document, // document content condenser, // document condenser - yacyURL.language(entry.url()), // document language + language, // document language httpdProxyCacheEntry.docType(document.dc_format()), // document type document.inboundLinks(), // inbound links document.outboundLinks() // outbound links diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java index 19f1abcf7..a7466994f 100644 --- a/source/de/anomic/yacy/yacyURL.java +++ b/source/de/anomic/yacy/yacyURL.java @@ -847,9 +847,8 @@ public class yacyURL implements Serializable { } // language calculation - public static String language(final yacyURL url) { + public String language() { String language = "uk"; - final String host = url.getHost(); final int pos = host.lastIndexOf("."); if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase(); return language;