From f667b9c2891bf295bf6e8a31d9f05eae9cbebfc8 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 19 Jun 2011 13:31:10 +0000 Subject: [PATCH] enhanced identificator: using AtomicInteger for counter git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7785 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../yacy/document/language/Identificator.java | 83 ++++++++++--------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/source/net/yacy/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java index 3ac9cfd63..62051a430 100644 --- a/source/net/yacy/document/language/Identificator.java +++ b/source/net/yacy/document/language/Identificator.java @@ -29,6 +29,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Vector; +import java.util.concurrent.atomic.AtomicInteger; /** * This class can try to identify the language a text is written in. @@ -36,15 +37,15 @@ import java.util.Vector; public final class Identificator { private static final LanguageStatisticsHolder languages = LanguageStatisticsHolder.getInstance(); - - private final Map letter; + + private final Map letter; private int letters; private String language; - + public Identificator() { - letter = new HashMap(); - letters = 0; - language = null; + this.letter = new HashMap(); + this.letters = 0; + this.language = null; } /** @@ -58,7 +59,7 @@ public final class Identificator { // only test the first 100000 characters of a text return getLanguage(text, 100000); } - + /** * This method tries to return the language a text is written in. The method will * use the number characters defined in the parameter limit. @@ -67,82 +68,82 @@ public final class Identificator { * @return the language or null if the method was not able to find out the language */ public static String getLanguage(final String text, final int limit) { - + int upperLimit = text.length(); if (upperLimit > limit) { upperLimit = limit; } - Identificator id = new Identificator(); - + final Identificator id = new Identificator(); + // count number of characters in text for (int i = 0; i < upperLimit; i++) id.inc(text.charAt(i)); - + return id.getLanguage(); } - + public void inc(final char c) { if (!Character.isLetter(c)) return; - Character cc = Character.toLowerCase(c); - Integer i = letter.get(cc); + final Character cc = Character.toLowerCase(c); + final AtomicInteger i = this.letter.get(cc); if (i == null) { - letter.put(cc, 1); + this.letter.put(cc, new AtomicInteger(1)); } else { - letter.put(cc, i.intValue() + 1); + i.incrementAndGet(); } - letters++; + this.letters++; } - - public void add(String word) { + + public void add(final String word) { if (word == null) return; for (int i = 0; i < word.length(); i++) inc(word.charAt(i)); } - + public String getLanguage() { - - if (language != null) return language; // don't compute that twice - if (letters == 0) return null; // not enough information available - + + if (this.language != null) return this.language; // don't compute that twice + if (this.letters == 0) return null; // not enough information available + final LanguageStatistics testStat = new LanguageStatistics("text"); - + // calculate percentage Character character; Character maxChar = null; int count = 0; int max = 0; - for (Map.Entry e: letter.entrySet()) { + for (final Map.Entry e: this.letter.entrySet()) { character = e.getKey(); count = e.getValue().intValue(); if (count > max) { maxChar = character; max = count; } - testStat.put(character, ((float) 100) * ((float) count) / ((float) letters)); + testStat.put(character, ((float) 100) * ((float) count) / (this.letters)); } - + // create list with relevant languages final List relevantLanguages = new Vector (); for (int i = 0; i < languages.size(); i++) { - + // only languages that contain the most common character in the text will be tested if (languages.get(i).contains(maxChar)) { relevantLanguages.add(i); } } - + if (relevantLanguages.isEmpty()) return null; - + // compare characters in text with characters in statistics final float[] offsetList = new float[relevantLanguages.size()]; final int[] votesList = new int[relevantLanguages.size()]; - Iterator iter = testStat.keySet().iterator(); + final Iterator iter = testStat.keySet().iterator(); float minimum; float offset = 0; float valueCharacter; int bestLanguage = -1; float value; - + while (iter.hasNext()) { minimum = 100.1f; character = iter.next(); @@ -158,7 +159,7 @@ public final class Identificator { } votesList[bestLanguage] = ++votesList[bestLanguage]; } - + // Now we can count how many votes each language got and how far it was away from the stats. // If 2 languages have the same amount of votes, the one with the smaller offset wins. int maxVotes = 0; @@ -170,17 +171,17 @@ public final class Identificator { bestLanguage = i; } } - - // Only return name of language of offset is smaller than 20%. This + + // Only return name of language of offset is smaller than 20%. This // prevents a language beeing reported that has won the voting, but // is still not the right language. if (offset < 20) { - language = languages.get(relevantLanguages.get(bestLanguage)).getName(); - return language; + this.language = languages.get(relevantLanguages.get(bestLanguage)).getName(); + return this.language; } - + return null; - + } - + }