optimize condenser language detection a little.

langdetect probabilities take letter case into account, so words from the description, anchors etc. are now passed to the language detector as is (without lowercasing). This is also documented in the Javadoc.
9 years ago · b017e97421
parent ae3717d087
commit b017e97421
4 changed files with 17 additions and 9 deletions
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -224,18 +224,19 @@ public final class Condenser extends Tokenizer {
        try {
 	        int pip = 0;
 	        while (wordenum.hasMoreElements()) {
-	            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
-	            if (useForLanguageIdentification) this.languageIdentificator.add(word);
-	            if (word.length() < 2) continue;
+	            word = wordenum.nextElement().toString();
+	            if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive
+                    if (word.length() < 2) continue;
+                    word = word.toLowerCase(Locale.ENGLISH);
 	            wprop = this.words.get(word);
 	            if (wprop == null) wprop = new Word(0, pip, phrase);
 	            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
 	            wprop.flags.set(flagpos, true);
-	            this.words.put(word.toLowerCase(), wprop);
+	            this.words.put(word, wprop);
 	            pip++;
 	            this.RESULT_NUMB_WORDS++;
 	            //this.RESULT_DIFF_WORDS++;
-	        }
+                }
        } finally {
        	wordenum.close();
        	wordenum = null;
--- a/source/net/yacy/document/Tokenizer.java
+++ b/source/net/yacy/document/Tokenizer.java
@ -56,7 +56,7 @@ public class Tokenizer {
    public  static final int flag_cat_hasapp        = 23; // the page refers to (at least one) application file

    //private Properties analysis;
-    protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation
+    protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation (key: words are lowercase)
    private final Set<String> synonyms; // a set of synonyms to the words
    protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
    
@ -167,7 +167,7 @@ public class Tokenizer {
                    // word does not yet exist, create new word entry
                    wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
                    wsp.flags = this.RESULT_FLAGS.clone();
-                    this.words.put(word.toLowerCase(), wsp);
+                    this.words.put(word, wsp);
                }
                // we now have the unique handle of the word, put it into the sentence:
                wordInSentenceCounter++;
@ -214,7 +214,10 @@ public class Tokenizer {
        // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text.
        this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
    }
-    
+
+    /**
+     * @return returns the words as word/indexWord relation map. All words are lowercase.
+     */
    public Map<String, Word> words() {
        // returns the words as word/indexWord relation map
        return this.words;
--- a/source/net/yacy/document/language/Identificator.java
+++ b/source/net/yacy/document/language/Identificator.java
@ -50,6 +50,11 @@ public final class Identificator {
        }
    }

+    /**
+     * Append a word to the text to be analyzed.
+     * Analysis takes letter case into account (this means word should not be upper- or lower cased)
+     * @param word
+     */
    public void add(final String word) {
        if (word == null) return;
        this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
--- a/test/java/net/yacy/document/TokenizerTest.java
+++ b/test/java/net/yacy/document/TokenizerTest.java
@ -54,7 +54,6 @@ public class TokenizerTest {
        VocabularyScraper scraper = null;
        for (String text : testText) {
            Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
-            System.out.println(t.RESULT_NUMB_WORDS);
            assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES);
        }
    }