From b017e9742122c23646531934bcfd2b5b07e00da0 Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Thu, 6 Oct 2016 19:03:52 +0200
Subject: [PATCH] Optimize Condenser language detection a little. langdetect
 probabilities take letter case into account, so add words from description,
 anchors etc. as is (not lowercased). Also document this in the Javadoc.

---
 source/net/yacy/document/Condenser.java              | 11 ++++++-----
 source/net/yacy/document/Tokenizer.java              |  9 ++++++---
 source/net/yacy/document/language/Identificator.java |  5 +++++
 test/java/net/yacy/document/TokenizerTest.java       |  1 -
 4 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index b7c741cd6..6c4137cd0 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -224,18 +224,19 @@ public final class Condenser extends Tokenizer {
         try {
 	        int pip = 0;
 	        while (wordenum.hasMoreElements()) {
-	            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
-	            if (useForLanguageIdentification) this.languageIdentificator.add(word);
-	            if (word.length() < 2) continue;
+	            word = wordenum.nextElement().toString();
+	            if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive
+                    if (word.length() < 2) continue;
+                    word = word.toLowerCase(Locale.ENGLISH);
 	            wprop = this.words.get(word);
 	            if (wprop == null) wprop = new Word(0, pip, phrase);
 	            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
 	            wprop.flags.set(flagpos, true);
-	            this.words.put(word.toLowerCase(), wprop);
+	            this.words.put(word, wprop);
 	            pip++;
 	            this.RESULT_NUMB_WORDS++;
 	            //this.RESULT_DIFF_WORDS++;
-	        }
+                }
         } finally {
         	wordenum.close();
         	wordenum = null;
diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java
index f730c8b43..8d0c8ad05 100644
--- a/source/net/yacy/document/Tokenizer.java
+++ b/source/net/yacy/document/Tokenizer.java
@@ -56,7 +56,7 @@ public class Tokenizer {
     public  static final int flag_cat_hasapp        = 23; // the page refers to (at least one) application file
 
     //private Properties analysis;
-    protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation
+    protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation (key: words are lowercase)
     private final Set<String> synonyms; // a set of synonyms to the words
     protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
     
@@ -167,7 +167,7 @@ public class Tokenizer {
                     // word does not yet exist, create new word entry
                     wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
                     wsp.flags = this.RESULT_FLAGS.clone();
-                    this.words.put(word.toLowerCase(), wsp);
+                    this.words.put(word, wsp);
                 }
                 // we now have the unique handle of the word, put it into the sentence:
                 wordInSentenceCounter++;
@@ -214,7 +214,10 @@ public class Tokenizer {
         // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text.
         this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
     }
-    
+
+    /**
+     * @return the words as a word/indexWord relation map; all keys (words) are lowercase.
+     */
     public Map<String, Word> words() {
         // returns the words as word/indexWord relation map
         return this.words;
diff --git a/source/net/yacy/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java
index 98dcb5f34..6528f0182 100644
--- a/source/net/yacy/document/language/Identificator.java
+++ b/source/net/yacy/document/language/Identificator.java
@@ -50,6 +50,11 @@ public final class Identificator {
         }
     }
 
+    /**
+     * Appends a word to the text to be analyzed.
+     * Analysis takes letter case into account, so the word should be passed as is (not upper- or lowercased).
+     * @param word the word to append, in its original letter case; null is ignored
+     */
     public void add(final String word) {
         if (word == null) return;
         this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
diff --git a/test/java/net/yacy/document/TokenizerTest.java b/test/java/net/yacy/document/TokenizerTest.java
index 8f5edd7c9..23e2fbb5f 100644
--- a/test/java/net/yacy/document/TokenizerTest.java
+++ b/test/java/net/yacy/document/TokenizerTest.java
@@ -54,7 +54,6 @@ public class TokenizerTest {
         VocabularyScraper scraper = null;
         for (String text : testText) {
             Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
-            System.out.println(t.RESULT_NUMB_WORDS);
             assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES);
         }
     }