From b017e9742122c23646531934bcfd2b5b07e00da0 Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 6 Oct 2016 19:03:52 +0200 Subject: [PATCH] optimize condenser language detection a little. langdetect probabilities take letter case into account, add words from description and anchors etc. as is. + add it to javadoc --- source/net/yacy/document/Condenser.java | 11 ++++++----- source/net/yacy/document/Tokenizer.java | 9 ++++++--- source/net/yacy/document/language/Identificator.java | 5 +++++ test/java/net/yacy/document/TokenizerTest.java | 1 - 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index b7c741cd6..6c4137cd0 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -224,18 +224,19 @@ public final class Condenser extends Tokenizer { try { int pip = 0; while (wordenum.hasMoreElements()) { - word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); - if (useForLanguageIdentification) this.languageIdentificator.add(word); - if (word.length() < 2) continue; + word = wordenum.nextElement().toString(); + if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive + if (word.length() < 2) continue; + word = word.toLowerCase(Locale.ENGLISH); wprop = this.words.get(word); if (wprop == null) wprop = new Word(0, pip, phrase); if (wprop.flags == null) wprop.flags = flagstemplate.clone(); wprop.flags.set(flagpos, true); - this.words.put(word.toLowerCase(), wprop); + this.words.put(word, wprop); pip++; this.RESULT_NUMB_WORDS++; //this.RESULT_DIFF_WORDS++; - } + } } finally { wordenum.close(); wordenum = null; diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java index f730c8b43..8d0c8ad05 100644 --- a/source/net/yacy/document/Tokenizer.java +++ b/source/net/yacy/document/Tokenizer.java @@ -56,7 +56,7 @@ public class Tokenizer { public static 
final int flag_cat_hasapp = 23; // the page refers to (at least one) application file //private Properties analysis; - protected final Map words; // a string (the words) to (indexWord) - relation + protected final Map words; // a string (the words) to (indexWord) - relation (key: words are lowercase) private final Set synonyms; // a set of synonyms to the words protected final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging @@ -167,7 +167,7 @@ public class Tokenizer { // word does not yet exist, create new word entry wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 ! wsp.flags = this.RESULT_FLAGS.clone(); - this.words.put(word.toLowerCase(), wsp); + this.words.put(word, wsp); } // we now have the unique handle of the word, put it into the sentence: wordInSentenceCounter++; @@ -214,7 +214,10 @@ public class Tokenizer { // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text. this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0); } - + + /** + * @return returns the words as word/indexWord relation map. All words are lowercase. + */ public Map words() { // returns the words as word/indexWord relation map return this.words; diff --git a/source/net/yacy/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java index 98dcb5f34..6528f0182 100644 --- a/source/net/yacy/document/language/Identificator.java +++ b/source/net/yacy/document/language/Identificator.java @@ -50,6 +50,11 @@ public final class Identificator { } } + /** + * Append a word to the text to be analyzed. 
+ * Analysis takes letter case into account (i.e. the word should be passed in its original case, not upper- or lowercased) + * @param word the word to append; null is ignored + */ public void add(final String word) { if (word == null) return; this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars diff --git a/test/java/net/yacy/document/TokenizerTest.java b/test/java/net/yacy/document/TokenizerTest.java index 8f5edd7c9..23e2fbb5f 100644 --- a/test/java/net/yacy/document/TokenizerTest.java +++ b/test/java/net/yacy/document/TokenizerTest.java @@ -54,7 +54,6 @@ public class TokenizerTest { VocabularyScraper scraper = null; for (String text : testText) { Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper); - System.out.println(t.RESULT_NUMB_WORDS); assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES); } }