Optimize Condenser language detection slightly.

langdetect probabilities take letter case into account, so add words from
descriptions, anchors, etc. as-is (without lowercasing them first).
Also document this behavior in the Javadoc.
pull/91/head
reger 8 years ago
parent ae3717d087
commit b017e97421

@ -224,18 +224,19 @@ public final class Condenser extends Tokenizer {
try { try {
int pip = 0; int pip = 0;
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); word = wordenum.nextElement().toString();
if (useForLanguageIdentification) this.languageIdentificator.add(word); if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive
if (word.length() < 2) continue; if (word.length() < 2) continue;
word = word.toLowerCase(Locale.ENGLISH);
wprop = this.words.get(word); wprop = this.words.get(word);
if (wprop == null) wprop = new Word(0, pip, phrase); if (wprop == null) wprop = new Word(0, pip, phrase);
if (wprop.flags == null) wprop.flags = flagstemplate.clone(); if (wprop.flags == null) wprop.flags = flagstemplate.clone();
wprop.flags.set(flagpos, true); wprop.flags.set(flagpos, true);
this.words.put(word.toLowerCase(), wprop); this.words.put(word, wprop);
pip++; pip++;
this.RESULT_NUMB_WORDS++; this.RESULT_NUMB_WORDS++;
//this.RESULT_DIFF_WORDS++; //this.RESULT_DIFF_WORDS++;
} }
} finally { } finally {
wordenum.close(); wordenum.close();
wordenum = null; wordenum = null;

@ -56,7 +56,7 @@ public class Tokenizer {
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
//private Properties analysis; //private Properties analysis;
protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation (key: words are lowercase)
private final Set<String> synonyms; // a set of synonyms to the words private final Set<String> synonyms; // a set of synonyms to the words
protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
@ -167,7 +167,7 @@ public class Tokenizer {
// word does not yet exist, create new word entry // word does not yet exist, create new word entry
wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 ! wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
wsp.flags = this.RESULT_FLAGS.clone(); wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word.toLowerCase(), wsp); this.words.put(word, wsp);
} }
// we now have the unique handle of the word, put it into the sentence: // we now have the unique handle of the word, put it into the sentence:
wordInSentenceCounter++; wordInSentenceCounter++;
@ -214,7 +214,10 @@ public class Tokenizer {
// if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text. // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text.
this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0); this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
} }
/**
* @return returns the words as word/indexWord relation map. All words are lowercase.
*/
public Map<String, Word> words() { public Map<String, Word> words() {
// returns the words as word/indexWord relation map // returns the words as word/indexWord relation map
return this.words; return this.words;

@ -50,6 +50,11 @@ public final class Identificator {
} }
} }
/**
* Append a word to the text to be analyzed.
* Analysis takes letter case into account (this means word should not be upper- or lower cased)
* @param word
*/
public void add(final String word) { public void add(final String word) {
if (word == null) return; if (word == null) return;
this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars

@ -54,7 +54,6 @@ public class TokenizerTest {
VocabularyScraper scraper = null; VocabularyScraper scraper = null;
for (String text : testText) { for (String text : testText) {
Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper); Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
System.out.println(t.RESULT_NUMB_WORDS);
assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES); assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES);
} }
} }

Loading…
Cancel
Save