include use of condenser's content text for language detection.

Language identification may perform poorly on documents with a short or missing
title, even when the text content clearly indicates the language. Using the
content text as well improves language detection.
Also remove the double caching of text in Identificator (the detector already caches appended text internally).
pull/44/head
reger 9 years ago
parent 756c55e6d1
commit b65e2b527d

@ -195,6 +195,7 @@ public final class Condenser extends Tokenizer {
}
String text = document.getTextString();
this.languageIdentificator.add(text); // use content text for language detection (before we added already title etc. for best identification content text is valuable)
// create hashes for duplicate detection
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b

@ -38,7 +38,6 @@ import net.yacy.cora.util.ConcurrentLog;
*/
public final class Identificator {
private StringBuilder text;
private Detector detector;
private Language language;
@ -46,7 +45,6 @@ public final class Identificator {
try {
if(DetectorFactory.getLangList().isEmpty()) DetectorFactory.loadProfile(new File("langdetect").toString());
this.detector = DetectorFactory.create();
this.text = new StringBuilder();
} catch (LangDetectException e) {
ConcurrentLog.logException(e);
}
@ -54,11 +52,10 @@ public final class Identificator {
public void add(final String word) {
if (word == null) return;
this.text.append(" " + word);
this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
}
public String getLanguage() {
this.detector.append(this.text.toString());
try {
ArrayList<Language> probabilities = this.detector.getProbabilities();
if(probabilities.isEmpty()) return null;

Loading…
Cancel
Save