diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index c59de2b7d..35b55c9e7 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -195,6 +195,7 @@ public final class Condenser extends Tokenizer { } String text = document.getTextString(); + this.languageIdentificator.add(text); // also feed the content text to the language identificator: title etc. were already added before this point, but the content text is the most valuable input for a reliable identification // create hashes for duplicate detection // check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b diff --git a/source/net/yacy/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java index eff63dc34..a4d7ed759 100644 --- a/source/net/yacy/document/language/Identificator.java +++ b/source/net/yacy/document/language/Identificator.java @@ -38,7 +38,6 @@ import net.yacy.cora.util.ConcurrentLog; */ public final class Identificator { - private StringBuilder text; private Detector detector; private Language language; @@ -46,7 +45,6 @@ public final class Identificator { try { if(DetectorFactory.getLangList().isEmpty()) DetectorFactory.loadProfile(new File("langdetect").toString()); this.detector = DetectorFactory.create(); - this.text = new StringBuilder(); } catch (LangDetectException e) { ConcurrentLog.logException(e); } @@ -54,11 +52,10 @@ public final class Identificator { public void add(final String word) { if (word == null) return; - this.text.append(" " + word); + this.detector.append(" " + word); // append directly to the detector instead of buffering the text here; the detector keeps the text internally, up to its maximum text length (default 10000 chars). NOTE(review): this.detector stays null if the try block that creates it caught a LangDetectException, so this call would then throw a NullPointerException - confirm that is acceptable } public String getLanguage() { - this.detector.append(this.text.toString()); try { ArrayList probabilities = this.detector.getProbabilities(); if(probabilities.isEmpty()) return null;