Include the Condenser's content text in language detection.

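Language identification may perform poorly on documents that have a short or missing title, even when the text content clearly indicates the language. Feeding the content text to the detector as well improves language detection. This commit also removes the double caching of text in Identificator: text is now appended directly to the detector, which caches it internally.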
9 years ago · b65e2b527d
parent 756c55e6d1
commit b65e2b527d
2 changed files with 2 additions and 4 deletions
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -195,6 +195,7 @@ public final class Condenser extends Tokenizer {
        }
        String text = document.getTextString();
+        this.languageIdentificator.add(text); // use content text for language detection (before we added already title etc. for best identification content text is valuable)
        // create hashes for duplicate detection
        // check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
--- a/source/net/yacy/document/language/Identificator.java
+++ b/source/net/yacy/document/language/Identificator.java
@@ -38,7 +38,6 @@ import net.yacy.cora.util.ConcurrentLog;
 */
 public final class Identificator {
-    private StringBuilder text;
    private Detector detector;
    private Language language;
@@ -46,7 +45,6 @@ public final class Identificator {
        try {
            if(DetectorFactory.getLangList().isEmpty()) DetectorFactory.loadProfile(new File("langdetect").toString());
            this.detector = DetectorFactory.create();
-            this.text = new StringBuilder();
        } catch (LangDetectException e) {
            ConcurrentLog.logException(e);
        }
@@ -54,11 +52,10 @@ public final class Identificator {
    public void add(final String word) {
        if (word == null) return;
-        this.text.append(" " + word);
+        this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
    }
    public String getLanguage() {
-        this.detector.append(this.text.toString());
        try {
            ArrayList<Language> probabilities = this.detector.getProbabilities();
            if(probabilities.isEmpty()) return null;