From b65e2b527dce50cab80ea63ff463306d3bed3ba8 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 7 Feb 2016 01:52:32 +0100 Subject: [PATCH] include use of condenser's content text for language detection. Language identification may show poor performance on documents with short or no title but clear lang indication in text content. Using content text too improves lang detection. + remove double caching of text in Identificator --- source/net/yacy/document/Condenser.java | 1 + source/net/yacy/document/language/Identificator.java | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index c59de2b7d..35b55c9e7 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -195,6 +195,7 @@ public final class Condenser extends Tokenizer { } String text = document.getTextString(); + this.languageIdentificator.add(text); // use content text for language detection (before we added already title etc. for best identification content text is valuable) // create hashes for duplicate detection // check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b diff --git a/source/net/yacy/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java index eff63dc34..a4d7ed759 100644 --- a/source/net/yacy/document/language/Identificator.java +++ b/source/net/yacy/document/language/Identificator.java @@ -38,7 +38,6 @@ import net.yacy.cora.util.ConcurrentLog; */ public final class Identificator { - private StringBuilder text; private Detector detector; private Language language; @@ -46,7 +45,6 @@ public final class Identificator { try { if(DetectorFactory.getLangList().isEmpty()) DetectorFactory.loadProfile(new File("langdetect").toString()); this.detector = DetectorFactory.create(); - this.text = new StringBuilder(); } catch (LangDetectException e) { ConcurrentLog.logException(e); } @@ -54,11 +52,10 @@ public final class Identificator { public void add(final String word) { if (word == null) return; - this.text.append(" " + word); + this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars } public String getLanguage() { - this.detector.append(this.text.toString()); try { ArrayList probabilities = this.detector.getProbabilities(); if(probabilities.isEmpty()) return null;