override detected language (statistic langdetect) only with TLD determined

language if langdetect probability is not high. Additionally, truncate zh-cn / zh-tw returned by langdetect to the 2-char ISO 639-1 code zh used by YaCy
9 years ago · 6f0b073bf3
parent b65e2b527d
commit 6f0b073bf3
3 changed files with 46 additions and 17 deletions
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -266,6 +266,14 @@ public final class Condenser extends Tokenizer {
        return this.languageIdentificator.getLanguage();
    }

+    /**
+     * Get the probability of the detected language (the one returned by {@link #language()}), as reported by the language identificator.
+     * @return the value of {@code languageIdentificator.getProbability()}, documented as 0.0 to 1.0
+     */
+    public double languageProbability() {
+        return this.languageIdentificator.getProbability();
+    }
+
    public static void main(final String[] args) {
        // read a property file and convert them into configuration lines
        try {
--- a/source/net/yacy/document/language/Identificator.java
+++ b/source/net/yacy/document/language/Identificator.java
@ -55,6 +55,12 @@ public final class Identificator {
        this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
    }

+    /**
+     * Get the detected language with highest probability
+     * if detection probability is above 0.3 (30%)
+     * The underlying detector differentiates zh-cn and zh-tw; these are returned as zh here.
+     * @return 2 char language code (ISO 639-1)
+     */
    public String getLanguage() {
        try {
            ArrayList<Language> probabilities = this.detector.getProbabilities();
@ -67,11 +73,25 @@ public final class Identificator {
        }
        // Return language only if probability is higher than 30% to account for missing language profiles
        if (this.language.prob > 0.3) {
-            return this.language.lang;
+            if (this.language.lang.length() == 2)
+                return this.language.lang;
+            else
+                return this.language.lang.substring(0,2);
        }

        return null;

    }

+    /**
+     * Get the probability stored for the detected language (see {@link #getLanguage()}); it is returned as-is, without the 0.3 threshold applied there.
+     * @return the {@code prob} value of the detected language (0.0 to 1.0), or 0.0 if the {@code language} field is null
+     */
+    public double getProbability() {
+        if (language != null) {
+            return language.prob;
+        } else
+            return 0.0;
+    }
+
 }
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@ -502,17 +502,19 @@ public class Segment {
            language = (bymetadata == null) ? url.language() : bymetadata;
        } else {
            if (bymetadata == null) {
-                // two possible results: compare and report conflicts
-                if (!language.equals(url.language())) {
-                    // see if we have a hint in the url that the statistic was right
-                    final String u = urlNormalform.toLowerCase();
-                    String ISO639_country = ISO639.country(language);
-                    if (u.contains("/" + language + "/") ||
-                        (ISO639_country != null && u.contains("/" + ISO639.country(language).toLowerCase() + "/"))) {
-                        // this is a strong hint that the statistics was in fact correct
-                    } else {
-                        // no confirmation using the url, use the TLD
-                        language = url.language();
+                if (condenser.languageProbability() < 0.9) { // if probability of statistic is not very high, examine url
+                    // two possible results: compare and report conflicts
+                    if (!language.equals(url.language())) {
+                        // see if we have a hint in the url that the statistic was right
+                        final String u = urlNormalform.toLowerCase();
+                        String ISO639_country = ISO639.country(language);
+                        if (u.contains("/" + language + "/") ||
+                            (ISO639_country != null && u.contains("/" + ISO639.country(language).toLowerCase() + "/"))) {
+                            // this is a strong hint that the statistics was in fact correct
+                        } else {
+                            // no confirmation using the url, use the TLD
+                            language = url.language();
+                        }
                    }
                }
            } else {
@ -682,13 +684,12 @@ public class Segment {
        final long storageEndTime = System.currentTimeMillis();

        // STORE PAGE INDEX INTO WORD INDEX DB
-        int outlinksSame = document.inboundLinks().size();
-        int outlinksOther = document.outboundLinks().size();
-        final int urlLength = urlNormalform.length();
-        final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
-
        // create a word prototype which is re-used for all entries
        if ((this.termIndex != null && storeToRWI) || searchEvent != null) {
+            final int outlinksSame = document.inboundLinks().size();
+            final int outlinksOther = document.outboundLinks().size();
+            final int urlLength = urlNormalform.length();
+            final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
            final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val
            final WordReferenceRow ientry = new WordReferenceRow(
                            url.hash(),