- added language detection using metadata from documents: html and odt documents provide this information

- metadata and results from statistical analysis are compared and the result is printed out as debug lines
- added ranking profile for wanted language
- added class with ISO 639 table, a list of all valid language codes that will be used for the language identification

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5187 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · bfcf9b7aa3
parent 3768a1bd32
commit bfcf9b7aa3
24 changed files with 308 additions and 25 deletions
--- a/htroot/Ranking_p.java
+++ b/htroot/Ranking_p.java
@ -75,6 +75,7 @@ public class Ranking_p {
 		rankingParameters.put(plasmaSearchRankingProfile.WORDSINTEXT, "Words In Text");
 		rankingParameters.put(plasmaSearchRankingProfile.WORDSINTITLE, "Words In Title");
 		rankingParameters.put(plasmaSearchRankingProfile.YBR, "YaCy Block Rank");
 		rankingParameters.put(plasmaSearchRankingProfile.LANGUAGE, "Preferred Language");
 	}
    private static serverObjects defaultValues() {
--- a/htroot/xml/util/getpageinfo_p.java
+++ b/htroot/xml/util/getpageinfo_p.java
@ -29,6 +29,7 @@ package xml.util;
 import java.io.IOException;
 import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.Set;
 import de.anomic.crawler.HTTPLoader;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -103,8 +104,9 @@ public class getpageinfo_p {
                    prop.put("tags", count);
                    // put description                    
                    prop.putHTML("desc", scraper.getDescription(), true);
-                    // put language 
+                    // put language
-                    prop.putHTML("lang", scraper.getContentLanguages()[0], true);
+                    Set<String> languages = scraper.getContentLanguages();
                    prop.putHTML("lang", (languages == null) ? "unknown" : languages.iterator().next(), true);
                } catch (final MalformedURLException e) { /* ignore this */
                } catch (final IOException e) { /* ignore this */
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@ -50,6 +50,7 @@ import de.anomic.http.httpRequestHeader;
 import de.anomic.server.serverCharBuffer;
 import de.anomic.server.serverFileUtils;
 import de.anomic.yacy.yacyURL;
 import de.anomic.tools.iso639;
 public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
@ -381,11 +382,21 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        return s;
    }
-    public String[] getContentLanguages() {
+    public HashSet<String> getContentLanguages() {
        String s = metas.get("content-language");
        if (s == null) s = metas.get("dc.language");
-        if (s == null) s = "";
+        if (s == null) return null;
-        return s.split(" |,");
+        HashSet<String> hs = new HashSet<String>();
        String[] cl = s.split(" |,");
        int p;
        for (int i = 0; i < cl.length; i++) {
            cl[i] = cl[i].toLowerCase();
            p = cl[i].indexOf('-');
            if (p > 0) cl[i] = cl[i].substring(0, p);
            if (iso639.exists(cl[i])) hs.add(cl[i]);
        }
        if (hs.size() == 0) return null;
        return hs;
    }
    public String[] getKeywords() {
--- a/source/de/anomic/index/indexContainerHeap.java
+++ b/source/de/anomic/index/indexContainerHeap.java
@ -118,6 +118,7 @@ public final class indexContainerHeap {
        int urlCount = 0;
        synchronized (cache) {
            for (final indexContainer container : new heapFileEntries(heapFile, this.payloadrow)) {
                // TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low?
                if (container == null) break;
                cache.put(container.getWordHash(), container);
                urlCount += container.size();
@ -252,6 +253,10 @@ public final class indexContainerHeap {
            }
        }
        /**
         * return an index container
         * because they may get very large, it is wise to deallocate some memory before calling next()
         */
        public indexContainer next() {
            final indexContainer n = this.nextContainer;
            this.nextContainer = next0();
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@ -70,6 +70,7 @@ public class docParser extends AbstractParser implements Parser {
                      mimeType,
                      "UTF-8",
                      null,
                      null,
                      ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
                          replaceAll("\r\n"," ").
                          replaceAll("\n"," ").
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@ -32,7 +32,9 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.nio.charset.Charset;
 import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Set;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
@ -89,6 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
            String docShortTitle  = null;
            String docLongTitle   = null;
            String docAuthor      = null;
            String docLanguage    = null;
            // opening the file as zip file
            final ZipFile zipFile= new ZipFile(dest);
@ -134,9 +137,14 @@ public class odtParser extends AbstractParser implements Parser {
                    docShortTitle  = metaData.getTitle();
                    docLongTitle   = metaData.getSubject();
                    docAuthor      = metaData.getCreator();
                    docLanguage    = metaData.getLanguage();
                }
            }
            // make the languages set
            Set<String> languages = new HashSet<String>(1);
            if (docLanguage != null) languages.add(docLanguage);
            // if there is no title available we generate one
            if (docLongTitle == null) {
                if (docShortTitle != null) {
@ -156,6 +164,7 @@ public class odtParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        "UTF-8",
                        languages,
                        docKeywords,
                        docLongTitle,
                        docAuthor,
@ -169,6 +178,7 @@ public class odtParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        "UTF-8",
                        languages,
                        docKeywords,
                        docLongTitle,
                        docAuthor,
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@ -143,6 +143,7 @@ public class pdfParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        "UTF-8",
                        null,
                        docKeywords,
                        (docTitle == null) ? docSubject : docTitle,
                        docAuthor,
@ -156,6 +157,7 @@ public class pdfParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        "UTF-8",
                        null,
                        docKeywords,
                        (docTitle == null) ? docSubject : docTitle,
                        docAuthor,
--- a/source/de/anomic/plasma/parser/ppt/pptParser.java
+++ b/source/de/anomic/plasma/parser/ppt/pptParser.java
@ -88,6 +88,7 @@ public class pptParser extends AbstractParser implements Parser {
                    mimeType,
                    "UTF-8",
                    null,
                    null,
                    ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
                    replaceAll("\r\n"," ").
                    replaceAll("\n"," ").
--- a/source/de/anomic/plasma/parser/ps/psParser.java
+++ b/source/de/anomic/plasma/parser/ps/psParser.java
@ -117,6 +117,7 @@ public class psParser extends AbstractParser implements Parser {
                    "UTF-8",
                    null,
                    null,
                    null,
                    "",
                    null,
                    null,
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@ -134,6 +134,7 @@ public class rpmParser extends AbstractParser implements Parser {
                    mimeType,
                    "UTF-8",
                    null,
                    null,
                    summary,
                    packager,
                    null,
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@ -158,6 +158,7 @@ public class rssParser extends AbstractParser implements Parser {
                    mimeType,
                    "UTF-8",
                    null,
                    null,
                    feedTitle,
                    (authors.length() > 0)?authors.toString(1,authors.length()):"",
                    feedSections.toArray(new String[feedSections.size()]),
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@ -75,6 +75,7 @@ public class rtfParser extends AbstractParser implements Parser {
                    mimeType,
                    "UTF-8",
                    null,
                    null,
                    ((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
                        replaceAll("\r\n"," ").
                        replaceAll("\n"," ").
--- a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
+++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
@ -63,7 +63,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
    public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset,
            final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
-        final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset);
+        final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset, null);
        Handler archive;
        super.theLogger.logFine("opening 7zip archive...");
        try {
--- a/source/de/anomic/plasma/parser/swf/swfParser.java
+++ b/source/de/anomic/plasma/parser/swf/swfParser.java
@ -107,6 +107,7 @@ public class swfParser extends AbstractParser implements Parser {
                    location,     // url of the source document
                    mimeType,     // the documents mime type
                    "UTF-8",      // charset of the document text
                    null,
                    null,          //keywords
                      ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
                          replaceAll("\r\n"," ").
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@ -188,6 +188,7 @@ public class tarParser extends AbstractParser implements Parser {
                    location,
                    mimeType,
                    null,
                    null,
                    docKeywords.toString().split(" |,"),
                    docLongTitle.toString(),
                    "", // TODO: AUTHOR
@ -201,6 +202,7 @@ public class tarParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        null,
                        null,
                        docKeywords.toString().split(" |,"),
                        docLongTitle.toString(),
                        "", // TODO: AUTHOR
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@ -78,7 +78,7 @@ public class vcfParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
-    public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
        try {
            final StringBuffer parsedTitle = new StringBuffer();
@ -213,7 +213,7 @@ public class vcfParser extends AbstractParser implements Parser {
                } else {
                    if (theLogger.isFinest()) this.theLogger.logFinest("Invalid data in vcf file" +
-                                             "\n\tURL: " + location +
+                                             "\n\tURL: " + url +
                                             "\n\tLine: " + line + 
                                             "\n\tLine-Nr: " + lineNr);
                }
@ -222,10 +222,11 @@ public class vcfParser extends AbstractParser implements Parser {
            final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
            final byte[] text = parsedDataText.toString().getBytes();
            final plasmaParserDocument theDoc = new plasmaParserDocument(
-                    location,                   // url of the source document
+                    url,                   // url of the source document
                    mimeType,                   // the documents mime type
                    null,
                    null,                       // a list of extracted keywords
                    null,                       // the language
                    parsedTitle.toString(),     // a long document title
                    "",                         // TODO: AUTHOR
                    sections,                   // an array of section headlines
@ -238,7 +239,7 @@ public class vcfParser extends AbstractParser implements Parser {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof ParserException) throw (ParserException) e;
-            throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),location);
+            throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),url);
        } 
    }
--- a/source/de/anomic/plasma/parser/xls/xlsParser.java
+++ b/source/de/anomic/plasma/parser/xls/xlsParser.java
@ -116,6 +116,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
                    mimeType,
                    "UTF-8",
                    null,
                    null,
                    ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
                    replaceAll("\r\n"," ").
                    replaceAll("\n"," ").
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@ -172,6 +172,7 @@ public class zipParser extends AbstractParser implements Parser {
                    location,
                    mimeType,
                    null,
                    null,
                    docKeywords.toString().split(" |,"),
                    docLongTitle.toString(),
                    "", // TODO: AUTHOR
@ -185,6 +186,7 @@ public class zipParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        null,
                        null,
                        docKeywords.toString().split(" |,"),
                        docLongTitle.toString(),
                        "", // TODO: AUTHOR
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -727,6 +727,7 @@ public final class plasmaParser {
                location,
                mimeType,
                charSet,
                scraper.getContentLanguages(),
                scraper.getKeywords(),
                scraper.getTitle(),
                scraper.getAuthor(),
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -36,6 +36,7 @@ import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -67,8 +68,9 @@ public class plasmaParserDocument {
    private boolean resorted;
    private InputStream textStream;
    private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
    private Set<String> languages;
-    protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
+    protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
                    final String[] keywords, final String title, final String author,
                    final String[] sections, final String abstrct,
                    final Object text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
@ -90,6 +92,7 @@ public class plasmaParserDocument {
        this.resorted = false;
        this.inboundLinks = -1;
        this.outboundLinks = -1;
        this.languages = languages;
        if (text == null) try {
            this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
@ -101,31 +104,48 @@ public class plasmaParserDocument {
        }
    }
-    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset) {
+    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages) {
-        this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null);
+        this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null);
    }
-    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
+    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
                    final String[] keywords, final String title, final String author,
                    final String[] sections, final String abstrct,
                    final byte[] text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
-        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
    }
-    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
+    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
            final String[] keywords, final String title, final String author,
            final String[] sections, final String abstrct,
            final File text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
-        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
    }
-    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
+    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
            final String[] keywords, final String title, final String author,
            final String[] sections, final String abstrct,
            final serverCachedFileOutputStream text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
-        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
    }
    /**
     * Determine the language of this document from its declared metadata only.
     * No statistical analysis of the content is done here; only the language set
     * that was handed to the constructor is consulted.
     * If exactly one language is declared, that one is returned. If several are
     * declared, the language reported by the source URL (its TLD-based guess) is
     * returned when it is among the declared ones; otherwise one of the declared
     * languages is picked arbitrarily.
     * @return one of the declared language codes, or null if there is no language metadata at all
     */
    public String languageByMetadata() {
        final Set<String> declared = this.languages;
        if (declared == null || declared.isEmpty()) return null;
        if (declared.size() > 1) {
            // several candidates: let the language of the source URL break the tie
            final String urlLanguage = this.source.language();
            if (declared.contains(urlLanguage)) return urlLanguage;
        }
        // either a single declaration, or none of the declared languages matches the URL:
        // just take the first one the set hands out
        return declared.iterator().next();
    }
    /*
 DC according to rfc 5013
--- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
@ -57,6 +57,7 @@ public class plasmaSearchRankingProfile {
    public static final String CATHASVIDEO        = "cathasvideo";
    public static final String CATHASAPP          = "cathasapp";
    public static final String TERMFREQUENCY      = "tf";
    public static final String LANGUAGE           = "language";   // ranking of preferred language
    // post-sort predicates
    public static final String URLCOMPINTOPLIST   = "urlcompintoplist";
@ -74,7 +75,7 @@ public class plasmaSearchRankingProfile {
        coeff_appurl, coeff_app_dc_title, coeff_app_dc_creator, coeff_app_dc_subject, coeff_app_dc_description, coeff_appemph,
        coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp,
        coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer,
-        coeff_termfrequency;
+        coeff_termfrequency, coeff_language;
    public plasmaSearchRankingProfile(final int mediatype) {
        // set default-values
@ -109,6 +110,7 @@ public class plasmaSearchRankingProfile {
        coeff_urlcompintoplist   = 3;
        coeff_descrcompintoplist = 2;
        coeff_prefer             = 14;
        coeff_language           = 13;
    }
    public plasmaSearchRankingProfile(final String prefix, final String profile) {
@ -160,6 +162,7 @@ public class plasmaSearchRankingProfile {
            coeff_urlcompintoplist   = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist);
            coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist);
            coeff_prefer             = parseMap(coeff, PREFER, coeff_prefer);
            coeff_language           = parseMap(coeff, LANGUAGE, coeff_language);
        }
    }
@ -209,6 +212,7 @@ public class plasmaSearchRankingProfile {
        ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo));
        ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp));
        ext.put(prefix + TERMFREQUENCY, Integer.toString(coeff_termfrequency));
        ext.put(prefix + LANGUAGE, Integer.toString(coeff_language));
        return ext;
    }
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -819,12 +819,29 @@ public final class plasmaWordIndex implements indexRI {
        final yacyURL referrerURL = entry.referrerURL();
        final Date docDate = entry.getModificationDate();
        String language = condenser.language();
        String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration
        if (language == null) {
-            System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " FAILED, taking TLD");
+            language = (bymetadata == null) ? entry.url().language() : bymetadata;
-            language = entry.url().language();
+            System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
        } else {
-            System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " SUCCESS: " + language);
+            if (language.equals("pl")) {
-            if (language.equals("pl")) language = entry.url().language(); // patch a bug TODO: remove this if bug is fixed
+                System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " HAS BUG: " + language);
                language = (bymetadata == null) ? entry.url().language() : bymetadata; // extra handling of this case: overwrite with bymetadata
            } else {
                if (bymetadata == null) {
                    if (language.equals(entry.url().language()))
                        System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language);
                    else {
                        System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")");
                        language = entry.url().language();
                    }
                } else {
                    if (language.equals(bymetadata))
                        System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
                    else
                        System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by metadata is " + bymetadata + ")");
                }
            }
        }
        // create a new loaded URL db entry
--- a/source/de/anomic/tools/iso639.java
+++ b/source/de/anomic/tools/iso639.java
@ -0,0 +1,197 @@
 // iso639.java
 // (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 19.09.2008 on http://yacy.net
 //
 // This is a part of YaCy, a peer-to-peer based web search engine
 //
 // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
 // $LastChangedRevision: 1986 $
 // $LastChangedBy: orbiter $
 //
 // LICENSE
 // 
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
 // (at your option) any later version.
 //
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
 //
 // You should have received a copy of the GNU General Public License
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 package de.anomic.tools;
 import java.util.HashMap;
 /**
  * Static lookup table of ISO 639 alpha-2 language codes and their English names.
  * Each entry of {@link #codes} has the form "xx-Name": the first two characters are
  * the code, the character at index 2 is a separator, the rest is the language name.
  * Despite the historic method name {@link #country(String)}, all codes in this
  * table denote languages, not countries.
  */
 public class iso639 {

    static final String[] codes = {
        "aa-Afar",
        "ab-Abkhazian",
        "af-Afrikaans",
        "am-Amharic",
        "ar-Arabic",
        "as-Assamese",
        "ay-Aymara",
        "az-Azerbaijani",
        "ba-Bashkir",
        "be-Byelorussian",
        "bg-Bulgarian",
        "bh-Bihari",
        "bi-Bislama",
        "bn-Bengali;-Bangla",
        "bo-Tibetan",
        "br-Breton",
        "ca-Catalan",
        "co-Corsican",
        "cs-Czech",
        "cy-Welsh",
        "da-Danish",
        "de-German",
        "dz-Bhutani",
        "el-Greek",
        "en-English",
        "eo-Esperanto",
        "es-Spanish",
        "et-Estonian",
        "eu-Basque",
        "fa-Persian",
        "fi-Finnish",
        "fj-Fiji",
        "fo-Faeroese",
        "fr-French",
        "fy-Frisian",
        "ga-Irish",
        "gd-Scots-Gaelic",
        "gl-Galician",
        "gn-Guarani",
        "gu-Gujarati",
        "ha-Hausa",
        "he-Hebrew",        // current code; the withdrawn "iw" is kept below for old documents
        "hi-Hindi",
        "hr-Croatian",
        "hu-Hungarian",
        "hy-Armenian",
        "ia-Interlingua",
        "id-Indonesian",    // current code; the withdrawn "in" is kept below for old documents
        "ie-Interlingue",
        "ik-Inupiak",
        "in-Indonesian",
        "is-Icelandic",
        "it-Italian",
        "iw-Hebrew",
        "ja-Japanese",
        "ji-Yiddish",
        "jw-Javanese",
        "ka-Georgian",
        "kk-Kazakh",
        "kl-Greenlandic",
        "km-Cambodian",
        "kn-Kannada",
        "ko-Korean",
        "ks-Kashmiri",
        "ku-Kurdish",
        "ky-Kirghiz",
        "la-Latin",
        "ln-Lingala",
        "lo-Laothian",
        "lt-Lithuanian",
        "lv-Latvian,-Lettish",
        "mg-Malagasy",
        "mi-Maori",
        "mk-Macedonian",
        "ml-Malayalam",
        "mn-Mongolian",
        "mo-Moldavian",
        "mr-Marathi",
        "ms-Malay",
        "mt-Maltese",
        "my-Burmese",
        "na-Nauru",
        "ne-Nepali",
        "nl-Dutch",
        "no-Norwegian",
        "oc-Occitan",
        "om-(Afan)-Oromo",
        "or-Oriya",
        "pa-Punjabi",
        "pl-Polish",
        "ps-Pashto,-Pushto",
        "pt-Portuguese",
        "qu-Quechua",
        "rm-Rhaeto-Romance",
        "rn-Kirundi",
        "ro-Romanian",
        "ru-Russian",
        "rw-Kinyarwanda",
        "sa-Sanskrit",
        "sd-Sindhi",
        "sg-Sangro",
        "sh-Serbo-Croatian",
        "si-Singhalese",
        "sk-Slovak",
        "sl-Slovenian",
        "sm-Samoan",
        "sn-Shona",
        "so-Somali",
        "sq-Albanian",
        "sr-Serbian",
        "ss-Siswati",
        "st-Sesotho",
        "su-Sundanese",
        "sv-Swedish",
        "sw-Swahili",
        "ta-Tamil",
        "te-Telugu",
        "tg-Tajik",
        "th-Thai",
        "ti-Tigrinya",
        "tk-Turkmen",
        "tl-Tagalog",
        "tn-Setswana",
        "to-Tonga",
        "tr-Turkish",
        "ts-Tsonga",
        "tt-Tatar",
        "tw-Twi",
        "uk-Ukrainian",
        "ur-Urdu",
        "uz-Uzbek",
        "vi-Vietnamese",
        "vo-Volapuk",
        "wo-Wolof",
        "xh-Xhosa",
        "yi-Yiddish",       // current code; the withdrawn "ji" is kept above for old documents
        "yo-Yoruba",
        "zh-Chinese",
        "zu-Zulu"};

    // alpha-2 code (lower case) -> language name, filled once from the codes table
    static HashMap<String, String> mapping = new HashMap<String, String>();

    static {
        for (final String entry : codes) {
            // entry layout is "xx-Name": two-letter key, separator at index 2, name from index 3
            mapping.put(entry.substring(0, 2), entry.substring(3));
        }
    }

    /**
     * get the name of the language that belongs to an alpha-2 code
     * (the method name is historic; the table contains language codes)
     * @param code the alpha-2 mnemonic of the language, in any letter case; may be null
     * @return the name of the language, or null if the code is null or not in the table
     */
    public static final String country(String code) {
        if (code == null) return null;
        // fixed locale: with the default locale an upper-case 'I' would not be
        // lowered to 'i' on systems running with a Turkish locale
        return mapping.get(code.toLowerCase(java.util.Locale.ENGLISH));
    }

    /**
     * see if the given alpha-2 language code exists in the table
     * @param code the alpha-2 mnemonic of the language, in any letter case; may be null
     * @return true if the code exists, false if it is null or unknown
     */
    public static final boolean exists(String code) {
        if (code == null) return false;
        return mapping.containsKey(code.toLowerCase(java.util.Locale.ENGLISH));
    }
 }
--- a/source/de/anomic/yacy/yacyURL.java
+++ b/source/de/anomic/yacy/yacyURL.java
@ -848,7 +848,7 @@ public class yacyURL implements Serializable {
    // language calculation
    public String language() {
-        String language = "uk";
+        String language = "en";
        final int pos = host.lastIndexOf(".");
        if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase();
        return language;