- added language detection using metadata from documents: html and odt documents provide this information

- metadata and results from statistical analysis are compared and the result is printed out as debug lines - added ranking profile for wanted language - added class with ISO 639 table, a list of all valid language codes that will be used for the language identification git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5187 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · bfcf9b7aa3
parent 3768a1bd32
commit bfcf9b7aa3
24 changed files with 308 additions and 25 deletions
--- a/htroot/Ranking_p.java
+++ b/htroot/Ranking_p.java
@ -75,6 +75,7 @@ public class Ranking_p {
 		rankingParameters.put(plasmaSearchRankingProfile.WORDSINTEXT, "Words In Text");
 		rankingParameters.put(plasmaSearchRankingProfile.WORDSINTITLE, "Words In Title");
 		rankingParameters.put(plasmaSearchRankingProfile.YBR, "YaCy Block Rank");
+		rankingParameters.put(plasmaSearchRankingProfile.LANGUAGE, "Preferred Language");
 	}

    private static serverObjects defaultValues() {
--- a/htroot/xml/util/getpageinfo_p.java
+++ b/htroot/xml/util/getpageinfo_p.java
@ -29,6 +29,7 @@ package xml.util;
 import java.io.IOException;
 import java.io.Writer;
 import java.net.MalformedURLException;
+import java.util.Set;

 import de.anomic.crawler.HTTPLoader;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -103,8 +104,9 @@ public class getpageinfo_p {
                    prop.put("tags", count);
                    // put description                    
                    prop.putHTML("desc", scraper.getDescription(), true);
-                    // put language 
-                    prop.putHTML("lang", scraper.getContentLanguages()[0], true);
+                    // put language
+                    Set<String> languages = scraper.getContentLanguages();
+                    prop.putHTML("lang", (languages == null) ? "unknown" : languages.iterator().next(), true);

                } catch (final MalformedURLException e) { /* ignore this */
                } catch (final IOException e) { /* ignore this */
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@ -50,6 +50,7 @@ import de.anomic.http.httpRequestHeader;
 import de.anomic.server.serverCharBuffer;
 import de.anomic.server.serverFileUtils;
 import de.anomic.yacy.yacyURL;
+import de.anomic.tools.iso639;

 public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {

@ -381,11 +382,21 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        return s;
    }
    
-    public String[] getContentLanguages() {
+    public HashSet<String> getContentLanguages() { // returns the declared languages as lower-case codes known to iso639, or null if none can be determined
        String s = metas.get("content-language"); // the "content-language" entry of metas is consulted first
        if (s == null) s = metas.get("dc.language"); // fall back to the "dc.language" entry
-        if (s == null) s = "";
-        return s.split(" |,");
+        if (s == null) return null; // no language declaration at all
+        HashSet<String> hs = new HashSet<String>();
+        String[] cl = s.split(" |,"); // several declarations may be separated by blanks or commas
+        int p;
+        for (int i = 0; i < cl.length; i++) {
+            cl[i] = cl[i].toLowerCase(); // normalize case before the table lookup
+            p = cl[i].indexOf('-');
+            if (p > 0) cl[i] = cl[i].substring(0, p); // keep only the part before the first '-' (e.g. "en-us" becomes "en")
+            if (iso639.exists(cl[i])) hs.add(cl[i]); // entries that are not in the iso639 table are dropped
+        }
+        if (hs.size() == 0) return null; // a declaration was present but contained no known code
+        return hs;
    }
    
    public String[] getKeywords() {
--- a/source/de/anomic/index/indexContainerHeap.java
+++ b/source/de/anomic/index/indexContainerHeap.java
@ -118,6 +118,7 @@ public final class indexContainerHeap {
        int urlCount = 0;
        synchronized (cache) {
            for (final indexContainer container : new heapFileEntries(heapFile, this.payloadrow)) {
+                // TODO: in this loop a lot of memory may be allocated. A check whether memory is getting low is necessary. But what should be done when memory is low?
                if (container == null) break;
                cache.put(container.getWordHash(), container);
                urlCount += container.size();
@ -252,6 +253,10 @@ public final class indexContainerHeap {
            }
        }
        
+        /**
+         * return an index container
+         * because they may get very large, it is wise to deallocate some memory before calling next()
+         */
        public indexContainer next() {
            final indexContainer n = this.nextContainer;
            this.nextContainer = next0();
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@ -70,6 +70,7 @@ public class docParser extends AbstractParser implements Parser {
                      mimeType,
                      "UTF-8",
                      null,
+                      null,
                      ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
                          replaceAll("\r\n"," ").
                          replaceAll("\n"," ").
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@ -32,7 +32,9 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.nio.charset.Charset;
 import java.util.Enumeration;
+import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.Set;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;

@ -89,6 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
            String docShortTitle  = null;
            String docLongTitle   = null;
            String docAuthor      = null;
+            String docLanguage    = null;
            
            // opening the file as zip file
            final ZipFile zipFile= new ZipFile(dest);
@ -134,9 +137,14 @@ public class odtParser extends AbstractParser implements Parser {
                    docShortTitle  = metaData.getTitle();
                    docLongTitle   = metaData.getSubject();
                    docAuthor      = metaData.getCreator();
+                    docLanguage    = metaData.getLanguage();
                }
            }
            
+            // make the languages set
+            Set<String> languages = new HashSet<String>(1);
+            if (docLanguage != null) languages.add(docLanguage);
+            
            // if there is no title available we generate one
            if (docLongTitle == null) {
                if (docShortTitle != null) {
@ -156,6 +164,7 @@ public class odtParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        "UTF-8",
+                        languages,
                        docKeywords,
                        docLongTitle,
                        docAuthor,
@ -169,6 +178,7 @@ public class odtParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        "UTF-8",
+                        languages,
                        docKeywords,
                        docLongTitle,
                        docAuthor,
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@ -143,6 +143,7 @@ public class pdfParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        "UTF-8",
+                        null,
                        docKeywords,
                        (docTitle == null) ? docSubject : docTitle,
                        docAuthor,
@ -156,6 +157,7 @@ public class pdfParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        "UTF-8",
+                        null,
                        docKeywords,
                        (docTitle == null) ? docSubject : docTitle,
                        docAuthor,
--- a/source/de/anomic/plasma/parser/ppt/pptParser.java
+++ b/source/de/anomic/plasma/parser/ppt/pptParser.java
@ -88,6 +88,7 @@ public class pptParser extends AbstractParser implements Parser {
                    mimeType,
                    "UTF-8",
                    null,
+                    null,
                    ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
                    replaceAll("\r\n"," ").
                    replaceAll("\n"," ").
--- a/source/de/anomic/plasma/parser/ps/psParser.java
+++ b/source/de/anomic/plasma/parser/ps/psParser.java
@ -117,6 +117,7 @@ public class psParser extends AbstractParser implements Parser {
                    "UTF-8",
                    null,
                    null,
+                    null,
                    "",
                    null,
                    null,
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@ -134,6 +134,7 @@ public class rpmParser extends AbstractParser implements Parser {
                    mimeType,
                    "UTF-8",
                    null,
+                    null,
                    summary,
                    packager,
                    null,
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@ -158,6 +158,7 @@ public class rssParser extends AbstractParser implements Parser {
                    mimeType,
                    "UTF-8",
                    null,
+                    null,
                    feedTitle,
                    (authors.length() > 0)?authors.toString(1,authors.length()):"",
                    feedSections.toArray(new String[feedSections.size()]),
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@ -75,6 +75,7 @@ public class rtfParser extends AbstractParser implements Parser {
                    mimeType,
                    "UTF-8",
                    null,
+                    null,
                    ((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
                        replaceAll("\r\n"," ").
                        replaceAll("\n"," ").
--- a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
+++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
@ -63,7 +63,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
    
    public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset,
            final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
-        final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset);
+        final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset, null);
        Handler archive;
        super.theLogger.logFine("opening 7zip archive...");
        try {
--- a/source/de/anomic/plasma/parser/swf/swfParser.java
+++ b/source/de/anomic/plasma/parser/swf/swfParser.java
@ -107,6 +107,7 @@ public class swfParser extends AbstractParser implements Parser {
                    location,     // url of the source document
                    mimeType,     // the documents mime type
                    "UTF-8",      // charset of the document text
+                    null,
                    null,          //keywords
                      ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
                          replaceAll("\r\n"," ").
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@ -188,6 +188,7 @@ public class tarParser extends AbstractParser implements Parser {
                    location,
                    mimeType,
                    null,
+                    null,
                    docKeywords.toString().split(" |,"),
                    docLongTitle.toString(),
                    "", // TODO: AUTHOR
@ -201,6 +202,7 @@ public class tarParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        null,
+                        null,
                        docKeywords.toString().split(" |,"),
                        docLongTitle.toString(),
                        "", // TODO: AUTHOR
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@ -78,7 +78,7 @@ public class vcfParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
        
        try {
            final StringBuffer parsedTitle = new StringBuffer();
@ -213,7 +213,7 @@ public class vcfParser extends AbstractParser implements Parser {
                    
                } else {
                    if (theLogger.isFinest()) this.theLogger.logFinest("Invalid data in vcf file" +
-                                             "\n\tURL: " + location +
+                                             "\n\tURL: " + url +
                                             "\n\tLine: " + line + 
                                             "\n\tLine-Nr: " + lineNr);
                }
@ -222,10 +222,11 @@ public class vcfParser extends AbstractParser implements Parser {
            final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
            final byte[] text = parsedDataText.toString().getBytes();
            final plasmaParserDocument theDoc = new plasmaParserDocument(
-                    location,                   // url of the source document
+                    url,                   // url of the source document
                    mimeType,                   // the documents mime type
                    null,
                    null,                       // a list of extracted keywords
+                    null,                       // the language
                    parsedTitle.toString(),     // a long document title
                    "",                         // TODO: AUTHOR
                    sections,                   // an array of section headlines
@ -238,7 +239,7 @@ public class vcfParser extends AbstractParser implements Parser {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof ParserException) throw (ParserException) e;
            
-            throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),location);
+            throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),url);
        } 
    }
    
--- a/source/de/anomic/plasma/parser/xls/xlsParser.java
+++ b/source/de/anomic/plasma/parser/xls/xlsParser.java
@ -116,6 +116,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
                    mimeType,
                    "UTF-8",
                    null,
+                    null,
                    ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
                    replaceAll("\r\n"," ").
                    replaceAll("\n"," ").
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@ -172,6 +172,7 @@ public class zipParser extends AbstractParser implements Parser {
                    location,
                    mimeType,
                    null,
+                    null,
                    docKeywords.toString().split(" |,"),
                    docLongTitle.toString(),
                    "", // TODO: AUTHOR
@ -185,6 +186,7 @@ public class zipParser extends AbstractParser implements Parser {
                        location,
                        mimeType,
                        null,
+                        null,
                        docKeywords.toString().split(" |,"),
                        docLongTitle.toString(),
                        "", // TODO: AUTHOR
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -727,6 +727,7 @@ public final class plasmaParser {
                location,
                mimeType,
                charSet,
+                scraper.getContentLanguages(),
                scraper.getKeywords(),
                scraper.getTitle(),
                scraper.getAuthor(),
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -36,6 +36,7 @@ import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeSet;

 import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -67,8 +68,9 @@ public class plasmaParserDocument {
    private boolean resorted;
    private InputStream textStream;
    private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
+    private Set<String> languages;
    
-    protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
+    protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
                    final String[] keywords, final String title, final String author,
                    final String[] sections, final String abstrct,
                    final Object text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
@ -90,6 +92,7 @@ public class plasmaParserDocument {
        this.resorted = false;
        this.inboundLinks = -1;
        this.outboundLinks = -1;
+        this.languages = languages;
        
        if (text == null) try {
            this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
@ -101,31 +104,48 @@ public class plasmaParserDocument {
        }
    }
    
-    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset) {
-        this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null);
+    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages) { // minimal constructor; languages may be null
+        this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null); // only location, mimeType, charset and languages are supplied; every other argument is passed as null
    }
    
-    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
+    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages, // variant whose text is given as a byte[]
                    final String[] keywords, final String title, final String author,
                    final String[] sections, final String abstrct,
                    final byte[] text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
-        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images); // the cast to Object selects the constructor that takes the text as Object
    }
    
-    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
+    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages, // variant whose text is given as a File
            final String[] keywords, final String title, final String author,
            final String[] sections, final String abstrct,
            final File text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
-        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images); // the cast to Object selects the constructor that takes the text as Object
    }
    
-    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
+    public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages, // variant whose text is given as a serverCachedFileOutputStream
            final String[] keywords, final String title, final String author,
            final String[] sections, final String abstrct,
            final serverCachedFileOutputStream text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
-        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+        this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images); // the cast to Object selects the constructor that takes the text as Object
    }

+    /**
+     * pick a single language for this document out of the set of languages that was handed to the constructor
+     * the language is not computed using a statistical analysis of the content, only from given metadata that came with the document
+     * if several languages are declared, the one equal to this.source.language() (the TLD-based guess) is preferred; otherwise one of the declared languages is taken
+     * if there is no metadata at all (the set is null or empty), null is returned
+     * @return one element of the declared language set, or null; NOTE(review): presumably an ISO 639 alpha-2 code, but that depends on what the parser put into the set
+     */
+    public String languageByMetadata() {
+        if (this.languages == null) return null;
+        if (this.languages.size() == 0) return null;
+        if (this.languages.size() == 1) return languages.iterator().next();
+        if (this.languages.contains(this.source.language())) return this.source.language();
+        // now we are confused: all declared languages differ from the TLD
+        // just pick one of the languages that we have; which one depends on the iteration order of the set
+        return languages.iterator().next();
+    }
+    
    /*
 DC according to rfc 5013

--- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
@ -57,6 +57,7 @@ public class plasmaSearchRankingProfile {
    public static final String CATHASVIDEO        = "cathasvideo";
    public static final String CATHASAPP          = "cathasapp";
    public static final String TERMFREQUENCY      = "tf";
+    public static final String LANGUAGE           = "language";   // ranking of preferred language

    // post-sort predicates
    public static final String URLCOMPINTOPLIST   = "urlcompintoplist";
@ -74,7 +75,7 @@ public class plasmaSearchRankingProfile {
        coeff_appurl, coeff_app_dc_title, coeff_app_dc_creator, coeff_app_dc_subject, coeff_app_dc_description, coeff_appemph,
        coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp,
        coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer,
-        coeff_termfrequency;
+        coeff_termfrequency, coeff_language;
    
    public plasmaSearchRankingProfile(final int mediatype) {
        // set default-values
@ -109,6 +110,7 @@ public class plasmaSearchRankingProfile {
        coeff_urlcompintoplist   = 3;
        coeff_descrcompintoplist = 2;
        coeff_prefer             = 14;
+        coeff_language           = 13;
    }
    
    public plasmaSearchRankingProfile(final String prefix, final String profile) {
@ -160,6 +162,7 @@ public class plasmaSearchRankingProfile {
            coeff_urlcompintoplist   = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist);
            coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist);
            coeff_prefer             = parseMap(coeff, PREFER, coeff_prefer);
+            coeff_language           = parseMap(coeff, LANGUAGE, coeff_language);
        }
    }
    
@ -209,6 +212,7 @@ public class plasmaSearchRankingProfile {
        ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo));
        ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp));
        ext.put(prefix + TERMFREQUENCY, Integer.toString(coeff_termfrequency));
+        ext.put(prefix + LANGUAGE, Integer.toString(coeff_language));
        return ext;
    }
    
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -819,12 +819,29 @@ public final class plasmaWordIndex implements indexRI {
        final yacyURL referrerURL = entry.referrerURL();
        final Date docDate = entry.getModificationDate();
        String language = condenser.language();
+        String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration
        if (language == null) {
-            System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " FAILED, taking TLD");
-            language = entry.url().language();
+            language = (bymetadata == null) ? entry.url().language() : bymetadata;
+            System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
        } else {
-            System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " SUCCESS: " + language);
-            if (language.equals("pl")) language = entry.url().language(); // patch a bug TODO: remove this if bug is fixed
+            if (language.equals("pl")) {
+                System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " HAS BUG: " + language);
+                language = (bymetadata == null) ? entry.url().language() : bymetadata; // extra handling of this case: overwrite with bymetadata
+            } else {
+                if (bymetadata == null) {
+                    if (language.equals(entry.url().language()))
+                        System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language);
+                    else {
+                        System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")");
+                        language = entry.url().language();
+                    }
+                } else {
+                    if (language.equals(bymetadata))
+                        System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
+                    else
+                        System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by metadata is " + bymetadata + ")");
+                }
+            }
        }
        
        // create a new loaded URL db entry
--- a/source/de/anomic/tools/iso639.java
+++ b/source/de/anomic/tools/iso639.java
@ -0,0 +1,197 @@
+// iso639.java
+// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 19.09.2008 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+// 
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package de.anomic.tools;
+
+import java.util.HashMap;
+
+public class iso639 {
+
+    static final String[] codes = { // each entry has the form "<alpha-2 language code>-<language name>"
+        "aa-Afar",
+        "ab-Abkhazian",
+        "af-Afrikaans",
+        "am-Amharic",
+        "ar-Arabic",
+        "as-Assamese",
+        "ay-Aymara",
+        "az-Azerbaijani",
+        "ba-Bashkir",
+        "be-Byelorussian",
+        "bg-Bulgarian",
+        "bh-Bihari",
+        "bi-Bislama",
+        "bn-Bengali;-Bangla",
+        "bo-Tibetan",
+        "br-Breton",
+        "ca-Catalan",
+        "co-Corsican",
+        "cs-Czech",
+        "cy-Welsh",
+        "da-Danish",
+        "de-German",
+        "dz-Bhutani",
+        "el-Greek",
+        "en-English",
+        "eo-Esperanto",
+        "es-Spanish",
+        "et-Estonian",
+        "eu-Basque",
+        "fa-Persian",
+        "fi-Finnish",
+        "fj-Fiji",
+        "fo-Faeroese",
+        "fr-French",
+        "fy-Frisian",
+        "ga-Irish",
+        "gd-Scots-Gaelic",
+        "gl-Galician",
+        "gn-Guarani",
+        "gu-Gujarati",
+        "ha-Hausa",
+        "hi-Hindi",
+        "hr-Croatian",
+        "hu-Hungarian",
+        "hy-Armenian",
+        "ia-Interlingua",
+        "ie-Interlingue",
+        "ik-Inupiak",
+        "in-Indonesian",
+        "is-Icelandic",
+        "it-Italian",
+        "iw-Hebrew",
+        "ja-Japanese",
+        "ji-Yiddish",
+        "jw-Javanese",
+        "ka-Georgian",
+        "kk-Kazakh",
+        "kl-Greenlandic",
+        "km-Cambodian",
+        "kn-Kannada",
+        "ko-Korean",
+        "ks-Kashmiri",
+        "ku-Kurdish",
+        "ky-Kirghiz",
+        "la-Latin",
+        "ln-Lingala",
+        "lo-Laothian",
+        "lt-Lithuanian",
+        "lv-Latvian,-Lettish",
+        "mg-Malagasy",
+        "mi-Maori",
+        "mk-Macedonian",
+        "ml-Malayalam",
+        "mn-Mongolian",
+        "mo-Moldavian",
+        "mr-Marathi",
+        "ms-Malay",
+        "mt-Maltese",
+        "my-Burmese",
+        "na-Nauru",
+        "ne-Nepali",
+        "nl-Dutch",
+        "no-Norwegian",
+        "oc-Occitan",
+        "om-(Afan)-Oromo",
+        "or-Oriya",
+        "pa-Punjabi",
+        "pl-Polish",
+        "ps-Pashto,-Pushto",
+        "pt-Portuguese",
+        "qu-Quechua",
+        "rm-Rhaeto-Romance",
+        "rn-Kirundi",
+        "ro-Romanian",
+        "ru-Russian",
+        "rw-Kinyarwanda",
+        "sa-Sanskrit",
+        "sd-Sindhi",
+        "sg-Sangro",
+        "sh-Serbo-Croatian",
+        "si-Singhalese",
+        "sk-Slovak",
+        "sl-Slovenian",
+        "sm-Samoan",
+        "sn-Shona",
+        "so-Somali",
+        "sq-Albanian",
+        "sr-Serbian",
+        "ss-Siswati",
+        "st-Sesotho",
+        "su-Sundanese",
+        "sv-Swedish",
+        "sw-Swahili",
+        "ta-Tamil",
+        "te-Tegulu",
+        "tg-Tajik",
+        "th-Thai",
+        "ti-Tigrinya",
+        "tk-Turkmen",
+        "tl-Tagalog",
+        "tn-Setswana",
+        "to-Tonga",
+        "tr-Turkish",
+        "ts-Tsonga",
+        "tt-Tatar",
+        "tw-Twi",
+        "uk-Ukrainian",
+        "ur-Urdu",
+        "uz-Uzbek",
+        "vi-Vietnamese",
+        "vo-Volapuk",
+        "wo-Wolof",
+        "xh-Xhosa",
+        "yo-Yoruba",
+        "zh-Chinese",
+        "zu-Zulu"};
+
+    static HashMap<String, String> mapping = new HashMap<String, String>(); // alpha-2 language code -> language name
+
+    static {
+        for (int i = 0; i < codes.length; i++) {
+            mapping.put(codes[i].substring(0, 2), codes[i].substring(3)); // first two characters are the key, everything after the separator at index 2 is the name
+        }
+    }
+    
+    /**
+     * get the name of the language that belongs to an alpha-2 language code (despite the method name, the table holds languages, not countries)
+     * @param code, the alpha-2 mnemonic of the language; it is lower-cased before the lookup and must not be null
+     * @return the name of the language, or null if the code is not in the table
+     */
+    public static final String country(String code) {
+        return mapping.get(code.toLowerCase()); // NOTE(review): toLowerCase() without a Locale uses the default locale, which can change the result for some letters (e.g. 'I' under a Turkish locale)
+    }
+    
+    /**
+     * see if the given alpha-2 language code exists in the table
+     * @param code, the alpha-2 mnemonic of the language; it is lower-cased before the lookup and must not be null
+     * @return true if the code exists
+     */
+    public static final boolean exists(String code) {
+        return mapping.containsKey(code.toLowerCase());
+    }
+    
+}
--- a/source/de/anomic/yacy/yacyURL.java
+++ b/source/de/anomic/yacy/yacyURL.java
@ -848,7 +848,7 @@ public class yacyURL implements Serializable {
    
    // language calculation
    public String language() {
-        String language = "uk";
+        String language = "en";
        final int pos = host.lastIndexOf(".");
        if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase();
        return language;