// ISO639.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 19.09.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.kelondro.util;

import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Support for ISO 639 language codes.
 * @see <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">Wikipedia list of ISO 639-1 codes</a>
 * @see <a href="https://www.loc.gov/standards/iso639-2/php/code_list.php">Language Code List from the ISO 639-2 Registration Authority (Library of Congress)</a>
 * @see <a href="https://iso639-3.sil.org/">Home page of the ISO 639-3 Registration Authority (SIL International)</a>
 * @see <a href="https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry">IANA language subtag registry</a>
 * @see <a href="https://www.loc.gov/standards/iso639-2/php/code_changes.php">Code Changes history from the ISO 639-2 Registration Authority</a>
 */
public class ISO639 {

    /*
     * Note : using icu4j package classes such as com.ibm.icu.impl.LocaleIDs may be
     * considered to maintain a more up to date support of ISO 639 codes, notably to
     * support ISO 639 3 letters language codes.
     */

    /** ISO 639-1 language codes table : [two letters code] - [ISO Reference name] */
    private static final String[] codes = {
        "aa-Afar",
        "ab-Abkhazian",
        "ae-Avestan",
        "af-Afrikaans",
        "ak-Akan",
        "am-Amharic",
        "an-Aragonese",
        "ar-Arabic",
        "as-Assamese",
        "av-Avaric",
        "ay-Aymara",
        "az-Azerbaijani",
        "ba-Bashkir",
        "be-Belarusian",
        "bg-Bulgarian",
        "bh-Bihari", // collective language code for bho-Bhojpuri, mag-Magahi, and mai-Maithili
        "bi-Bislama",
        "bm-Bambara",
        "bn-Bengali",
        "bo-Tibetan",
        "br-Breton",
        "bs-Bosnian",
        "ca-Catalan",
        "ce-Chechen",
        "ch-Chamorro",
        "co-Corsican",
        "cr-Cree",
        "cs-Czech",
        "cu-Church Slavic",
        "cv-Chuvash",
        "cy-Welsh",
        "da-Danish",
        "de-German",
        "dv-Dhivehi",
        "dz-Dzongkha",
        "ee-Ewe",
        "el-Modern Greek (1453-)",
        "en-English",
        "eo-Esperanto",
        "es-Spanish",
        "et-Estonian",
        "eu-Basque",
        "fa-Persian",
        "ff-Fulah",
        "fi-Finnish",
        "fj-Fijian",
        "fo-Faroese",
        "fr-French",
        "fy-Western Frisian",
        "ga-Irish",
        "gd-Scottish Gaelic",
        "gl-Galician",
        "gn-Guarani",
        "gu-Gujarati",
        "gv-Manx",
        "ha-Hausa",
        "he-Hebrew",
        "hi-Hindi",
        "ho-Hiri Motu",
        "hr-Croatian",
        "ht-Haitian",
        "hu-Hungarian",
        "hy-Armenian",
        "hz-Herero",
        "ia-Interlingua",
        "id-Indonesian",
        "ie-Interlingue",
        "ig-Igbo",
        "ii-Sichuan Yi",
        "ik-Inupiaq",
        "in-Indonesian", // deprecated on 1989-03-11 in favor of id-Indonesian
        "io-Ido",
        "is-Icelandic",
        "it-Italian",
        "iu-Inuktitut",
        "iw-Hebrew", // deprecated on 1989-03-11 in favor of he-Hebrew
        "ja-Japanese",
        "ji-Yiddish", // deprecated on 1989-03-11 in favor of yi-Yiddish
        "jv-Javanese",
        "ka-Georgian",
        "kg-Kongo",
        "ki-Kikuyu",
        "kj-Kuanyama",
        "kk-Kazakh",
        "kl-Kalaallisut; Greenlandic",
        "km-Central Khmer",
        "kn-Kannada",
        "ko-Korean",
        "kr-Kanuri",
        "ks-Kashmiri",
        "ku-Kurdish",
        "kv-Komi",
        "kw-Cornish",
        "ky-Kirghiz",
        "la-Latin",
        "lb-Luxembourgish",
        "lg-Ganda",
        "li-Limburgan",
        "ln-Lingala",
        "lo-Lao",
        "lt-Lithuanian",
        "lu-Luba-Katanga",
        "lv-Latvian",
        "mg-Malagasy",
        "mh-Marshallese",
        "mi-Maori",
        "mk-Macedonian",
        "ml-Malayalam",
        "mn-Mongolian",
        // "mo-Moldavian" is intentionally not listed: it was deprecated on 2008-11-03 in favor of
        // ro-Romanian (to be used for the variant of the Romanian language also known as Moldavian),
        // and it used to collide with the leading characters of 'mozilla' user-agent strings.
        "mr-Marathi",
        "ms-Malay",
        "mt-Maltese",
        "my-Burmese",
        "na-Nauru",
        "nb-Norwegian Bokmål",
        "nd-North Ndebele",
        "ne-Nepali",
        "ng-Ndonga",
        "nl-Dutch",
        "nn-Norwegian Nynorsk",
        "no-Norwegian",
        "nr-South Ndebele",
        "nv-Navajo",
        "ny-Nyanja",
        "oc-Occitan (post 1500)",
        "oj-Ojibwa",
        "om-Oromo",
        "or-Oriya",
        "os-Ossetian",
        "pa-Panjabi; Punjabi",
        "pi-Pali",
        "pl-Polish",
        "ps-Pushto; Pashto",
        "pt-Portuguese",
        "qu-Quechua",
        "rm-Romansh",
        "rn-Rundi",
        "ro-Romanian",
        "ru-Russian",
        "rw-Kinyarwanda",
        "sa-Sanskrit",
        "sc-Sardinian",
        "sd-Sindhi",
        "se-Northern Sami",
        "sg-Sango",
        "sh-Serbo-Croatian",
        "si-Sinhala; Sinhalese",
        "sk-Slovak",
        "sl-Slovenian",
        "sm-Samoan",
        "sn-Shona",
        "so-Somali",
        "sq-Albanian",
        "sr-Serbian",
        "ss-Swati",
        "st-Southern Sotho",
        "su-Sundanese",
        "sv-Swedish",
        "sw-Swahili",
        "ta-Tamil",
        "te-Telugu",
        "tg-Tajik",
        "th-Thai",
        "ti-Tigrinya",
        "tk-Turkmen",
        "tl-Tagalog",
        "tn-Tswana",
        "to-Tonga (Tonga Islands)",
        "tr-Turkish",
        "ts-Tsonga",
        "tt-Tatar",
        "tw-Twi",
        "ty-Tahitian",
        "ug-Uighur",
        "uk-Ukrainian",
        "ur-Urdu",
        "uz-Uzbek",
        "ve-Venda",
        "vi-Vietnamese",
        "vo-Volapük",
        "wa-Walloon",
        "wo-Wolof",
        "xh-Xhosa",
        "yi-Yiddish",
        "yo-Yoruba",
        "za-Zhuang",
        "zh-Chinese",
        "zu-Zulu"};

    /** Mapping from 2 letters ISO 639-1 code to ISO language reference name in English. */
    private static final Map<String, String> mapping = new ConcurrentHashMap<String, String>(codes.length);

    static {
        // every table entry has the shape "xx-Name": characters 0..1 are the code,
        // character 2 is the separator and the remainder is the reference name
        for (final String entry : codes) {
            mapping.put(entry.substring(0, 2), entry.substring(3));
        }
    }

    /**
     * Get the English reference name of the language designated by a two letters
     * ISO 639-1 code. Despite its historical name, this method deals with
     * language codes and not with country codes.
     * @param code the two letters ISO 639-1 language code, in any letter case; may be null
     * @return the language reference name, or null when the code is null or unknown
     */
    public static final String country(String code) {
        if (code == null) {
            // same lenient null handling as exists(String)
            return null;
        }
        return mapping.get(code.toLowerCase(Locale.ROOT));
    }

    /**
     * Check if the given two letters ISO 639-1 language code is supported.
     * @param code the two letters ISO 639-1 language code, in any letter case; may be null
     * @return true if the code is not null and is present in the codes table of this class
     */
    public static final boolean exists(String code) {
        if (code == null) {
            return false;
        }
        return mapping.containsKey(code.toLowerCase(Locale.ROOT));
    }

    /**
     * Analyse a user-agent (or language tag like) string and return the language
     * given in that string. The following shapes are recognized, in this order:
     * <ol>
     * <li>a known code at the very beginning of the string, when it is either the whole
     * string or directly followed by a non-letter character ("de", "en-US", "fr,en;q=0.8")</li>
     * <li>a known code preceded by a space and followed by '-' (" en-US")</li>
     * <li>a known code preceded by a space and followed by ';' (" de;")</li>
     * </ol>
     * The evaluation is case insensitive.
     * @param userAgent the string to analyse; may be null
     * @return the lower case two letters language code, or null when the string is null,
     *         shorter than two characters or when no known language code is found
     */
    public static final String userAgentLanguageDetection(String userAgent) {
        if (userAgent == null || userAgent.length() < 2) {
            return null;
        }
        final String agent = userAgent.toLowerCase(Locale.ROOT);

        // A leading code only counts when it is not just the beginning of a longer word:
        // otherwise product tokens such as "java/1.8" or "curl/7.68" would be reported
        // as Japanese (ja) or Church Slavic (cu).
        final String prefix = agent.substring(0, 2);
        final boolean prefixIsWholeToken = agent.length() == 2 || !Character.isLetter(agent.charAt(2));
        if (prefixIsWholeToken && mapping.containsKey(prefix)) {
            return prefix;
        }

        // search for entries like ' en-'
        final String beforeDash = codeBeforeDelimiter(agent, '-');
        if (beforeDash != null) {
            return beforeDash;
        }

        // search for entries like ' en;'
        return codeBeforeDelimiter(agent, ';');
    }

    /**
     * Find the first known language code that is preceded by a space and directly
     * followed by the given delimiter.
     * @param agent the lower case string to search in
     * @param delimiter the character expected right after the language code
     * @return the language code, or null when there is no such entry
     */
    private static String codeBeforeDelimiter(final String agent, final char delimiter) {
        // a match needs three characters (space + code) in front of the delimiter, so the
        // search starts at index 3; an earlier delimiter must not stop the whole search
        int pos = agent.indexOf(delimiter, 3);
        while (pos >= 0) {
            if (agent.charAt(pos - 3) == ' ') {
                final String candidate = agent.substring(pos - 2, pos);
                if (mapping.containsKey(candidate)) {
                    return candidate;
                }
            }
            pos = agent.indexOf(delimiter, pos + 1);
        }
        return null;
    }
}