// ISO639.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 19.09.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.kelondro.util ;
import java.util.Locale ;
import java.util.Map ;
import java.util.concurrent.ConcurrentHashMap ;
/ * *
* Support for ISO 639 language codes .
* @see < a href = "https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes" > Wikipedia list of ISO 639 - 1 codes < / a >
* @see < a href = "http://www.loc.gov/standards/iso639-2/php/code_list.php" > Language Code List from the ISO 639 - 2 Registration Authority ( Library of Congress ) < / a >
* @see < a href = "http://www-01.sil.org/iso639-3/" > Home page of the ISO 639 - 3 Registration Authority ( SIL International ) < / a >
* @see < a href = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry" > IANA language subtag registry < / a >
* @see < a href = "http://www.loc.gov/standards/iso639-2/php/code_changes.php" > Code Changes history from the ISO 639 - 2 Registration Authority < / a >
* /
public class ISO639 {
/ *
* Note : using icu4j package classes such as com . ibm . icu . impl . LocaleIDs may be
* considered to maintain a more up to date support of ISO 639 codes , notably to
* support ISO 639 3 letters language codes .
* /
/** ISO 639-1 language codes table : [two letters code] - [ISO Reference name] */
private static final String [ ] codes = {
"aa-Afar" ,
"ab-Abkhazian" ,
"ae-Avestan" ,
"af-Afrikaans" ,
"ak-Akan" ,
"am-Amharic" ,
"an-Aragonese" ,
"ar-Arabic" ,
"as-Assamese" ,
"av-Avaric" ,
"ay-Aymara" ,
"az-Azerbaijani" ,
"ba-Bashkir" ,
"be-Belarusian" ,
"bg-Bulgarian" ,
"bh-Bihari" , // collective language code for bho-Bhojpuri, mag-Magahi, and mai-Maithili
"bi-Bislama" ,
"bm-Bambara" ,
"bn-Bengali" ,
"bo-Tibetan" ,
"br-Breton" ,
"bs-Bosnian" ,
"ca-Catalan" ,
"ce-Chechen" ,
"ch-Chamorro" ,
"co-Corsican" ,
"cr-Cree" ,
"cs-Czech" ,
"cu-Church Slavic" ,
"cv-Chuvash" ,
"cy-Welsh" ,
"da-Danish" ,
"de-German" ,
"dv-Dhivehi" ,
"dz-Dzongkha" ,
"ee-Ewe" ,
"el-Modern Greek (1453-)" ,
"en-English" ,
"eo-Esperanto" ,
"es-Spanish" ,
"et-Estonian" ,
"eu-Basque" ,
"fa-Persian" ,
"ff-Fulah" ,
"fi-Finnish" ,
"fj-Fijian" ,
"fo-Faroese" ,
"fr-French" ,
"fy-Western Frisian" ,
"ga-Irish" ,
"gd-Scottish Gaelic" ,
"gl-Galician" ,
"gn-Guarani" ,
"gu-Gujarati" ,
"gv-Manx" ,
"ha-Hausa" ,
"he-Hebrew" ,
"hi-Hindi" ,
"ho-Hiri Motu" ,
"hr-Croatian" ,
"ht-Haitian" ,
"hu-Hungarian" ,
"hy-Armenian" ,
"hz-Herero" ,
"ia-Interlingua" ,
"id-Indonesian" ,
"ie-Interlingue" ,
"ig-Igbo" ,
"ii-Sichuan Yi" ,
"ik-Inupiaq" ,
"in-Indonesian" , // deprecated on 1989-03-11 in favor of id-Indonesian
"io-Ido" ,
"is-Icelandic" ,
"it-Italian" ,
"iu-Inuktitut" ,
"iw-Hebrew" , // deprecated on 1989-03-11 in favor of he-Hebrew
"ja-Japanese" ,
"ji-Yiddish" , // deprecated on 1989-03-11 in favor of yi-Yiddish
"jv-Javanese" ,
"ka-Georgian" ,
"kg-Kongo" ,
"ki-Kikuyu" ,
"kj-Kuanyama" ,
"kk-Kazakh" ,
"kl-Kalaallisut; Greenlandic" ,
"km-Central Khmer" ,
"kn-Kannada" ,
"ko-Korean" ,
"kr-Kanuri" ,
"ks-Kashmiri" ,
"ku-Kurdish" ,
"kv-Komi" ,
"kw-Cornish" ,
"ky-Kirghiz" ,
"la-Latin" ,
"lb-Luxembourgish" ,
"lg-Ganda" ,
"li-Limburgan" ,
"ln-Lingala" ,
"lo-Lao" ,
"lt-Lithuanian" ,
"lu-Luba-Katanga" ,
"lv-Latvian" ,
"mg-Malagasy" ,
"mh-Marshallese" ,
"mi-Maori" ,
"mk-Macedonian" ,
"ml-Malayalam" ,
"mn-Mongolian" ,
//"mo-Moldavian", // this maps on 'mozilla' :( // deprecated on 2008-11-03 in favor of ro-Romanian to be used for the variant of the Romanian language also known as Moldavian
"mr-Marathi" ,
"ms-Malay" ,
"mt-Maltese" ,
"my-Burmese" ,
"na-Nauru" ,
"nb-Norwegian Bokmål" ,
"nd-North Ndebele" ,
"ne-Nepali" ,
"ng-Ndonga" ,
"nl-Dutch" ,
"nn-Norwegian Nynorsk" ,
"no-Norwegian" ,
"nr-South Ndebele" ,
"nv-Navajo" ,
"ny-Nyanja" ,
"oc-Occitan (post 1500)" ,
"oj-Ojibwa" ,
"om-Oromo" ,
"or-Oriya" ,
"os-Ossetian" ,
"pa-Panjabi; Punjabi" ,
"pi-Pali" ,
"pl-Polish" ,
"ps-Pushto; Pashto" ,
"pt-Portuguese" ,
"qu-Quechua" ,
"rm-Romansh" ,
"rn-Rundi" ,
"ro-Romanian" ,
"ru-Russian" ,
"rw-Kinyarwanda" ,
"sa-Sanskrit" ,
"sc-Sardinian" ,
"sd-Sindhi" ,
"se-Northern Sami" ,
"sg-Sango" ,
"sh-Serbo-Croatian" ,
"si-Sinhala; Sinhalese" ,
"sk-Slovak" ,
"sl-Slovenian" ,
"sm-Samoan" ,
"sn-Shona" ,
"so-Somali" ,
"sq-Albanian" ,
"sr-Serbian" ,
"ss-Swati" ,
"st-Southern Sotho" ,
"su-Sundanese" ,
"sv-Swedish" ,
"sw-Swahili" ,
"ta-Tamil" ,
"te-Telugu" ,
"tg-Tajik" ,
"th-Thai" ,
"ti-Tigrinya" ,
"tk-Turkmen" ,
"tl-Tagalog" ,
"tn-Tswana" ,
"to-Tonga (Tonga Islands)" ,
"tr-Turkish" ,
"ts-Tsonga" ,
"tt-Tatar" ,
"tw-Twi" ,
"ty-Tahitian" ,
"ug-Uighur" ,
"uk-Ukrainian" ,
"ur-Urdu" ,
"uz-Uzbek" ,
"ve-Venda" ,
"vi-Vietnamese" ,
"vo-Volapük" ,
"wa-Walloon" ,
"wo-Wolof" ,
"xh-Xhosa" ,
"yi-Yiddish" ,
"yo-Yoruba" ,
"za-Zhuang" ,
"zh-Chinese" ,
"zu-Zulu" } ;
/** Mapping from 2 letters ISO 639-1 code to ISO language reference name in English. */
private static Map < String , String > mapping = new ConcurrentHashMap < String , String > ( codes . length ) ;
static {
for ( int i = 0 ; i < codes . length ; i + + ) {
mapping . put ( codes [ i ] . substring ( 0 , 2 ) , codes [ i ] . substring ( 3 ) ) ;
}
}
/ * *
* get the name of the alpha - 2 country code
* @param code , the mnemonic of the country in alpha - 2
* @return the name of the country
* /
public static final String country ( String code ) {
return mapping . get ( code . toLowerCase ( Locale . ROOT ) ) ;
}
/ * *
* Check if the given country in alpha - 2 country code is supported .
* @param code , the mnemonic of the country in alpha - 2 ( ISO 639 - 1 )
* @return true if the code is not null and is known by this YaCy server
* /
public static final boolean exists ( String code ) {
if ( code = = null ) {
return false ;
}
return mapping . containsKey ( code . toLowerCase ( Locale . ROOT ) ) ;
}
/ * *
* analyse a user - agent string and return language as given in the agent string
* @param userAgent string
* @return the language code if it is possible to parse the string and find a language code or null if not
* /
public static final String userAgentLanguageDetection ( String userAgent ) {
if ( userAgent = = null | | userAgent . length ( ) < 2 ) return null ;
userAgent = userAgent . toLowerCase ( Locale . ROOT ) ;
if ( mapping . containsKey ( userAgent . substring ( 0 , 2 ) ) ) return userAgent . substring ( 0 , 2 ) ;
if ( userAgent . length ( ) = = 2 & & mapping . containsKey ( userAgent ) ) return userAgent ;
if ( userAgent . length ( ) = = 5 & & mapping . containsKey ( userAgent . substring ( 0 , 2 ) ) ) return userAgent . substring ( 0 , 2 ) ;
int p = 2 ;
// search for entries like ' en-'
while ( p < userAgent . length ( ) - 1 & & ( p = userAgent . indexOf ( '-' , p ) ) > 2 ) {
if ( userAgent . charAt ( p - 3 ) = = ' ' & & mapping . containsKey ( userAgent . substring ( p - 2 , p ) ) ) return userAgent . substring ( p - 2 , p ) ;
p + + ;
}
// search for entries like ' en;'
p = 1 ;
while ( p < userAgent . length ( ) - 1 & & ( p = userAgent . indexOf ( ';' , p ) ) > 2 ) {
if ( userAgent . charAt ( p - 3 ) = = ' ' & & mapping . containsKey ( userAgent . substring ( p - 2 , p ) ) ) return userAgent . substring ( p - 2 , p ) ;
p + + ;
}
return null ;
}
}