completed TLD categorization

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3515 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 146f4aee01
commit 242c19b480

@ -47,15 +47,22 @@ public class plasmaURL {
// day formatter for entry export // day formatter for entry export
public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd"); public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
// TLD separation in political and cultural parts
// https://www.cia.gov/cia/publications/factbook/index.html
// http://en.wikipedia.org/wiki/List_of_countries_by_continent
private static final String[] TLD_NorthAmericaOceania={ private static final String[] TLD_NorthAmericaOceania={
// primary english-speaking countries // primary english-speaking countries
// english-speaking countries from central america are also included // english-speaking countries from central america are also included
// includes also dutch and french colonies in the caribbean sea // includes also dutch and french colonies in the caribbean sea
// and US/English/Australian military bases in asia
"EDU=US Educational", "EDU=US Educational",
"GOV=US Government", "GOV=US Government",
"MIL=US Military", "MIL=US Military",
"NET=Network", "NET=Network",
"ORG=Non-Profit Organization", "ORG=Non-Profit Organization",
"AN=Netherlands Antilles",
"AS=American Samoa",
"AG=Antigua and Barbuda", "AG=Antigua and Barbuda",
"AI=Anguilla", "AI=Anguilla",
"AU=Australia", "AU=Australia",
@ -64,12 +71,41 @@ public class plasmaURL {
"BM=Bermuda", "BM=Bermuda",
"BS=Bahamas", "BS=Bahamas",
"CA=Canada", "CA=Canada",
"CC=Cocos (Keeling) Islands",
"CK=Cook Islands",
"CX=Christmas Island", // located in the Indian Ocean, but belongs to Australia
"DM=Dominica", "DM=Dominica",
"FM=Micronesia",
"FJ=Fiji",
"GD=Grenada", "GD=Grenada",
"GP=Guadeloupe", "GP=Guadeloupe",
"GS=South Georgia and the South Sandwich Islands", // south of South America, but administered by the British; has only a scientific base
"GU=Guam", // strategic US base close to Japan
"HM=Heard and McDonald Islands", // uninhabited, sub-Antarctic island, owned by Australia
"HT=Haiti",
"IO=British Indian Ocean Territory", // UK-US naval support facility in the Indian Ocean
"KI=Kiribati", // 33 coral atolls in the pacific, formerly owned by UK
"KN=Saint Kitts and Nevis", // islands in the Caribbean Sea
"KY=Cayman Islands", "KY=Cayman Islands",
"LC=Saint Lucia",
"MH=Marshall Islands", // formerly US atomic bomb test site, now a key installation in the US missile defense network
"MP=Northern Mariana Islands", // US strategic location in the western Pacific Ocean
"NC=New Caledonia",
"NF=Norfolk Island",
"NR=Nauru", // independent UN island
"NU=Niue", // one of world's largest coral islands
"NZ=New Zealand (Aotearoa)", "NZ=New Zealand (Aotearoa)",
"PM=St. Pierre and Miquelon", "PG=Papua New Guinea",
"PN=Pitcairn", // overseas territory of the UK
"PR=Puerto Rico", // territory of the US with commonwealth status
"PW=Palau", // was once governed by Micronesia
"Sb=Solomon Islands",
"TC=Turks and Caicos Islands", // overseas territory of the UK
"TK=Tokelau", // group of three atolls in the South Pacific Ocean, British protectorate
"TO=Tonga",
"TT=Trinidad and Tobago",
"TV=Tuvalu", // nine coral atolls in the South Pacific Ocean; in 2000, Tuvalu leased its TLD ".tv" for $50 million over a 12-year period
"UM=US Minor Outlying Islands", // nine insular United States possessions in the Pacific Ocean and the Caribbean Sea
"US=United States", "US=United States",
"VC=Saint Vincent and the Grenadines", "VC=Saint Vincent and the Grenadines",
"VG=Virgin Islands (British)", "VG=Virgin Islands (British)",
@ -90,8 +126,9 @@ public class plasmaURL {
"CU=Cuba", "CU=Cuba",
"DO=Dominican Republic", "DO=Dominican Republic",
"EC=Ecuador", "EC=Ecuador",
"GF=French Guiana",
"FK=Falkland Islands (Malvinas)", "FK=Falkland Islands (Malvinas)",
"GF=French Guiana",
"GT=Guatemala",
"GY=Guyana", "GY=Guyana",
"HN=Honduras", "HN=Honduras",
"JM=Jamaica", "JM=Jamaica",
@ -115,6 +152,8 @@ public class plasmaURL {
"BA=Bosnia and Herzegovina", "BA=Bosnia and Herzegovina",
"BE=Belgium", "BE=Belgium",
"BG=Bulgaria", "BG=Bulgaria",
"BV=Bouvet Island", // this island is uninhabited and covered by ice, south of africa but governed by Norway
"BY=Belarus",
"CH=Switzerland", "CH=Switzerland",
"CS=Czechoslovakia (former)", "CS=Czechoslovakia (former)",
"CZ=Czech Republic", "CZ=Czech Republic",
@ -124,6 +163,7 @@ public class plasmaURL {
"ES=Spain", "ES=Spain",
"EE=Estonia", "EE=Estonia",
"FI=Finland", "FI=Finland",
"FO=Faroe Islands", // Viking Settlers
"FR=France", "FR=France",
"FX=France, Metropolitan", "FX=France, Metropolitan",
"GB=Great Britain (UK)", "GB=Great Britain (UK)",
@ -143,48 +183,83 @@ public class plasmaURL {
"MC=Monaco", "MC=Monaco",
"MK=Macedonia", "MK=Macedonia",
"MN=Mongolia", "MN=Mongolia",
"MS=Montserrat", // British island in the Caribbean Sea, sparsely populated because of strong volcanic activity
"MT=Malta", "MT=Malta",
"MQ=Martinique", // island in the eastern Caribbean Sea, overseas department of France
"NATO=Nato field", "NATO=Nato field",
"NL=Netherlands", "NL=Netherlands",
"NO=Norway", "NO=Norway",
"PF=French Polynesia", // French annexed Polynesian island in the South Pacific, French atomic bomb test site
"PL=Poland", "PL=Poland",
"PM=St. Pierre and Miquelon", // French-administered colony close to Canada, belongs to France
"PT=Portugal", "PT=Portugal",
"RO=Romania", "RO=Romania",
"RU=Russia", "RU=Russia",
"SE=Sweden", "SE=Sweden",
"SI=Slovenia", "SI=Slovenia",
"SJ=Svalbard and Jan Mayen Islands", // part of Norway
"SM=San Marino",
"SK=Slovak Republic", "SK=Slovak Republic",
"SU=USSR (former)", "SU=USSR (former)",
"TF=French Southern Territories", // sub-Antarctic islands in the southern Indian Ocean, no permanent inhabitants
"UK=United Kingdom", "UK=United Kingdom",
"UA=Ukraine",
"VA=Vatican City State (Holy See)", "VA=Vatican City State (Holy See)",
"YU=Yugoslavia" "YU=Yugoslavia"
}; };
private static final String[] TLD_MiddleEastWestAsia = { private static final String[] TLD_MiddleEastWestAsia = {
// states that are influenced by Islamic culture and the Arabic language
// also includes Eurasian states and those that were part of the former USSR and are close to Southwest Asia
"AE=United Arab Emirates", "AE=United Arab Emirates",
"AF=Afghanistan", "AF=Afghanistan",
"AM=Armenia",
"AZ=Azerbaijan", "AZ=Azerbaijan",
"BH=Bahrain", "BH=Bahrain",
"GE=Georgia",
"IL=Israel", "IL=Israel",
"IQ=Iraq", "IQ=Iraq",
"IR=Iran", "IR=Iran",
"JO=Jordan",
"KG=Kyrgyzstan",
"KZ=Kazakhstan",
"KW=Kuwait",
"LB=Lebanon",
"OM=Oman",
"QA=Qatar",
"SA=Saudi Arabia",
"SY=Syria",
"TJ=Tajikistan",
"TM=Turkmenistan",
"PK=Pakistan", "PK=Pakistan",
"TR=Turkey", "TR=Turkey",
"UZ=Uzbekistan",
"YE=Yemen" "YE=Yemen"
}; };
private static final String[] TLD_SouthEastAsia = { private static final String[] TLD_SouthEastAsia = {
"BD=Bangladesh", "BD=Bangladesh",
"BN=Brunei Darussalam",
"BT=Bhutan", "BT=Bhutan",
"CN=China", "CN=China",
"HK=Hong Kong", "HK=Hong Kong",
"ID=Indonesia", "ID=Indonesia",
"IN=India", "IN=India",
"LA=Laos",
"NP=Nepal", "NP=Nepal",
"JP=Japan", "JP=Japan",
"KH=Cambodia", "KH=Cambodia",
"KP=Korea (North)", "KP=Korea (North)",
"KR=Korea (South)", "KR=Korea (South)",
"LK=Sri Lanka", "LK=Sri Lanka",
"MY=Malaysia",
"MM=Myanmar", // formerly known as Burma
"MO=Macau", // Portuguese settlement, part of China, but has some autonomy
"MV=Maldives", // group of atolls in the Indian Ocean
"PH=Philippines",
"SG=Singapore", "SG=Singapore",
"TP=East Timor",
"TH=Thailand",
"TW=Taiwan",
"VN=Viet Nam" "VN=Viet Nam"
}; };
private static final String[] TLD_Africa = { private static final String[] TLD_Africa = {
@ -197,6 +272,8 @@ public class plasmaURL {
"CG=Congo", "CG=Congo",
"CI=Cote D'Ivoire (Ivory Coast)", "CI=Cote D'Ivoire (Ivory Coast)",
"CM=Cameroon", "CM=Cameroon",
"CV=Cape Verde",
"DJ=Djibouti",
"DZ=Algeria", "DZ=Algeria",
"EG=Egypt", "EG=Egypt",
"EH=Western Sahara", "EH=Western Sahara",
@ -206,7 +283,10 @@ public class plasmaURL {
"GH=Ghana", "GH=Ghana",
"GM=Gambia", "GM=Gambia",
"GN=Guinea", "GN=Guinea",
"GQ=Equatorial Guinea",
"GW=Guinea-Bissau",
"KE=Kenya", "KE=Kenya",
"KM=Comoros",
"LR=Liberia", "LR=Liberia",
"LS=Lesotho", "LS=Lesotho",
"LY=Libya", "LY=Libya",
@ -222,12 +302,15 @@ public class plasmaURL {
"NG=Nigeria", "NG=Nigeria",
"RE=Reunion", "RE=Reunion",
"RW=Rwanda", "RW=Rwanda",
"SC=Seychelles",
"SD=Sudan",
"SH=St. Helena", "SH=St. Helena",
"SL=Sierra Leone", "SL=Sierra Leone",
"SN=Senegal", "SN=Senegal",
"SO=Somalia", "SO=Somalia",
"ST=Sao Tome and Principe", "ST=Sao Tome and Principe",
"SZ=Swaziland", "SZ=Swaziland",
"TD=Chad",
"TG=Togo", "TG=Togo",
"TN=Tunisia", "TN=Tunisia",
"TZ=Tanzania", "TZ=Tanzania",
@ -253,85 +336,6 @@ public class plasmaURL {
"NT=Neutral Zone" "NT=Neutral Zone"
}; };
/*
http://www.odci.gov/cia/publications/factbook/
http://en.wikipedia.org/wiki/List_of_countries_by_continent
"AM=Armenia",
"AN=Netherlands Antilles",
"AS=American Samoa",
"BN=Brunei Darussalam",
"BV=Bouvet Island",
"BY=Belarus",
"CC=Cocos (Keeling) Islands",
"CK=Cook Islands",
"CV=Cape Verde",
"CX=Christmas Island",
"DJ=Djibouti",
"FJ=Fiji",
"FM=Micronesia",
"FO=Faroe Islands",
"GE=Georgia",
"GQ=Equatorial Guinea",
"GS=S. Georgia and S. Sandwich Isls.",
"GT=Guatemala",
"GU=Guam",
"GW=Guinea-Bissau",
"HM=Heard and McDonald Islands",
"HT=Haiti",
"IO=British Indian Ocean Territory",
"JO=Jordan",
"KG=Kyrgyzstan",
"KI=Kiribati",
"KM=Comoros",
"KN=Saint Kitts and Nevis",
"KW=Kuwait",
"KZ=Kazakhstan",
"LA=Laos",
"LB=Lebanon",
"LC=Saint Lucia",
"MH=Marshall Islands",
"MM=Myanmar",
"MO=Macau",
"MP=Northern Mariana Islands",
"MQ=Martinique",
"MS=Montserrat",
"MV=Maldives",
"MY=Malaysia",
"NC=New Caledonia",
"NF=Norfolk Island",
"NR=Nauru",
"NU=Niue",
"OM=Oman",
"PF=French Polynesia",
"PG=Papua New Guinea",
"PH=Philippines",
"PN=Pitcairn",
"PR=Puerto Rico",
"PW=Palau",
"QA=Qatar",
"SA=Saudi Arabia",
"Sb=Solomon Islands",
"SC=Seychelles",
"SD=Sudan",
"SJ=Svalbard and Jan Mayen Islands",
"SM=San Marino",
"SY=Syria",
"TC=Turks and Caicos Islands",
"TD=Chad",
"TF=French Southern Territories",
"TH=Thailand",
"TJ=Tajikistan",
"TK=Tokelau",
"TM=Turkmenistan",
"TO=Tonga",
"TP=East Timor",
"TT=Trinidad and Tobago",
"TV=Tuvalu",
"TW=Taiwan",
"UA=Ukraine",
"UM=US Minor Outlying Islands",
"UZ=Uzbekistan",
*/
/* /*
* TLDs: aero, biz, com, coop, edu, gov, info, int, mil, museum, name, net, * TLDs: aero, biz, com, coop, edu, gov, info, int, mil, museum, name, net,

Loading…
Cancel
Save