From 483e9a206618425b54286ff2ab63f696746d9e5b Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 30 Apr 2008 23:06:42 +0000 Subject: [PATCH] - shifted tld recognition methods from yacyURL to serverDomains - changed isLocal Property in such a way that it is possible to see if a domain is in the internet (and not intranet) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4751 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/yacy/user/sidebar_navigation.java | 26 +- .../de/anomic/plasma/plasmaSwitchboard.java | 6 +- source/de/anomic/server/serverDomains.java | 420 ++++++++++++++++-- source/de/anomic/yacy/yacyURL.java | 346 +-------------- 5 files changed, 397 insertions(+), 403 deletions(-) diff --git a/build.properties b/build.properties index 299b7c304..1cdd0e78a 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.581 +releaseVersion=0.582 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/htroot/yacy/user/sidebar_navigation.java b/htroot/yacy/user/sidebar_navigation.java index 3ede24b0e..2ffc96188 100644 --- a/htroot/yacy/user/sidebar_navigation.java +++ b/htroot/yacy/user/sidebar_navigation.java @@ -34,9 +34,9 @@ import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverDomains; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import de.anomic.yacy.yacyURL; public class sidebar_navigation { @@ -132,18 +132,18 @@ public class sidebar_navigation { final int[] zones = theSearch.getRankingResult().zones(); boolean z = false; domzone(prop, "All", theSearch.getRankingResult().size(), theQuery); - if (zones[yacyURL.TLD_EuropeRussia_ID] > 0) - { z = true; domzone(prop, "EuropeRussia", zones[yacyURL.TLD_EuropeRussia_ID], theQuery);} - if (zones[yacyURL.TLD_MiddleSouthAmerica_ID] > 0) - { z = true; domzone(prop, "MiddleSouthAmerica", zones[yacyURL.TLD_MiddleSouthAmerica_ID], theQuery);} - if (zones[yacyURL.TLD_SouthEastAsia_ID] > 0) - { z = true; domzone(prop, "SouthEastAsia", zones[yacyURL.TLD_SouthEastAsia_ID], theQuery);} - if (zones[yacyURL.TLD_MiddleEastWestAsia_ID] > 0) - { z = true; domzone(prop, "MiddleEastWestAsia_", zones[yacyURL.TLD_MiddleEastWestAsia_ID], theQuery);} - if (zones[yacyURL.TLD_NorthAmericaOceania_ID] + zones[yacyURL.TLD_Generic_ID] > 0) - { z = true; domzone(prop, "NorthAmericaOceania", zones[yacyURL.TLD_NorthAmericaOceania_ID] + zones[yacyURL.TLD_Generic_ID], theQuery);} - if (zones[yacyURL.TLD_Africa_ID] > 0) - { z = true; domzone(prop, "Africa", zones[yacyURL.TLD_Africa_ID], theQuery);} + if (zones[serverDomains.TLD_EuropeRussia_ID] > 0) + { z = true; domzone(prop, "EuropeRussia", zones[serverDomains.TLD_EuropeRussia_ID], theQuery);} + if (zones[serverDomains.TLD_MiddleSouthAmerica_ID] > 0) + { z = true; domzone(prop, "MiddleSouthAmerica", zones[serverDomains.TLD_MiddleSouthAmerica_ID], theQuery);} + if (zones[serverDomains.TLD_SouthEastAsia_ID] > 0) + { z = true; domzone(prop, "SouthEastAsia", zones[serverDomains.TLD_SouthEastAsia_ID], theQuery);} + if (zones[serverDomains.TLD_MiddleEastWestAsia_ID] > 0) + { z = true; domzone(prop, "MiddleEastWestAsia_", zones[serverDomains.TLD_MiddleEastWestAsia_ID], theQuery);} + if (zones[serverDomains.TLD_NorthAmericaOceania_ID] + zones[serverDomains.TLD_Generic_ID] > 0) + { z = true; domzone(prop, "NorthAmericaOceania", zones[serverDomains.TLD_NorthAmericaOceania_ID] + zones[serverDomains.TLD_Generic_ID], theQuery);} + if (zones[serverDomains.TLD_Africa_ID] > 0) + { z = true; domzone(prop, "Africa", zones[serverDomains.TLD_Africa_ID], theQuery);} if (zones[7] > 0) { z = true; domzone(prop, "Intranet", zones[7], theQuery);} prop.put("navigation_languagezone", (z) ? "1" : "0"); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 73d7c8503..698106c4c 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -91,7 +91,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.lang.reflect.Constructor; -import java.net.InetAddress; import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -1385,6 +1384,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch nameCacheNoCachingList = Collections.synchronizedSet(new HashSet()); private static final long startTime = System.currentTimeMillis(); + private static final String[] TLD_NorthAmericaOceania={ + // primary english-speaking countries + // english-speaking countries from central america are also included + // includes also dutch and french colonies in the caribbean sea + // and US/English/Australian military bases in asia + "EDU=US Educational", + "GOV=US Government", + "MIL=US Military", + "NET=Network", + "ORG=Non-Profit Organization", + "AN=Netherlands Antilles", + "AS=American Samoa", + "AG=Antigua and Barbuda", + "AI=Anguilla", + "AU=Australia", + "BB=Barbados", + "BZ=Belize", + "BM=Bermuda", + "BS=Bahamas", + "CA=Canada", + "CC=Cocos (Keeling) Islands", + "CK=Cook Islands", + "CX=Christmas Island", // located in the Indian Ocean, but belongs to Australia + "DM=Dominica", + "FM=Micronesia", + "FJ=Fiji", + "GD=Grenada", + "GP=Guadeloupe", + "GS=South Georgia and the South Sandwich Islands", // south of south america, but administrated by british, has only a scientific base + "GU=Guam", // strategical US basis close to Japan + "HM=Heard and McDonald Islands", // uninhabited, sub-Antarctic island, owned by Australia + "HT=Haiti", + "IO=British Indian Ocean Territory", // UK-US naval support facility in the Indian Ocean + "KI=Kiribati", // 33 coral atolls in the pacific, formerly owned by UK + "KN=Saint Kitts and Nevis", // islands in the carribean see + "KY=Cayman Islands", + "LC=Saint Lucia", + "MH=Marshall Islands", // formerly US atomic bomb test site, now a key installation in the US missile defense network + "MP=Northern Mariana Islands", // US strategic location in the western Pacific Ocean + "NC=New Caledonia", + "NF=Norfolk Island", + "NR=Nauru", // independent UN island + "NU=Niue", // one of world's largest coral islands + "NZ=New Zealand (Aotearoa)", + "PG=Papua New Guinea", + "PN=Pitcairn", // overseas territory of the UK + "PR=Puerto Rico", // territory of the US with commonwealth status + "PW=Palau", // was once governed by Micronesia + "Sb=Solomon Islands", + "TC=Turks and Caicos Islands", // overseas territory of the UK + "TK=Tokelau", // group of three atolls in the South Pacific Ocean, british protectorat + "TO=Tonga", + "TT=Trinidad and Tobago", + "TV=Tuvalu", // nine coral atolls in the South Pacific Ocean; in 2000, Tuvalu leased its TLD ".tv" for $50 million over a 12-year period + "UM=US Minor Outlying Islands", // nine insular United States possessions in the Pacific Ocean and the Caribbean Sea + "US=United States", + "VC=Saint Vincent and the Grenadines", + "VG=Virgin Islands (British)", + "VI=Virgin Islands (U.S.)", + "VU=Vanuatu", + "WF=Wallis and Futuna Islands", + "WS=Samoa" + }; + private static final String[] TLD_MiddleSouthAmerica = { + // primary spanish and portugese-speaking + "AR=Argentina", + "AW=Aruba", + "BR=Brazil", + "BO=Bolivia", + "CL=Chile", + "CO=Colombia", + "CR=Costa Rica", + "CU=Cuba", + "DO=Dominican Republic", + "EC=Ecuador", + "FK=Falkland Islands (Malvinas)", + "GF=French Guiana", + "GT=Guatemala", + "GY=Guyana", + "HN=Honduras", + "JM=Jamaica", + "MX=Mexico", + "NI=Nicaragua", + "PA=Panama", + "PE=Peru", + "PY=Paraguay", + "SR=Suriname", + "SV=El Salvador", + "UY=Uruguay", + "VE=Venezuela" + }; + private static final String[] TLD_EuropeRussia = { + // includes also countries that are mainly french- dutch- speaking + // and culturally close to europe + "AD=Andorra", + "AL=Albania", + "AQ=Antarctica", + "AT=Austria", + "BA=Bosnia and Herzegovina", + "BE=Belgium", + "BG=Bulgaria", + "BV=Bouvet Island", // this island is uninhabited and covered by ice, south of africa but governed by Norway + "BY=Belarus", + "CH=Switzerland", + "CS=Czechoslovakia (former)", + "CZ=Czech Republic", + "CY=Cyprus", + "DE=Germany", + "DK=Denmark", + "ES=Spain", + "EE=Estonia", + "EU=Europe", + "FI=Finland", + "FO=Faroe Islands", // Viking Settlers + "FR=France", + "FX=France, Metropolitan", + "GB=Great Britain (UK)", + "GI=Gibraltar", + "GL=Greenland", + "GR=Greece", + "HR=Croatia (Hrvatska)", + "HU=Hungary", + "IE=Ireland", + "IS=Iceland", + "IT=Italy", + "LI=Liechtenstein", + "LT=Lithuania", + "LU=Luxembourg", + "LV=Latvia", + "MD=Moldova", + "MC=Monaco", + "MK=Macedonia", + "MN=Mongolia", + "MS=Montserrat", // British island in the Caribbean Sea, almost not populated because of strong vulcanic activity + "MT=Malta", + "MQ=Martinique", // island in the eastern Caribbean Sea, overseas department of France + "NATO=Nato field", + "NL=Netherlands", + "NO=Norway", + "PF=French Polynesia", // French annexed Polynesian island in the South Pacific, French atomic bomb test site + "PL=Poland", + "PM=St. Pierre and Miquelon", // french-administrated colony close to canada, belongs to France + "PT=Portugal", + "RO=Romania", + "RU=Russia", + "SE=Sweden", + "SI=Slovenia", + "SJ=Svalbard and Jan Mayen Islands", // part of Norway + "SM=San Marino", + "SK=Slovak Republic", + "SU=USSR (former)", + "TF=French Southern Territories", // islands in the arctic see, no inhabitants + "UK=United Kingdom", + "UA=Ukraine", + "VA=Vatican City State (Holy See)", + "YU=Yugoslavia" + }; + private static final String[] TLD_MiddleEastWestAsia = { + // states that are influenced by islamic culture and arabic language + // includes also eurasia states and those that had been part of the former USSR and close to southwest asia + "AE=United Arab Emirates", + "AF=Afghanistan", + "AM=Armenia", + "AZ=Azerbaijan", + "BH=Bahrain", + "GE=Georgia", + "IL=Israel", + "IQ=Iraq", + "IR=Iran", + "JO=Jordan", + "KG=Kyrgyzstan", + "KZ=Kazakhstan", + "KW=Kuwait", + "LB=Lebanon", + "OM=Oman", + "QA=Qatar", + "SA=Saudi Arabia", + "SY=Syria", + "TJ=Tajikistan", + "TM=Turkmenistan", + "PK=Pakistan", + "TR=Turkey", + "UZ=Uzbekistan", + "YE=Yemen" + }; + private static final String[] TLD_SouthEastAsia = { + "BD=Bangladesh", + "BN=Brunei Darussalam", + "BT=Bhutan", + "CN=China", + "HK=Hong Kong", + "ID=Indonesia", + "IN=India", + "LA=Laos", + "NP=Nepal", + "JP=Japan", + "KH=Cambodia", + "KP=Korea (North)", + "KR=Korea (South)", + "LK=Sri Lanka", + "MY=Malaysia", + "MM=Myanmar", // formerly known as Burma + "MO=Macau", // Portuguese settlement, part of China, but has some autonomy + "MV=Maldives", // group of atolls in the Indian Ocean + "PH=Philippines", + "SG=Singapore", + "TP=East Timor", + "TH=Thailand", + "TW=Taiwan", + "VN=Viet Nam" + }; + private static final String[] TLD_Africa = { + "AO=Angola", + "BF=Burkina Faso", + "BI=Burundi", + "BJ=Benin", + "BW=Botswana", + "CF=Central African Republic", + "CG=Congo", + "CI=Cote D'Ivoire (Ivory Coast)", + "CM=Cameroon", + "CV=Cape Verde", + "DJ=Djibouti", + "DZ=Algeria", + "EG=Egypt", + "EH=Western Sahara", + "ER=Eritrea", + "ET=Ethiopia", + "GA=Gabon", + "GH=Ghana", + "GM=Gambia", + "GN=Guinea", + "GQ=Equatorial Guinea", + "GW=Guinea-Bissau", + "KE=Kenya", + "KM=Comoros", + "LR=Liberia", + "LS=Lesotho", + "LY=Libya", + "MA=Morocco", + "MG=Madagascar", + "ML=Mali", + "MR=Mauritania", + "MU=Mauritius", + "MW=Malawi", + "MZ=Mozambique", + "NA=Namibia", + "NE=Niger", + "NG=Nigeria", + "RE=Reunion", + "RW=Rwanda", + "SC=Seychelles", + "SD=Sudan", + "SH=St. Helena", + "SL=Sierra Leone", + "SN=Senegal", + "SO=Somalia", + "ST=Sao Tome and Principe", + "SZ=Swaziland", + "TD=Chad", + "TG=Togo", + "TN=Tunisia", + "TZ=Tanzania", + "UG=Uganda", + "ZA=South Africa", + "ZM=Zambia", + "ZR=Zaire", + "ZW=Zimbabwe", + "YT=Mayotte" + }; + private static final String[] TLD_Generic = { + "COM=US Commercial", + "AERO=", + "BIZ=", + "COOP=", + "INFO=", + "MUSEUM=", + "NAME=", + "PRO=", + "ARPA=", + "INT=International", + "ARPA=Arpanet", + "NT=Neutral Zone" + }; + + private static HashMap TLDID = new HashMap(); + //private static HashMap TLDName = new HashMap(); + + private static void insertTLDProps(String[] TLDList, int id) { + int p; + String tld; + //String name; + Integer ID = new Integer(id); + for (int i = 0; i < TLDList.length; i++) { + p = TLDList[i].indexOf('='); + if (p > 0) { + tld = TLDList[i].substring(0, p).toLowerCase(); + //name = TLDList[i].substring(p + 1); + TLDID.put(tld, ID); + //TLDName.put(tld, name); + } + } + } + + // TLD separation, partly separated into language groups + // https://www.cia.gov/cia/publications/factbook/index.html + // http://en.wikipedia.org/wiki/List_of_countries_by_continent + public static final int TLD_EuropeRussia_ID = 0; // European languages but no english + public static final int TLD_MiddleSouthAmerica_ID = 1; // mainly spanish-speaking countries + public static final int TLD_SouthEastAsia_ID = 2; // asia + public static final int TLD_MiddleEastWestAsia_ID = 3; // middle east + public static final int TLD_NorthAmericaOceania_ID = 4; // english-speaking countries + public static final int TLD_Africa_ID = 5; // africa + public static final int TLD_Generic_ID = 6; // anything else, also raw ip numbers + + static { + // assign TLD-ids and names + insertTLDProps(TLD_EuropeRussia, TLD_EuropeRussia_ID); + insertTLDProps(TLD_MiddleSouthAmerica, TLD_MiddleSouthAmerica_ID); + insertTLDProps(TLD_SouthEastAsia, TLD_SouthEastAsia_ID); + insertTLDProps(TLD_MiddleEastWestAsia, TLD_MiddleEastWestAsia_ID); + insertTLDProps(TLD_NorthAmericaOceania, TLD_NorthAmericaOceania_ID); + insertTLDProps(TLD_Africa, TLD_Africa_ID); + insertTLDProps(TLD_Generic, TLD_Generic_ID); + // the id=7 is used to flag local addresses + } + /** * Converts the time to a non negative int * @@ -136,28 +464,6 @@ public class serverDomains { return null; } -// /** -// * Checks wether an hostname already is in the DNS-cache. -// * FIXME: This method should use dnsResolve, as the code is 90% identical? -// * -// * @param host Searched for hostname. -// * @return true, if the hostname already is in the cache. -// */ -// public static boolean dnsFetch(String host) { -// if ((nameCacheHit.get(host) != null) /*|| (nameCacheMiss.contains(host)) */) return false; -// try { -// String ip = InetAddress.getByName(host).getHostAddress(); -// if ((ip != null) && (!(ip.equals("127.0.0.1"))) && (!(ip.equals("localhost")))) { -// nameCacheHit.put(host, ip); -// return true; -// } -// return false; -// } catch (UnknownHostException e) { -// //nameCacheMiss.add(host); -// return false; -// } -// } - /** * Returns the number of entries in the nameCacheHit map * @@ -218,43 +524,65 @@ public class serverDomains { } } - public static boolean isLocal(String address) { + public static int getDomainID(String host) { + int p = host.lastIndexOf('.'); + String tld = ""; + if (p > 0) { + tld = host.substring(p + 1); + } + Integer i = TLDID.get(tld); + if (i == null) { + return (isLocal(host)) ? 7 : TLD_Generic_ID; + } else { + return i.intValue(); + } + } + + public static boolean isLocal(String host) { // attention! because this method does a dns resolve to look up an IP address, // the result may be very slow. Consider 100 milliseconds per access - assert (address != null); + assert (host != null); // check local ip addresses - if (address.equals("localhost") || address.startsWith("127") - || address.startsWith("192.168") - || address.startsWith("10.") - || address.startsWith("169.254") + if (host.equals("localhost") || host.startsWith("127") + || host.startsWith("192.168") + || host.startsWith("10.") + || host.startsWith("169.254") || // 172.16.0.0-172.31.255.255 (I think this is faster than a regex) - (address.startsWith("172.") && (address.startsWith("172.16.") - || address.startsWith("172.17.") - || address.startsWith("172.18.") - || address.startsWith("172.19.") - || address.startsWith("172.20.") - || address.startsWith("172.21.") - || address.startsWith("172.22.") - || address.startsWith("172.23.") - || address.startsWith("172.24.") - || address.startsWith("172.25.") - || address.startsWith("172.26.") - || address.startsWith("172.27.") - || address.startsWith("172.28.") - || address.startsWith("172.29.") - || address.startsWith("172.30.") - || address.startsWith("172.31.")))) + (host.startsWith("172.") && (host.startsWith("172.16.") + || host.startsWith("172.17.") + || host.startsWith("172.18.") + || host.startsWith("172.19.") + || host.startsWith("172.20.") + || host.startsWith("172.21.") + || host.startsWith("172.22.") + || host.startsWith("172.23.") + || host.startsWith("172.24.") + || host.startsWith("172.25.") + || host.startsWith("172.26.") + || host.startsWith("172.27.") + || host.startsWith("172.28.") + || host.startsWith("172.29.") + || host.startsWith("172.30.") + || host.startsWith("172.31.")))) return true; + + // check the tld list + int p = host.lastIndexOf('.'); + String tld = ""; + if (p > 0) { + tld = host.substring(p + 1); + } + if (TLDID.get(tld) == null) return true; // make a dns resolve if a hostname is given and check again - final InetAddress clientAddress = dnsResolve(address); + final InetAddress clientAddress = dnsResolve(host); if (clientAddress != null) { if ((clientAddress.isAnyLocalAddress()) || (clientAddress.isLoopbackAddress())) return true; - if (address.charAt(0) > '9') address = clientAddress.getHostAddress(); + if (host.charAt(0) > '9') host = clientAddress.getHostAddress(); } // finally check if there are other local IP adresses that are not in diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java index 0d4f2c62a..380817ed4 100644 --- a/source/de/anomic/yacy/yacyURL.java +++ b/source/de/anomic/yacy/yacyURL.java @@ -29,7 +29,6 @@ package de.anomic.yacy; import java.io.File; import java.net.MalformedURLException; -import java.util.HashMap; import java.util.Iterator; import java.util.TreeSet; import java.util.regex.Matcher; @@ -42,341 +41,14 @@ import de.anomic.tools.Punycode; import de.anomic.tools.Punycode.PunycodeException; public class yacyURL { - - // TLD separation in political and cultural parts - // https://www.cia.gov/cia/publications/factbook/index.html - // http://en.wikipedia.org/wiki/List_of_countries_by_continent - public static final int TLD_EuropeRussia_ID = 0; // European languages but no english - public static final int TLD_MiddleSouthAmerica_ID = 1; // mainly spanish-speaking countries - public static final int TLD_SouthEastAsia_ID = 2; // asia - public static final int TLD_MiddleEastWestAsia_ID = 3; // middle east - public static final int TLD_NorthAmericaOceania_ID = 4; // english-speaking countries - public static final int TLD_Africa_ID = 5; // africa - public static final int TLD_Generic_ID = 6; // anything else, mixed languages, mainly english public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter - private static final String[] TLD_NorthAmericaOceania={ - // primary english-speaking countries - // english-speaking countries from central america are also included - // includes also dutch and french colonies in the caribbean sea - // and US/English/Australian military bases in asia - "EDU=US Educational", - "GOV=US Government", - "MIL=US Military", - "NET=Network", - "ORG=Non-Profit Organization", - "AN=Netherlands Antilles", - "AS=American Samoa", - "AG=Antigua and Barbuda", - "AI=Anguilla", - "AU=Australia", - "BB=Barbados", - "BZ=Belize", - "BM=Bermuda", - "BS=Bahamas", - "CA=Canada", - "CC=Cocos (Keeling) Islands", - "CK=Cook Islands", - "CX=Christmas Island", // located in the Indian Ocean, but belongs to Australia - "DM=Dominica", - "FM=Micronesia", - "FJ=Fiji", - "GD=Grenada", - "GP=Guadeloupe", - "GS=South Georgia and the South Sandwich Islands", // south of south america, but administrated by british, has only a scientific base - "GU=Guam", // strategical US basis close to Japan - "HM=Heard and McDonald Islands", // uninhabited, sub-Antarctic island, owned by Australia - "HT=Haiti", - "IO=British Indian Ocean Territory", // UK-US naval support facility in the Indian Ocean - "KI=Kiribati", // 33 coral atolls in the pacific, formerly owned by UK - "KN=Saint Kitts and Nevis", // islands in the carribean see - "KY=Cayman Islands", - "LC=Saint Lucia", - "MH=Marshall Islands", // formerly US atomic bomb test site, now a key installation in the US missile defense network - "MP=Northern Mariana Islands", // US strategic location in the western Pacific Ocean - "NC=New Caledonia", - "NF=Norfolk Island", - "NR=Nauru", // independent UN island - "NU=Niue", // one of world's largest coral islands - "NZ=New Zealand (Aotearoa)", - "PG=Papua New Guinea", - "PN=Pitcairn", // overseas territory of the UK - "PR=Puerto Rico", // territory of the US with commonwealth status - "PW=Palau", // was once governed by Micronesia - "Sb=Solomon Islands", - "TC=Turks and Caicos Islands", // overseas territory of the UK - "TK=Tokelau", // group of three atolls in the South Pacific Ocean, british protectorat - "TO=Tonga", - "TT=Trinidad and Tobago", - "TV=Tuvalu", // nine coral atolls in the South Pacific Ocean; in 2000, Tuvalu leased its TLD ".tv" for $50 million over a 12-year period - "UM=US Minor Outlying Islands", // nine insular United States possessions in the Pacific Ocean and the Caribbean Sea - "US=United States", - "VC=Saint Vincent and the Grenadines", - "VG=Virgin Islands (British)", - "VI=Virgin Islands (U.S.)", - "VU=Vanuatu", - "WF=Wallis and Futuna Islands", - "WS=Samoa" - }; - private static final String[] TLD_MiddleSouthAmerica = { - // primary spanish and portugese-speaking - "AR=Argentina", - "AW=Aruba", - "BR=Brazil", - "BO=Bolivia", - "CL=Chile", - "CO=Colombia", - "CR=Costa Rica", - "CU=Cuba", - "DO=Dominican Republic", - "EC=Ecuador", - "FK=Falkland Islands (Malvinas)", - "GF=French Guiana", - "GT=Guatemala", - "GY=Guyana", - "HN=Honduras", - "JM=Jamaica", - "MX=Mexico", - "NI=Nicaragua", - "PA=Panama", - "PE=Peru", - "PY=Paraguay", - "SR=Suriname", - "SV=El Salvador", - "UY=Uruguay", - "VE=Venezuela" - }; - private static final String[] TLD_EuropeRussia = { - // includes also countries that are mainly french- dutch- speaking - // and culturally close to europe - "AD=Andorra", - "AL=Albania", - "AQ=Antarctica", - "AT=Austria", - "BA=Bosnia and Herzegovina", - "BE=Belgium", - "BG=Bulgaria", - "BV=Bouvet Island", // this island is uninhabited and covered by ice, south of africa but governed by Norway - "BY=Belarus", - "CH=Switzerland", - "CS=Czechoslovakia (former)", - "CZ=Czech Republic", - "CY=Cyprus", - "DE=Germany", - "DK=Denmark", - "ES=Spain", - "EE=Estonia", - "EU=Europe", - "FI=Finland", - "FO=Faroe Islands", // Viking Settlers - "FR=France", - "FX=France, Metropolitan", - "GB=Great Britain (UK)", - "GI=Gibraltar", - "GL=Greenland", - "GR=Greece", - "HR=Croatia (Hrvatska)", - "HU=Hungary", - "IE=Ireland", - "IS=Iceland", - "IT=Italy", - "LI=Liechtenstein", - "LT=Lithuania", - "LU=Luxembourg", - "LV=Latvia", - "MD=Moldova", - "MC=Monaco", - "MK=Macedonia", - "MN=Mongolia", - "MS=Montserrat", // British island in the Caribbean Sea, almost not populated because of strong vulcanic activity - "MT=Malta", - "MQ=Martinique", // island in the eastern Caribbean Sea, overseas department of France - "NATO=Nato field", - "NL=Netherlands", - "NO=Norway", - "PF=French Polynesia", // French annexed Polynesian island in the South Pacific, French atomic bomb test site - "PL=Poland", - "PM=St. Pierre and Miquelon", // french-administrated colony close to canada, belongs to France - "PT=Portugal", - "RO=Romania", - "RU=Russia", - "SE=Sweden", - "SI=Slovenia", - "SJ=Svalbard and Jan Mayen Islands", // part of Norway - "SM=San Marino", - "SK=Slovak Republic", - "SU=USSR (former)", - "TF=French Southern Territories", // islands in the arctic see, no inhabitants - "UK=United Kingdom", - "UA=Ukraine", - "VA=Vatican City State (Holy See)", - "YU=Yugoslavia" - }; - - private static final String[] TLD_MiddleEastWestAsia = { - // states that are influenced by islamic culture and arabic language - // includes also eurasia states and those that had been part of the former USSR and close to southwest asia - "AE=United Arab Emirates", - "AF=Afghanistan", - "AM=Armenia", - "AZ=Azerbaijan", - "BH=Bahrain", - "GE=Georgia", - "IL=Israel", - "IQ=Iraq", - "IR=Iran", - "JO=Jordan", - "KG=Kyrgyzstan", - "KZ=Kazakhstan", - "KW=Kuwait", - "LB=Lebanon", - "OM=Oman", - "QA=Qatar", - "SA=Saudi Arabia", - "SY=Syria", - "TJ=Tajikistan", - "TM=Turkmenistan", - "PK=Pakistan", - "TR=Turkey", - "UZ=Uzbekistan", - "YE=Yemen" - }; - private static final String[] TLD_SouthEastAsia = { - "BD=Bangladesh", - "BN=Brunei Darussalam", - "BT=Bhutan", - "CN=China", - "HK=Hong Kong", - "ID=Indonesia", - "IN=India", - "LA=Laos", - "NP=Nepal", - "JP=Japan", - "KH=Cambodia", - "KP=Korea (North)", - "KR=Korea (South)", - "LK=Sri Lanka", - "MY=Malaysia", - "MM=Myanmar", // formerly known as Burma - "MO=Macau", // Portuguese settlement, part of China, but has some autonomy - "MV=Maldives", // group of atolls in the Indian Ocean - "PH=Philippines", - "SG=Singapore", - "TP=East Timor", - "TH=Thailand", - "TW=Taiwan", - "VN=Viet Nam" - }; - private static final String[] TLD_Africa = { - "AO=Angola", - "BF=Burkina Faso", - "BI=Burundi", - "BJ=Benin", - "BW=Botswana", - "CF=Central African Republic", - "CG=Congo", - "CI=Cote D'Ivoire (Ivory Coast)", - "CM=Cameroon", - "CV=Cape Verde", - "DJ=Djibouti", - "DZ=Algeria", - "EG=Egypt", - "EH=Western Sahara", - "ER=Eritrea", - "ET=Ethiopia", - "GA=Gabon", - "GH=Ghana", - "GM=Gambia", - "GN=Guinea", - "GQ=Equatorial Guinea", - "GW=Guinea-Bissau", - "KE=Kenya", - "KM=Comoros", - "LR=Liberia", - "LS=Lesotho", - "LY=Libya", - "MA=Morocco", - "MG=Madagascar", - "ML=Mali", - "MR=Mauritania", - "MU=Mauritius", - "MW=Malawi", - "MZ=Mozambique", - "NA=Namibia", - "NE=Niger", - "NG=Nigeria", - "RE=Reunion", - "RW=Rwanda", - "SC=Seychelles", - "SD=Sudan", - "SH=St. Helena", - "SL=Sierra Leone", - "SN=Senegal", - "SO=Somalia", - "ST=Sao Tome and Principe", - "SZ=Swaziland", - "TD=Chad", - "TG=Togo", - "TN=Tunisia", - "TZ=Tanzania", - "UG=Uganda", - "ZA=South Africa", - "ZM=Zambia", - "ZR=Zaire", - "ZW=Zimbabwe", - "YT=Mayotte" - }; - private static final String[] TLD_Generic = { - "COM=US Commercial", - "AERO=", - "BIZ=", - "COOP=", - "INFO=", - "MUSEUM=", - "NAME=", - "PRO=", - "ARPA=", - "INT=International", - "ARPA=Arpanet", - "NT=Neutral Zone" - }; - public static String dummyHash; - - private static HashMap TLDID = new HashMap(); - private static HashMap TLDName = new HashMap(); - - private static void insertTLDProps(String[] TLDList, int id) { - int p; - String tld, name; - Integer ID = new Integer(id); - for (int i = 0; i < TLDList.length; i++) { - p = TLDList[i].indexOf('='); - if (p > 0) { - tld = TLDList[i].substring(0, p).toLowerCase(); - name = TLDList[i].substring(p + 1); - TLDID.put(tld, ID); - TLDName.put(tld, name); - } - } - } - static { // create a dummy hash dummyHash = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) dummyHash += "-"; - - // assign TLD-ids and names - insertTLDProps(TLD_EuropeRussia, TLD_EuropeRussia_ID); - insertTLDProps(TLD_MiddleSouthAmerica, TLD_MiddleSouthAmerica_ID); - insertTLDProps(TLD_SouthEastAsia, TLD_SouthEastAsia_ID); - insertTLDProps(TLD_MiddleEastWestAsia, TLD_MiddleEastWestAsia_ID); - insertTLDProps(TLD_NorthAmericaOceania, TLD_NorthAmericaOceania_ID); - insertTLDProps(TLD_Africa, TLD_Africa_ID); - insertTLDProps(TLD_Generic, TLD_Generic_ID); - // the id=7 is used to flag local addresses - } // class variables @@ -960,15 +632,10 @@ public class yacyURL { assert this.hash == null; // should only be called if the hash was not computed bevore - int p = this.host.lastIndexOf('.'); - String tld = "", dom = tld; - if (p > 0) { - tld = host.substring(p + 1); - dom = host.substring(0, p); - } - Integer ID = (serverDomains.isLocal(tld)) ? null : TLDID.get(tld); // identify local addresses - int id = (ID == null) ? 7 : ID.intValue(); // local addresses are flagged with id=7 + int id = serverDomains.getDomainID(this.host); // id=7: tld is local boolean isHTTP = this.protocol.equals("http"); + int p = this.host.lastIndexOf('.'); + String dom = (p > 0) ? dom = host.substring(0, p) : ""; p = dom.lastIndexOf('.'); // locate subdomain String subdom = ""; if (p > 0) { @@ -1081,10 +748,6 @@ public class yacyURL { return (kelondroBase64Order.enhancedCoder.decodeByte(urlHash.charAt(11)) & 28) >> 2; } - public static boolean isLocalDomain(String urlhash) { - return domDomain(urlhash) == 7; - } - public static boolean isDomDomain(String urlHash, int id) { return domDomain(urlHash) == id; } @@ -1096,7 +759,8 @@ public class yacyURL { // checks for local/global IP range and local IP public boolean isLocal() { - return serverDomains.isLocal(this.host); + if (this.hash == null) synchronized (this) {this.hash = urlHashComputation();} + return domDomain(this.hash) == 7; } // language calculation