diff --git a/build.properties b/build.properties index df24c3c9f..118f6041e 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.411 +releaseVersion=0.412 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 014c3360b..c777292cc 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -257,21 +257,30 @@ public final class plasmaSearchResult { } public static void main(String[] args) { - URL[] urls = new URL[6]; + URL[] urls = new URL[10]; try { urls[0] = new URL("http://www.yacy.net"); - urls[1] = new URL("http://www.yacy.net/"); - urls[2] = new URL("http://www.yacy.net/index.html"); - urls[3] = new URL("http://www.yacy.net/yacy"); - urls[4] = new URL("http://www.yacy.net/yacy/"); - urls[5] = new URL("http://www.yacy.net/yacy/index.html"); - String[] paths1 = new String[6]; for (int i = 0; i < 6; i++) { + urls[1] = new URL("http://www.yacy.de/"); + urls[2] = new URL("http://yacy.net/"); + urls[3] = new URL("http://www.yacy.net:80/"); + urls[4] = new URL("http://yacy.net:80/"); + urls[5] = new URL("http://www.yacy.net/index.html"); + urls[6] = new URL("http://www.yacy.net/yacy"); + urls[7] = new URL("http://www.yacy.net/yacy/"); + urls[8] = new URL("http://www.yacy.net/yacy/index.html"); + urls[9] = new URL("ftp://www.yacy.net/yacy/index.html"); + String hash, fill; + String[] paths1 = new String[urls.length]; for (int i = 0; i < urls.length; i++) { + fill = ""; for (int j = 0; j < 35 - urls[i].toString().length(); j++) fill +=" "; paths1[i] = urlPath(urls[i]); - System.out.println("paths1[" + i + "] = " + paths1[i]); + hash = plasmaURL.urlHash(urls[i]); + System.out.println("paths1[" + urls[i] + fill +"] = " + hash + ", typeID=" + plasmaURL.flagTypeID(hash) + ", tldID=" + plasmaURL.flagTLDID(hash) + ", lengthID=" + plasmaURL.flagLengthID(hash) + " / " + paths1[i]); } - String[] paths2 = new String[6]; for (int i = 0; i < 6; i++) { + String[] paths2 = new String[urls.length]; for (int i = 0; i < urls.length; i++) { + fill = ""; for (int j = 0; j < 35 - urls[i].toString().length(); j++) fill +=" "; paths2[i] = shortenPath(paths1[i]); - System.out.println("paths2[" + i + "] = " + paths2[i]); + hash = plasmaURL.urlHash(urls[i]); + System.out.println("paths2[" + urls[i] + fill + "] = " + hash + ", typeID=" + plasmaURL.flagTypeID(hash) + ", tldID=" + plasmaURL.flagTLDID(hash) + ", lengthID=" + plasmaURL.flagLengthID(hash) + " / " + paths2[i]); } } catch (MalformedURLException e) { e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaURL.java b/source/de/anomic/plasma/plasmaURL.java index 3c43b9dbd..39c615be1 100644 --- a/source/de/anomic/plasma/plasmaURL.java +++ b/source/de/anomic/plasma/plasmaURL.java @@ -43,8 +43,10 @@ package de.anomic.plasma; import java.io.IOException; import java.net.URL; +import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.HashSet; +import java.util.HashMap; import java.util.Iterator; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -79,17 +81,346 @@ public class plasmaURL { public static final int urlHostLength = 8; // the host as struncated name public static final int urlHandleLength = 4; // a handle + private static final String[] TLD_NorthAmericaOceania={ + // primary english-speaking countries + // english-speaking countries from central america are also included + // includes also dutch and french colonies in the caribbean sea + "EDU=US Educational", + "GOV=US Government", + "MIL=US Military", + "NET=Network", + "ORG=Non-Profit Organization", + "AG=Antigua and Barbuda", + "AI=Anguilla", + "AU=Australia", + "BB=Barbados", + "BZ=Belize", + "BM=Bermuda", + "BS=Bahamas", + "CA=Canada", + "DM=Dominica", + "GD=Grenada", + "GP=Guadeloupe", + "KY=Cayman Islands", + "NZ=New Zealand (Aotearoa)", + "PM=St. Pierre and Miquelon", + "US=United States", + "VC=Saint Vincent and the Grenadines", + "VG=Virgin Islands (British)", + "VI=Virgin Islands (U.S.)", + "VU=Vanuatu", + "WF=Wallis and Futuna Islands", + "WS=Samoa" + }; + private static final String[] TLD_MiddleSouthAmerica = { + // primary spanish and portugese-speaking + "AR=Argentina", + "AW=Aruba", + "BR=Brazil", + "BO=Bolivia", + "CL=Chile", + "CO=Colombia", + "CR=Costa Rica", + "CU=Cuba", + "DO=Dominican Republic", + "EC=Ecuador", + "GF=French Guiana", + "FK=Falkland Islands (Malvinas)", + "GY=Guyana", + "HN=Honduras", + "JM=Jamaica", + "MX=Mexico", + "NI=Nicaragua", + "PA=Panama", + "PE=Peru", + "PY=Paraguay", + "SR=Suriname", + "SV=El Salvador", + "UY=Uruguay", + "VE=Venezuela" + }; + private static final String[] TLD_EuropaRussia = { + // includes also countries that are mainly french- dutch- speaking + // and culturally close to europe + "AD=Andorra", + "AL=Albania", + "AT=Austria", + "BA=Bosnia and Herzegovina", + "BE=Belgium", + "BG=Bulgaria", + "CH=Switzerland", + "CS=Czechoslovakia (former)", + "CZ=Czech Republic", + "CY=Cyprus", + "DE=Germany", + "DK=Denmark", + "ES=Spain", + "EE=Estonia", + "FI=Finland", + "FR=France", + "FX=France, Metropolitan", + "GB=Great Britain (UK)", + "GI=Gibraltar", + "GL=Greenland", + "GR=Greece", + "HR=Croatia (Hrvatska)", + "HU=Hungary", + "IE=Ireland", + "IS=Iceland", + "IT=Italy", + "LI=Liechtenstein", + "LT=Lithuania", + "LU=Luxembourg", + "LV=Latvia", + "MD=Moldova", + "MC=Monaco", + "MK=Macedonia", + "MN=Mongolia", + "MT=Malta", + "NATO=Nato field", + "NL=Netherlands", + "NO=Norway", + "PL=Poland", + "PT=Portugal", + "RO=Romania", + "RU=Russia", + "SE=Sweden", + "SI=Slovenia", + "SK=Slovak Republic", + "SU=USSR (former)", + "UK=United Kingdom", + "VA=Vatican City State (Holy See)", + "YU=Yugoslavia" + }; + private static final String[] TLD_MiddleEastWestAsia = { + "AE=United Arab Emirates", + "AF=Afghanistan", + "AZ=Azerbaijan", + "BH=Bahrain", + "IL=Israel", + "IQ=Iraq", + "IR=Iran", + "PK=Pakistan", + "YE=Yemen" + }; + private static final String[] TLD_SouthEastAsia = { + "BD=Bangladesh", + "BT=Bhutan", + "CN=China", + "HK=Hong Kong", + "ID=Indonesia", + "IN=India", + "NP=Nepal", + "JP=Japan", + "KH=Cambodia", + "KP=Korea (North)", + "KR=Korea (South)", + "LK=Sri Lanka", + "SG=Singapore", + "VN=Viet Nam" + }; + private static final String[] TLD_Africa = { + "AO=Angola", + "BF=Burkina Faso", + "BI=Burundi", + "BJ=Benin", + "BW=Botswana", + "CF=Central African Republic", + "CG=Congo", + "CI=Cote D'Ivoire (Ivory Coast)", + "CM=Cameroon", + "DZ=Algeria", + "EG=Egypt", + "EH=Western Sahara", + "ER=Eritrea", + "ET=Ethiopia", + "GA=Gabon", + "GH=Ghana", + "GM=Gambia", + "GN=Guinea", + "KE=Kenya", + "LR=Liberia", + "LS=Lesotho", + "LY=Libya", + "MA=Morocco", + "MG=Madagascar", + "ML=Mali", + "MR=Mauritania", + "MU=Mauritius", + "MW=Malawi", + "MZ=Mozambique", + "NA=Namibia", + "NE=Niger", + "NG=Nigeria", + "RE=Reunion", + "RW=Rwanda", + "SH=St. Helena", + "SL=Sierra Leone", + "SN=Senegal", + "SO=Somalia", + "ST=Sao Tome and Principe", + "SZ=Swaziland", + "TG=Togo", + "TN=Tunisia", + "TZ=Tanzania", + "UG=Uganda", + "ZA=South Africa", + "ZM=Zambia", + "ZR=Zaire", + "ZW=Zimbabwe", + "YT=Mayotte" + }; + private static final String[] TLD_Generic = { + "COM=US Commercial", + "AERO=", + "BIZ=", + "COOP=", + "INFO=", + "MUSEUM=", + "NAME=", + "PRO=", + "ARPA=", + "INT=International", + "ARPA=Arpanet" + }; + private static final String[] TLD_Unassigned = { + "AQ=Antarctica", + "NT=Neutral Zone" + }; + + /* + http://www.odci.gov/cia/publications/factbook/ + http://en.wikipedia.org/wiki/List_of_countries_by_continent + "AM=Armenia", + "AN=Netherlands Antilles", + "AS=American Samoa", + "BN=Brunei Darussalam", + "BV=Bouvet Island", + "BY=Belarus", + "CC=Cocos (Keeling) Islands", + "CK=Cook Islands", + "CV=Cape Verde", + "CX=Christmas Island", + "DJ=Djibouti", + "FJ=Fiji", + "FM=Micronesia", + "FO=Faroe Islands", + "GE=Georgia", + "GQ=Equatorial Guinea", + "GS=S. Georgia and S. Sandwich Isls.", + "GT=Guatemala", + "GU=Guam", + "GW=Guinea-Bissau", + "HM=Heard and McDonald Islands", + "HT=Haiti", + "IO=British Indian Ocean Territory", + "JO=Jordan", + "KG=Kyrgyzstan", + "KI=Kiribati", + "KM=Comoros", + "KN=Saint Kitts and Nevis", + "KW=Kuwait", + "KZ=Kazakhstan", + "LA=Laos", + "LB=Lebanon", + "LC=Saint Lucia", + "MH=Marshall Islands", + "MM=Myanmar", + "MO=Macau", + "MP=Northern Mariana Islands", + "MQ=Martinique", + "MS=Montserrat", + "MV=Maldives", + "MY=Malaysia", + "NC=New Caledonia", + "NF=Norfolk Island", + "NR=Nauru", + "NU=Niue", + "OM=Oman", + "PF=French Polynesia", + "PG=Papua New Guinea", + "PH=Philippines", + "PN=Pitcairn", + "PR=Puerto Rico", + "PW=Palau", + "QA=Qatar", + "SA=Saudi Arabia", + "Sb=Solomon Islands", + "SC=Seychelles", + "SD=Sudan", + "SJ=Svalbard and Jan Mayen Islands", + "SM=San Marino", + "SY=Syria", + "TC=Turks and Caicos Islands", + "TD=Chad", + "TF=French Southern Territories", + "TH=Thailand", + "TJ=Tajikistan", + "TK=Tokelau", + "TM=Turkmenistan", + "TO=Tonga", + "TP=East Timor", + "TR=Turkey", + "TT=Trinidad and Tobago", + "TV=Tuvalu", + "TW=Taiwan", + "UA=Ukraine", + "UM=US Minor Outlying Islands", + "UZ=Uzbekistan", + */ - /* nw data fields to become valid after migration - * age of page at time of load + /* TLDs: + aero, biz, com, coop, edu, gov, info, int, mil, museum, name, net, org, pro, arpa + AC, AD, AE, AERO, AF, AG, AI, AL, AM, AN, AO, AQ, AR, ARPA, AS, AT, AU, AW, AZ, + BA, BB, BD, BE, BF, BG, BH, BI, BIZ, BJ, BM, BN, BO, BR, BS, BT, BV, BW, BY, BZ, + CA, CC, CD, CF, CG, CH, CI, CK, CL, CM, CN, CO, COM, COOP, CR, CU, CV, CX, CY, CZ, + DE, DJ, DK, DM, DO, DZ, EC, EDU, EE, EG, ER, ES, ET, EU, FI, FJ, FK, FM, FO, FR, + GA, GB, GD, GE, GF, GG, GH, GI, GL, GM, GN, GOV, GP, GQ, GR, GS, GT, GU, GW, GY, + HK, HM, HN, HR, HT, HU, ID, IE, IL, IM, IN, INFO, INT, IO, IQ, IR, IS, IT, + JE, JM, JO, JOBS, JP, KE, KG, KH, KI, KM, KN, KR, KW, KY, KZ, + LA, LB, LC, LI, LK, LR, LS, LT, LU, LV, LY, + MA, MC, MD, MG, MH, MIL, MK, ML, MM, MN, MO, MOBI, MP, MQ, MR, MS, MT, MU, MUSEUM, MV, MW, MX, MY, MZ, + NA, NAME, NC, NE, NET, NF, NG, NI, NL, NO, NP, NR, NU, NZ, OM, ORG, + PA, PE, PF, PG, PH, PK, PL, PM, PN, PR, PRO, PS, PT, PW, PY, QA, RE, RO, RU, RW, + SA, SB, SC, SD, SE, SG, SH, SI, SJ, SK, SL, SM, SN, SO, SR, ST, SU, SV, SY, SZ, + TC, TD, TF, TG, TH, TJ, TK, TL, TM, TN, TO, TP, TR, TRAVEL, TT, TV, TW, TZ, + UA, UG, UK, UM, US, UY, UZ, VA, VC, VE, VG, VI, VN, VU, WF, WS, YE, YT, YU, ZA, ZM, ZW */ public static String dummyHash; + private static HashMap TLDID = new HashMap(); + private static HashMap TLDName = new HashMap(); + private static void insertTLDProps(String[] TLDList, int id) { + int p; + String tld, name; + Integer ID = new Integer(id); + for (int i = 0; i < TLDList.length; i++) { + p = TLDList[i].indexOf('='); + if (p > 0) { + tld = TLDList[i].substring(0, p).toLowerCase(); + name = TLDList[i].substring(p + 1); + TLDID.put(tld, ID); + TLDName.put(tld, name); + } + } + } static { + // create a dummy hash dummyHash = ""; for (int i = 0; i < urlHashLength; i++) dummyHash += "-"; + + // assign TLD-ids and names + insertTLDProps(TLD_EuropaRussia, 0); + insertTLDProps(TLD_MiddleSouthAmerica, 1); + insertTLDProps(TLD_SouthEastAsia, 2); + insertTLDProps(TLD_MiddleEastWestAsia, 3); + insertTLDProps(TLD_NorthAmericaOceania, 4); + insertTLDProps(TLD_Africa, 5); + insertTLDProps(TLD_Generic, 6); + insertTLDProps(TLD_Unassigned, 7); } + // the class object public kelondroTree urlHashCache; private HashSet existsIndex; @@ -128,13 +459,72 @@ public class plasmaURL { } catch (IOException e) {} } + public static final int flagTypeID(String hash) { + return (serverCodings.enhancedCoder.decodeBase64Byte(hash.charAt(11)) & 32) >> 5; + } + public static final int flagTLDID(String hash) { + return (serverCodings.enhancedCoder.decodeBase64Byte(hash.charAt(11)) & 28) >> 2; + } + public static final int flagLengthID(String hash) { + return (serverCodings.enhancedCoder.decodeBase64Byte(hash.charAt(11)) & 3); + } + + public static final String urlHash(String url) { + try { + return urlHash(new URL(url)); + } catch (MalformedURLException e) { + return null; + } + } public static final String urlHash(URL url) { + if (url == null) return null; + String host = url.getHost().toLowerCase(); + int p = host.lastIndexOf('.'); + String tld = "", dom = tld; + if (p > 0) { + tld = host.substring(p + 1); + dom = host.substring(0, p); + } + Integer ID = (Integer) TLDID.get(tld); + int id = (ID == null) ? 7 : ID.intValue(); + boolean isHTTP = url.getProtocol().equals("http"); + p = dom.lastIndexOf('.'); // locate subdomain + String subdom = ""; + if (p > 0) { + subdom = dom.substring(0, p); + dom = dom.substring(p + 1); + } + int port = url.getPort(); + if (port <= 0) port = (isHTTP) ? 80 : 21; + String path = url.getPath(); + if (path.startsWith("/")) path = path.substring(1); + if (path.endsWith("/")) path = path.substring(0, path.length() - 1); + p = path.indexOf('/'); + String rootpath = ""; + if (p > 0) { + rootpath = path.substring(0, p); + } + // we collected enough information to compute the fragments that are basis for hashes + int l = dom.length(); + int domlengthKey = (l <= 8) ? 0 : (l <= 12) ? 1 : (l <= 16) ? 2 : 3; + byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey); + // form the 'local' part of the hash + String hash3 = serverCodings.encodeMD5B64(htmlFilterContentScraper.urlNormalform(url), true).substring(0, 5); + char hash2 = serverCodings.encodeMD5B64(subdom + ":" + port + ":" + rootpath, true).charAt(0); + // form the 'global' part of the hash + String hash1 = serverCodings.encodeMD5B64(url.getProtocol() + ":" + host + ":" + port, true).substring(0, 5); + char hash0 = serverCodings.enhancedCoder.encodeBase64Byte(flagbyte); + // combine the hashes + return hash3 + hash2 + hash1 + hash0; + } + + public static final String oldurlHash(URL url) { if (url == null) return null; String hash = serverCodings.encodeMD5B64(htmlFilterContentScraper.urlNormalform(url), true).substring(0, urlHashLength); return hash; } - public static final String urlHash(String url) { + public static final String oldurlHash(String url) { if ((url == null) || (url.length() < 10)) return null; String hash = serverCodings.encodeMD5B64(htmlFilterContentScraper.urlNormalform(url), true).substring(0, urlHashLength); return hash; @@ -143,21 +533,5 @@ public class plasmaURL { public Iterator urlHashes(String urlHash, boolean up) throws IOException { return urlHashCache.rows(up, false, urlHash.getBytes()); } - - /* TLDs: - AC, AD, AE, AERO, AF, AG, AI, AL, AM, AN, AO, AQ, AR, ARPA, AS, AT, AU, AW, AZ, - BA, BB, BD, BE, BF, BG, BH, BI, BIZ, BJ, BM, BN, BO, BR, BS, BT, BV, BW, BY, BZ, - CA, CC, CD, CF, CG, CH, CI, CK, CL, CM, CN, CO, COM, COOP, CR, CU, CV, CX, CY, CZ, - DE, DJ, DK, DM, DO, DZ, EC, EDU, EE, EG, ER, ES, ET, EU, FI, FJ, FK, FM, FO, FR, - GA, GB, GD, GE, GF, GG, GH, GI, GL, GM, GN, GOV, GP, GQ, GR, GS, GT, GU, GW, GY, - HK, HM, HN, HR, HT, HU, ID, IE, IL, IM, IN, INFO, INT, IO, IQ, IR, IS, IT, - JE, JM, JO, JOBS, JP, KE, KG, KH, KI, KM, KN, KR, KW, KY, KZ, - LA, LB, LC, LI, LK, LR, LS, LT, LU, LV, LY, - MA, MC, MD, MG, MH, MIL, MK, ML, MM, MN, MO, MOBI, MP, MQ, MR, MS, MT, MU, MUSEUM, MV, MW, MX, MY, MZ, - NA, NAME, NC, NE, NET, NF, NG, NI, NL, NO, NP, NR, NU, NZ, OM, ORG, - PA, PE, PF, PG, PH, PK, PL, PM, PN, PR, PRO, PS, PT, PW, PY, QA, RE, RO, RU, RW, - SA, SB, SC, SD, SE, SG, SH, SI, SJ, SK, SL, SM, SN, SO, SR, ST, SU, SV, SY, SZ, - TC, TD, TF, TG, TH, TJ, TK, TL, TM, TN, TO, TP, TR, TRAVEL, TT, TV, TW, TZ, - UA, UG, UK, UM, US, UY, UZ, VA, VC, VE, VG, VI, VN, VU, WF, WS, YE, YT, YU, ZA, ZM, ZW - */ + } diff --git a/source/de/anomic/server/serverCodings.java b/source/de/anomic/server/serverCodings.java index 8dcf48f43..8f4eba5e3 100644 --- a/source/de/anomic/server/serverCodings.java +++ b/source/de/anomic/server/serverCodings.java @@ -78,6 +78,15 @@ public final class serverCodings { for (int i = 0; i < alpha.length; i++) ahpla[alpha[i]] = (byte) i; } + + public char encodeBase64Byte(byte b) { + return alpha[b]; + } + + public byte decodeBase64Byte(char b) { + return ahpla[b]; + } + public String encodeBase64Long(long c, int length) { if (length < 0) length = 0; StringBuffer s = new StringBuffer(length); //String s = "";