introduced new url-hashes for better ranking computation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1013 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent aadace1285
commit 5f68b6886b

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.411
releaseVersion=0.412
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -257,21 +257,30 @@ public final class plasmaSearchResult {
}
public static void main(String[] args) {
URL[] urls = new URL[6];
URL[] urls = new URL[10];
try {
urls[0] = new URL("http://www.yacy.net");
urls[1] = new URL("http://www.yacy.net/");
urls[2] = new URL("http://www.yacy.net/index.html");
urls[3] = new URL("http://www.yacy.net/yacy");
urls[4] = new URL("http://www.yacy.net/yacy/");
urls[5] = new URL("http://www.yacy.net/yacy/index.html");
String[] paths1 = new String[6]; for (int i = 0; i < 6; i++) {
urls[1] = new URL("http://www.yacy.de/");
urls[2] = new URL("http://yacy.net/");
urls[3] = new URL("http://www.yacy.net:80/");
urls[4] = new URL("http://yacy.net:80/");
urls[5] = new URL("http://www.yacy.net/index.html");
urls[6] = new URL("http://www.yacy.net/yacy");
urls[7] = new URL("http://www.yacy.net/yacy/");
urls[8] = new URL("http://www.yacy.net/yacy/index.html");
urls[9] = new URL("ftp://www.yacy.net/yacy/index.html");
String hash, fill;
String[] paths1 = new String[urls.length]; for (int i = 0; i < urls.length; i++) {
fill = ""; for (int j = 0; j < 35 - urls[i].toString().length(); j++) fill +=" ";
paths1[i] = urlPath(urls[i]);
System.out.println("paths1[" + i + "] = " + paths1[i]);
hash = plasmaURL.urlHash(urls[i]);
System.out.println("paths1[" + urls[i] + fill +"] = " + hash + ", typeID=" + plasmaURL.flagTypeID(hash) + ", tldID=" + plasmaURL.flagTLDID(hash) + ", lengthID=" + plasmaURL.flagLengthID(hash) + " / " + paths1[i]);
}
String[] paths2 = new String[6]; for (int i = 0; i < 6; i++) {
String[] paths2 = new String[urls.length]; for (int i = 0; i < urls.length; i++) {
fill = ""; for (int j = 0; j < 35 - urls[i].toString().length(); j++) fill +=" ";
paths2[i] = shortenPath(paths1[i]);
System.out.println("paths2[" + i + "] = " + paths2[i]);
hash = plasmaURL.urlHash(urls[i]);
System.out.println("paths2[" + urls[i] + fill + "] = " + hash + ", typeID=" + plasmaURL.flagTypeID(hash) + ", tldID=" + plasmaURL.flagTLDID(hash) + ", lengthID=" + plasmaURL.flagLengthID(hash) + " / " + paths2[i]);
}
} catch (MalformedURLException e) {
e.printStackTrace();

@ -43,8 +43,10 @@ package de.anomic.plasma;
import java.io.IOException;
import java.net.URL;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -79,17 +81,346 @@ public class plasmaURL {
public static final int urlHostLength = 8; // the host as truncated name
public static final int urlHandleLength = 4; // a handle
// TLD table registered under region id 4 by the static initializer (via insertTLDProps).
// Each entry has the form "TLD=descriptive name"; the TLD part is lower-cased on registration.
private static final String[] TLD_NorthAmericaOceania={
// primarily English-speaking countries
// English-speaking countries from Central America are also included
// also includes Dutch and French colonies in the Caribbean Sea
"EDU=US Educational",
"GOV=US Government",
"MIL=US Military",
"NET=Network",
"ORG=Non-Profit Organization",
"AG=Antigua and Barbuda",
"AI=Anguilla",
"AU=Australia",
"BB=Barbados",
"BZ=Belize",
"BM=Bermuda",
"BS=Bahamas",
"CA=Canada",
"DM=Dominica",
"GD=Grenada",
"GP=Guadeloupe",
"KY=Cayman Islands",
"NZ=New Zealand (Aotearoa)",
"PM=St. Pierre and Miquelon",
"US=United States",
"VC=Saint Vincent and the Grenadines",
"VG=Virgin Islands (British)",
"VI=Virgin Islands (U.S.)",
"VU=Vanuatu",
"WF=Wallis and Futuna Islands",
"WS=Samoa"
};
// TLD table registered under region id 1 by the static initializer (via insertTLDProps).
// Each entry has the form "TLD=descriptive name".
private static final String[] TLD_MiddleSouthAmerica = {
// primarily Spanish- and Portuguese-speaking
"AR=Argentina",
"AW=Aruba",
"BR=Brazil",
"BO=Bolivia",
"CL=Chile",
"CO=Colombia",
"CR=Costa Rica",
"CU=Cuba",
"DO=Dominican Republic",
"EC=Ecuador",
"GF=French Guiana",
"FK=Falkland Islands (Malvinas)",
"GY=Guyana",
"HN=Honduras",
"JM=Jamaica",
"MX=Mexico",
"NI=Nicaragua",
"PA=Panama",
"PE=Peru",
"PY=Paraguay",
"SR=Suriname",
"SV=El Salvador",
"UY=Uruguay",
"VE=Venezuela"
};
// TLD table registered under region id 0 by the static initializer (via insertTLDProps).
// Each entry has the form "TLD=descriptive name".
private static final String[] TLD_EuropaRussia = {
// also includes countries that are mainly French- or Dutch-speaking
// and culturally close to Europe
"AD=Andorra",
"AL=Albania",
"AT=Austria",
"BA=Bosnia and Herzegovina",
"BE=Belgium",
"BG=Bulgaria",
"CH=Switzerland",
"CS=Czechoslovakia (former)",
"CZ=Czech Republic",
"CY=Cyprus",
"DE=Germany",
"DK=Denmark",
"ES=Spain",
"EE=Estonia",
"FI=Finland",
"FR=France",
"FX=France, Metropolitan",
"GB=Great Britain (UK)",
"GI=Gibraltar",
"GL=Greenland",
"GR=Greece",
"HR=Croatia (Hrvatska)",
"HU=Hungary",
"IE=Ireland",
"IS=Iceland",
"IT=Italy",
"LI=Liechtenstein",
"LT=Lithuania",
"LU=Luxembourg",
"LV=Latvia",
"MD=Moldova",
"MC=Monaco",
"MK=Macedonia",
"MN=Mongolia",
"MT=Malta",
"NATO=Nato field",
"NL=Netherlands",
"NO=Norway",
"PL=Poland",
"PT=Portugal",
"RO=Romania",
"RU=Russia",
"SE=Sweden",
"SI=Slovenia",
"SK=Slovak Republic",
"SU=USSR (former)",
"UK=United Kingdom",
"VA=Vatican City State (Holy See)",
"YU=Yugoslavia"
};
// TLD table registered under region id 3 by the static initializer (via insertTLDProps).
// Each entry has the form "TLD=descriptive name".
private static final String[] TLD_MiddleEastWestAsia = {
"AE=United Arab Emirates",
"AF=Afghanistan",
"AZ=Azerbaijan",
"BH=Bahrain",
"IL=Israel",
"IQ=Iraq",
"IR=Iran",
"PK=Pakistan",
"YE=Yemen"
};
// TLD table registered under region id 2 by the static initializer (via insertTLDProps).
// Each entry has the form "TLD=descriptive name".
private static final String[] TLD_SouthEastAsia = {
"BD=Bangladesh",
"BT=Bhutan",
"CN=China",
"HK=Hong Kong",
"ID=Indonesia",
"IN=India",
"NP=Nepal",
"JP=Japan",
"KH=Cambodia",
"KP=Korea (North)",
"KR=Korea (South)",
"LK=Sri Lanka",
"SG=Singapore",
"VN=Viet Nam"
};
// TLD table registered under region id 5 by the static initializer (via insertTLDProps).
// Each entry has the form "TLD=descriptive name".
private static final String[] TLD_Africa = {
"AO=Angola",
"BF=Burkina Faso",
"BI=Burundi",
"BJ=Benin",
"BW=Botswana",
"CF=Central African Republic",
"CG=Congo",
"CI=Cote D'Ivoire (Ivory Coast)",
"CM=Cameroon",
"DZ=Algeria",
"EG=Egypt",
"EH=Western Sahara",
"ER=Eritrea",
"ET=Ethiopia",
"GA=Gabon",
"GH=Ghana",
"GM=Gambia",
"GN=Guinea",
"KE=Kenya",
"LR=Liberia",
"LS=Lesotho",
"LY=Libya",
"MA=Morocco",
"MG=Madagascar",
"ML=Mali",
"MR=Mauritania",
"MU=Mauritius",
"MW=Malawi",
"MZ=Mozambique",
"NA=Namibia",
"NE=Niger",
"NG=Nigeria",
"RE=Reunion",
"RW=Rwanda",
"SH=St. Helena",
"SL=Sierra Leone",
"SN=Senegal",
"SO=Somalia",
"ST=Sao Tome and Principe",
"SZ=Swaziland",
"TG=Togo",
"TN=Tunisia",
"TZ=Tanzania",
"UG=Uganda",
"ZA=South Africa",
"ZM=Zambia",
"ZR=Zaire",
"ZW=Zimbabwe",
"YT=Mayotte"
};
// Generic (non-geographic) TLDs, registered under region id 6 by the static initializer
// (via insertTLDProps). Each entry has the form "TLD=descriptive name"; an empty name
// after '=' is stored as an empty string.
// ARPA is listed exactly once: a former second entry "ARPA=" was always overwritten by
// "ARPA=Arpanet" during registration and therefore had no effect.
private static final String[] TLD_Generic = {
    "COM=US Commercial",
    "AERO=",
    "BIZ=",
    "COOP=",
    "INFO=",
    "MUSEUM=",
    "NAME=",
    "PRO=",
    "INT=International",
    "ARPA=Arpanet"
};
// TLD table registered under region id 7 by the static initializer (via insertTLDProps).
// Id 7 is also the fallback that urlHash(URL) uses for any TLD not found in the lookup map.
private static final String[] TLD_Unassigned = {
"AQ=Antarctica",
"NT=Neutral Zone"
};
/*
http://www.odci.gov/cia/publications/factbook/
http://en.wikipedia.org/wiki/List_of_countries_by_continent
"AM=Armenia",
"AN=Netherlands Antilles",
"AS=American Samoa",
"BN=Brunei Darussalam",
"BV=Bouvet Island",
"BY=Belarus",
"CC=Cocos (Keeling) Islands",
"CK=Cook Islands",
"CV=Cape Verde",
"CX=Christmas Island",
"DJ=Djibouti",
"FJ=Fiji",
"FM=Micronesia",
"FO=Faroe Islands",
"GE=Georgia",
"GQ=Equatorial Guinea",
"GS=S. Georgia and S. Sandwich Isls.",
"GT=Guatemala",
"GU=Guam",
"GW=Guinea-Bissau",
"HM=Heard and McDonald Islands",
"HT=Haiti",
"IO=British Indian Ocean Territory",
"JO=Jordan",
"KG=Kyrgyzstan",
"KI=Kiribati",
"KM=Comoros",
"KN=Saint Kitts and Nevis",
"KW=Kuwait",
"KZ=Kazakhstan",
"LA=Laos",
"LB=Lebanon",
"LC=Saint Lucia",
"MH=Marshall Islands",
"MM=Myanmar",
"MO=Macau",
"MP=Northern Mariana Islands",
"MQ=Martinique",
"MS=Montserrat",
"MV=Maldives",
"MY=Malaysia",
"NC=New Caledonia",
"NF=Norfolk Island",
"NR=Nauru",
"NU=Niue",
"OM=Oman",
"PF=French Polynesia",
"PG=Papua New Guinea",
"PH=Philippines",
"PN=Pitcairn",
"PR=Puerto Rico",
"PW=Palau",
"QA=Qatar",
"SA=Saudi Arabia",
"Sb=Solomon Islands",
"SC=Seychelles",
"SD=Sudan",
"SJ=Svalbard and Jan Mayen Islands",
"SM=San Marino",
"SY=Syria",
"TC=Turks and Caicos Islands",
"TD=Chad",
"TF=French Southern Territories",
"TH=Thailand",
"TJ=Tajikistan",
"TK=Tokelau",
"TM=Turkmenistan",
"TO=Tonga",
"TP=East Timor",
"TR=Turkey",
"TT=Trinidad and Tobago",
"TV=Tuvalu",
"TW=Taiwan",
"UA=Ukraine",
"UM=US Minor Outlying Islands",
"UZ=Uzbekistan",
*/
/* new data fields to become valid after migration
* age of page at time of load
/* TLDs:
aero, biz, com, coop, edu, gov, info, int, mil, museum, name, net, org, pro, arpa
AC, AD, AE, AERO, AF, AG, AI, AL, AM, AN, AO, AQ, AR, ARPA, AS, AT, AU, AW, AZ,
BA, BB, BD, BE, BF, BG, BH, BI, BIZ, BJ, BM, BN, BO, BR, BS, BT, BV, BW, BY, BZ,
CA, CC, CD, CF, CG, CH, CI, CK, CL, CM, CN, CO, COM, COOP, CR, CU, CV, CX, CY, CZ,
DE, DJ, DK, DM, DO, DZ, EC, EDU, EE, EG, ER, ES, ET, EU, FI, FJ, FK, FM, FO, FR,
GA, GB, GD, GE, GF, GG, GH, GI, GL, GM, GN, GOV, GP, GQ, GR, GS, GT, GU, GW, GY,
HK, HM, HN, HR, HT, HU, ID, IE, IL, IM, IN, INFO, INT, IO, IQ, IR, IS, IT,
JE, JM, JO, JOBS, JP, KE, KG, KH, KI, KM, KN, KR, KW, KY, KZ,
LA, LB, LC, LI, LK, LR, LS, LT, LU, LV, LY,
MA, MC, MD, MG, MH, MIL, MK, ML, MM, MN, MO, MOBI, MP, MQ, MR, MS, MT, MU, MUSEUM, MV, MW, MX, MY, MZ,
NA, NAME, NC, NE, NET, NF, NG, NI, NL, NO, NP, NR, NU, NZ, OM, ORG,
PA, PE, PF, PG, PH, PK, PL, PM, PN, PR, PRO, PS, PT, PW, PY, QA, RE, RO, RU, RW,
SA, SB, SC, SD, SE, SG, SH, SI, SJ, SK, SL, SM, SN, SO, SR, ST, SU, SV, SY, SZ,
TC, TD, TF, TG, TH, TJ, TK, TL, TM, TN, TO, TP, TR, TRAVEL, TT, TV, TW, TZ,
UA, UG, UK, UM, US, UY, UZ, VA, VC, VE, VG, VI, VN, VU, WF, WS, YE, YT, YU, ZA, ZM, ZW
*/
public static String dummyHash;
private static HashMap TLDID = new HashMap();
private static HashMap TLDName = new HashMap();
/**
 * Registers all entries of one TLD table in the static lookup maps.
 *
 * Every entry is expected in the form "TLD=descriptive name". The part before the
 * first '=' is lower-cased and used as key; it is mapped to the given id in TLDID and
 * to the descriptive name (possibly empty) in TLDName. Entries without '=' or with an
 * empty key are ignored. A key that was registered before is overwritten.
 *
 * @param TLDList table of "TLD=name" entries
 * @param id      region id to assign to every TLD of the table
 */
private static void insertTLDProps(String[] TLDList, int id) {
    // one shared Integer instance for the whole table
    final Integer regionId = new Integer(id);
    for (int i = 0; i < TLDList.length; i++) {
        final String entry = TLDList[i];
        final int sep = entry.indexOf('=');
        if (sep <= 0) continue; // no separator or empty TLD part
        // Lower-case independently of the default locale: with a Turkish or Azeri
        // default locale, "IT".toLowerCase() would produce a dotless 'i' and the key
        // would never match the TLD of a (lower-case) host name.
        final String tld = entry.substring(0, sep).toLowerCase(java.util.Locale.ENGLISH);
        TLDID.put(tld, regionId);
        TLDName.put(tld, entry.substring(sep + 1));
    }
}
static {
// create a dummy hash
dummyHash = "";
for (int i = 0; i < urlHashLength; i++) dummyHash += "-";
// assign TLD-ids and names
insertTLDProps(TLD_EuropaRussia, 0);
insertTLDProps(TLD_MiddleSouthAmerica, 1);
insertTLDProps(TLD_SouthEastAsia, 2);
insertTLDProps(TLD_MiddleEastWestAsia, 3);
insertTLDProps(TLD_NorthAmericaOceania, 4);
insertTLDProps(TLD_Africa, 5);
insertTLDProps(TLD_Generic, 6);
insertTLDProps(TLD_Unassigned, 7);
}
// the class object
public kelondroTree urlHashCache;
private HashSet existsIndex;
@ -128,13 +459,72 @@ public class plasmaURL {
} catch (IOException e) {}
}
/**
 * Reads the protocol flag from a url hash.
 *
 * The flag is bit 5 (value 32) of the decoded character at index 11 of the hash.
 * urlHash(URL) leaves this bit clear for the "http" protocol and sets it otherwise.
 *
 * @param hash a url hash with at least 12 characters
 * @return 0 or 1, the value of the protocol bit
 */
public static final int flagTypeID(String hash) {
    final int flags = serverCodings.enhancedCoder.decodeBase64Byte(hash.charAt(11));
    return (flags >> 5) & 1;
}
/**
 * Reads the TLD region id from a url hash.
 *
 * The id occupies bits 2..4 (mask 28) of the decoded character at index 11 of the
 * hash; urlHash(URL) stores the id looked up for the host's TLD there (7 if unknown).
 *
 * @param hash a url hash with at least 12 characters
 * @return the region id, a value in 0..7
 */
public static final int flagTLDID(String hash) {
    final int flags = serverCodings.enhancedCoder.decodeBase64Byte(hash.charAt(11));
    return (flags >> 2) & 7;
}
/**
 * Reads the domain-length class from a url hash.
 *
 * The class occupies the two lowest bits (mask 3) of the decoded character at index 11
 * of the hash; urlHash(URL) stores 0 for domain names of up to 8 characters, 1 for up
 * to 12, 2 for up to 16 and 3 for longer ones.
 *
 * @param hash a url hash with at least 12 characters
 * @return the length class, a value in 0..3
 */
public static final int flagLengthID(String hash) {
    final int flags = serverCodings.enhancedCoder.decodeBase64Byte(hash.charAt(11));
    return flags & 3;
}
/**
 * Computes the url hash for a url given as string.
 *
 * @param url textual form of the url
 * @return the hash as computed by urlHash(URL), or null if the string cannot be
 *         parsed as a URL
 */
public static final String urlHash(String url) {
    final URL parsed;
    try {
        parsed = new URL(url);
    } catch (MalformedURLException e) {
        // not a parseable url: signal this with a null hash
        return null;
    }
    return urlHash(parsed);
}
/**
 * Computes the 12-character url hash.
 *
 * Layout of the result:
 *   chars 0..4  : first 5 characters of encodeMD5B64 over the normal form of the url
 *   char  5     : first character of encodeMD5B64 over "subdomain:port:rootpath"
 *   chars 6..10 : first 5 characters of encodeMD5B64 over "protocol:host:port"
 *   char  11    : flag byte (protocol bit, TLD region id, domain length class),
 *                 readable with flagTypeID, flagTLDID and flagLengthID
 *
 * Notes on the fragments:
 * - a host without a dot yields an empty TLD and an empty domain
 * - a missing port defaults to 80 for "http" and to 21 for every other protocol
 * - the root path is the first path segment, and only if the path (without leading
 *   and trailing slash) contains a further '/'
 *
 * @param url the url to hash
 * @return the hash, or null if url is null
 */
public static final String urlHash(URL url) {
    if (url == null) return null;

    // split the lower-cased host into subdomain, domain and tld
    final String host = url.getHost().toLowerCase();
    String tld = "";
    String domain = "";
    final int tldDot = host.lastIndexOf('.');
    if (tldDot > 0) {
        domain = host.substring(0, tldDot);
        tld = host.substring(tldDot + 1);
    }
    String subdomain = "";
    final int domainDot = domain.lastIndexOf('.');
    if (domainDot > 0) {
        subdomain = domain.substring(0, domainDot);
        domain = domain.substring(domainDot + 1);
    }

    // region id of the tld; unknown tlds fall back to id 7
    final Integer knownRegion = (Integer) TLDID.get(tld);
    final int regionId = (knownRegion != null) ? knownRegion.intValue() : 7;

    // protocol and effective port
    final String protocol = url.getProtocol();
    final boolean http = protocol.equals("http");
    int port = url.getPort();
    if (port <= 0) {
        if (http) port = 80; else port = 21;
    }

    // first path segment, taken only if more segments follow
    String path = url.getPath();
    if (path.startsWith("/")) path = path.substring(1);
    if (path.endsWith("/")) path = path.substring(0, path.length() - 1);
    final int firstSlash = path.indexOf('/');
    final String rootpath = (firstSlash > 0) ? path.substring(0, firstSlash) : "";

    // classify the domain length into four buckets
    final int domainLength = domain.length();
    final int lengthClass;
    if (domainLength > 16) {
        lengthClass = 3;
    } else if (domainLength > 12) {
        lengthClass = 2;
    } else if (domainLength > 8) {
        lengthClass = 1;
    } else {
        lengthClass = 0;
    }

    // assemble the flag byte: bit 5 = not http, bits 2..4 = region id, bits 0..1 = length class
    int flags = (regionId << 2) | lengthClass;
    if (!http) flags |= 32;
    final byte flagbyte = (byte) flags;

    // 'local' part of the hash
    final String urlPart = serverCodings.encodeMD5B64(htmlFilterContentScraper.urlNormalform(url), true).substring(0, 5);
    final char pathPart = serverCodings.encodeMD5B64(subdomain + ":" + port + ":" + rootpath, true).charAt(0);
    // 'global' part of the hash
    final String hostPart = serverCodings.encodeMD5B64(protocol + ":" + host + ":" + port, true).substring(0, 5);
    final char flagPart = serverCodings.enhancedCoder.encodeBase64Byte(flagbyte);

    // concatenate the four fragments
    final StringBuffer hash = new StringBuffer(12);
    hash.append(urlPart).append(pathPart).append(hostPart).append(flagPart);
    return hash.toString();
}
/**
 * Computes the url hash in the previous format: the first urlHashLength characters
 * of encodeMD5B64 over the normal form of the url.
 *
 * @param url the url to hash
 * @return the hash, or null if url is null
 */
public static final String oldurlHash(URL url) {
    return (url == null)
        ? null
        : serverCodings.encodeMD5B64(htmlFilterContentScraper.urlNormalform(url), true).substring(0, urlHashLength);
}
public static final String urlHash(String url) {
public static final String oldurlHash(String url) {
if ((url == null) || (url.length() < 10)) return null;
String hash = serverCodings.encodeMD5B64(htmlFilterContentScraper.urlNormalform(url), true).substring(0, urlHashLength);
return hash;
@ -143,21 +533,5 @@ public class plasmaURL {
/**
 * Returns an iterator obtained from urlHashCache.rows(up, false, key), where key is
 * the byte form of the given hash.
 * NOTE(review): presumably this enumerates the cache rows starting at the given hash,
 * ascending when up is true; confirm against kelondroTree.rows.
 *
 * @param urlHash the hash handed to the cache as key
 * @param up      passed through as first argument of rows
 * @return the iterator delivered by the cache
 * @throws IOException if the cache access fails
 */
public Iterator urlHashes(String urlHash, boolean up) throws IOException {
    final byte[] key = urlHash.getBytes();
    return urlHashCache.rows(up, false, key);
}
/* TLDs:
AC, AD, AE, AERO, AF, AG, AI, AL, AM, AN, AO, AQ, AR, ARPA, AS, AT, AU, AW, AZ,
BA, BB, BD, BE, BF, BG, BH, BI, BIZ, BJ, BM, BN, BO, BR, BS, BT, BV, BW, BY, BZ,
CA, CC, CD, CF, CG, CH, CI, CK, CL, CM, CN, CO, COM, COOP, CR, CU, CV, CX, CY, CZ,
DE, DJ, DK, DM, DO, DZ, EC, EDU, EE, EG, ER, ES, ET, EU, FI, FJ, FK, FM, FO, FR,
GA, GB, GD, GE, GF, GG, GH, GI, GL, GM, GN, GOV, GP, GQ, GR, GS, GT, GU, GW, GY,
HK, HM, HN, HR, HT, HU, ID, IE, IL, IM, IN, INFO, INT, IO, IQ, IR, IS, IT,
JE, JM, JO, JOBS, JP, KE, KG, KH, KI, KM, KN, KR, KW, KY, KZ,
LA, LB, LC, LI, LK, LR, LS, LT, LU, LV, LY,
MA, MC, MD, MG, MH, MIL, MK, ML, MM, MN, MO, MOBI, MP, MQ, MR, MS, MT, MU, MUSEUM, MV, MW, MX, MY, MZ,
NA, NAME, NC, NE, NET, NF, NG, NI, NL, NO, NP, NR, NU, NZ, OM, ORG,
PA, PE, PF, PG, PH, PK, PL, PM, PN, PR, PRO, PS, PT, PW, PY, QA, RE, RO, RU, RW,
SA, SB, SC, SD, SE, SG, SH, SI, SJ, SK, SL, SM, SN, SO, SR, ST, SU, SV, SY, SZ,
TC, TD, TF, TG, TH, TJ, TK, TL, TM, TN, TO, TP, TR, TRAVEL, TT, TV, TW, TZ,
UA, UG, UK, UM, US, UY, UZ, VA, VC, VE, VG, VI, VN, VU, WF, WS, YE, YT, YU, ZA, ZM, ZW
*/
}

@ -78,6 +78,15 @@ public final class serverCodings {
for (int i = 0; i < alpha.length; i++) ahpla[alpha[i]] = (byte) i;
}
/**
 * Maps a value to its character in this coder's alphabet.
 *
 * @param b index into the alphabet; a negative value or one beyond the alphabet
 *          raises an ArrayIndexOutOfBoundsException
 * @return the alphabet character at position b
 */
public char encodeBase64Byte(byte b) {
    final char symbol = alpha[b];
    return symbol;
}
/**
 * Maps an alphabet character back to its value, using the reverse table that was
 * filled from the alphabet (ahpla[alpha[i]] = i).
 *
 * @param b the character to decode; a character beyond the size of the reverse table
 *          raises an ArrayIndexOutOfBoundsException
 * @return the reverse-table entry for b
 */
public byte decodeBase64Byte(char b) {
    final byte value = ahpla[b];
    return value;
}
public String encodeBase64Long(long c, int length) {
if (length < 0) length = 0;
StringBuffer s = new StringBuffer(length); //String s = "";

Loading…
Cancel
Save