corrected a design mistake (5-byte hashes not necessary)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5119 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent c97d0fcee7
commit d09ddabd09

@ -277,7 +277,7 @@ public class CrawlResults {
prop.put("table_domains_" + cnt + "_feedbackpage", "CrawlResults.html");
prop.put("table_domains_" + cnt + "_tabletype", tabletype);
prop.put("table_domains_" + cnt + "_domain", domain);
prop.put("table_domains_" + cnt + "_hashpart", yacyURL.hosthash(domain));
prop.put("table_domains_" + cnt + "_hashpart", yacyURL.hosthash6(domain));
prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain));
dark = !dark;
cnt++;

@ -200,10 +200,10 @@ public final class ResultURLs {
}
public int deleteDomain(final int stack, String host, String hosthash) {
assert hosthash.length() == 5;
assert hosthash.length() == 6;
int i = 0;
while (i < getStackSize(stack)) {
if (getUrlHash(stack, i).substring(6, 11).equals(hosthash)) getStack(stack).remove(i); else i++;
if (getUrlHash(stack, i).substring(6).equals(hosthash)) getStack(stack).remove(i); else i++;
}
assert host != null : "host = null";
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";

@ -504,7 +504,7 @@ public final class indexRepositoryReference {
hashStat ds;
if (i != null) while (i.hasNext()) {
urlhash = new String(i.next());
hosthash = urlhash.substring(6, 11);
hosthash = urlhash.substring(6);
ds = map.get(hosthash);
if (ds == null) {
ds = new hashStat(urlhash);
@ -562,7 +562,7 @@ public final class indexRepositoryReference {
urlref = this.load(urlhash, null, 0);
if (urlref == null || urlref.comp() == null || urlref.comp().url() == null || urlref.comp().url().getHost() == null) continue;
if (statsDump == null) return new ArrayList<hostStat>().iterator(); // some other operation has destroyed the object
statsDump.add(new hostStat(urlref.comp().url().getHost(), urlhash.substring(6, 11), s.getScore(urlhash)));
statsDump.add(new hostStat(urlref.comp().url().getHost(), urlhash.substring(6), s.getScore(urlhash)));
count--;
if (count == 0) break;
}
@ -583,7 +583,7 @@ public final class indexRepositoryReference {
public String hostname, hosthash;
public int count;
public hostStat(String host, String urlhashfragment, int count) {
assert urlhashfragment.length() == 5;
assert urlhashfragment.length() == 6;
this.hostname = host;
this.hosthash = urlhashfragment;
this.count = count;
@ -599,13 +599,13 @@ public final class indexRepositoryReference {
*/
public int deleteDomain(String hosthash) throws IOException {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 5;
assert hosthash.length() == 6;
ArrayList<String> l = new ArrayList<String>();
kelondroCloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
String hash;
while (i.hasNext()) {
hash = new String(i.next());
if (hosthash.equals(hash.substring(6, 11))) l.add(hash);
if (hosthash.equals(hash.substring(6))) l.add(hash);
}
// then delete the urls using this list

@ -565,30 +565,6 @@ public class yacyURL implements Serializable {
this.getHost().toLowerCase() + ((defaultPort) ? ("") : (":" + this.port)) + path;
}
// public boolean equals(final Object other) {
// if(other != null && other instanceof yacyURL) {
// final yacyURL otherURL = (yacyURL) other;
// return (((this.protocol == otherURL.protocol) || (this.protocol.equals(otherURL.protocol))) &&
// ((this.host == otherURL.host ) || (this.host.equals(otherURL.host))) &&
// ((this.userInfo == otherURL.userInfo) || (this.userInfo.equals(otherURL.userInfo))) &&
// ((this.path == otherURL.path ) || (this.path.equals(otherURL.path))) &&
// ((this.quest == otherURL.quest ) || (this.quest.equals(otherURL.quest))) &&
// ((this.ref == otherURL.ref ) || (this.ref.equals(otherURL.ref))) &&
// ((this.port == otherURL.port )));
// }
// return super.equals(other);
// }
//
// /**
// * hash code computation for yacyURL: please don't mix this up with the YaCy-Hash
// * this hash here is only used by hashing data structures, like a HashMap
// * We do not use the yacy hash here, because this needs the computation of a DNS
// * which is very time-intensive
// */
// public int hashCode() {
// return this.toNormalform(true, false).hashCode();
// }
/* (non-Javadoc)
* @see java.lang.Object#hashCode()
*/
@ -737,7 +713,7 @@ public class yacyURL implements Serializable {
hash.append(kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(toNormalform(true, true))).substring(0, 5)); // 5 chars
hash.append(subdomPortPath(subdom, port, rootpath)); // 1 char
// form the 'global' part of the hash
hash.append(hosthash(this.protocol, host, port)); // 5 chars
hash.append(hosthash5(this.protocol, host, port)); // 5 chars
hash.append(kelondroBase64Order.enhancedCoder.encodeByte(flagbyte)); // 1 char
// return result hash
@ -755,25 +731,38 @@ public class yacyURL implements Serializable {
return (urlHash.charAt(5) == rootURLFlag0) || (urlHash.charAt(5) == rootURLFlag1);
}
/**
 * Builds the 5-character host hash fragment.
 * The string "protocol:host:port" is passed through serverCodings.encodeMD5Raw,
 * the result is encoded with kelondroBase64Order.enhancedCoder, and the first
 * five characters of that encoding are returned.
 * @param protocol protocol part of the digest input
 * @param host host part of the digest input
 * @param port port part of the digest input
 * @return the first 5 characters of the encoded digest
 */
private static final String hosthash5(final String protocol, final String host, final int port) {
    // the digest input joins the three components with ':'
    final String digestInput = protocol + ":" + host + ":" + port;
    return kelondroBase64Order.enhancedCoder
            .encode(serverCodings.encodeMD5Raw(digestInput))
            .substring(0, 5);
}
/**
* compute a 5-byte hash fragment that can be used to identify the domain of the url
* compute a 6-byte hash fragment that can be used to identify the domain of the url
* @param protocol
* @param host
* @param port
* @return 5 bytes base64 encoded String representing the domain of the url
* @return 6 bytes base64 encoded String representing the domain of the url
*/
public static final String hosthash(final String protocol, final String host, final int port) {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5);
/**
 * Computes the 6-character host hash fragment for a protocol/host/port triple:
 * the 5 characters produced by hosthash5 followed by one encoded flag character.
 * The flag byte is the bitwise OR of
 *   - 32 if the protocol is not "http" (0 otherwise),
 *   - serverDomains.getDomainID(host) shifted left by two bits,
 *   - a length key 0..3 for the host label directly left of the last label
 *     (0: up to 8 chars, 1: up to 12, 2: up to 16, 3: longer).
 * @param protocol protocol name; must not be null (it is compared with equals)
 * @param host host name; must not be null
 * @param port port number, only used for the 5-character part
 * @return 6 characters: hosthash5(protocol, host, port) plus the encoded flag byte
 */
public static final String hosthash6(final String protocol, final String host, final int port) {
    final StringBuilder hash = new StringBuilder(12);
    final int id = serverDomains.getDomainID(host); // id=7: tld is local
    // strip the last label (from the last '.' on); without a dot at index > 0 nothing remains
    int p = host.lastIndexOf('.');
    // fixed: the initializer no longer assigns 'dom' to itself inside the ternary
    String dom = (p > 0) ? host.substring(0, p) : "";
    // of the remainder keep only the part after its last '.'
    p = dom.lastIndexOf('.');
    if (p > 0) dom = dom.substring(p + 1);
    final int l = dom.length();
    final int domlengthKey = (l <= 8) ? 0 : (l <= 12) ? 1 : (l <= 16) ? 2 : 3;
    final byte flagbyte = (byte) (((protocol.equals("http")) ? 0 : 32) | (id << 2) | domlengthKey);
    hash.append(hosthash5(protocol, host, port)); // 5 chars
    hash.append(kelondroBase64Order.enhancedCoder.encodeByte(flagbyte)); // 1 char
    // return result hash
    return hash.toString();
}
public static final String hosthash(final String host) {
return hosthash("http", host, 80);
/**
 * Convenience variant of hosthash6 that fixes the protocol to "http" and the port to 80.
 * @param host host name, passed through unchanged to hosthash6(protocol, host, port)
 * @return the result of hosthash6("http", host, 80)
 */
public static final String hosthash6(final String host) {
return hosthash6("http", host, 80);
}
/**
 * Returns characters 6 to 10 (five characters) of the value returned by hash().
 * NOTE(review): other visible code now takes the host part as substring(6) and
 * asserts a length of 6 (e.g. deleteDomain) - confirm that callers of this method
 * really expect the 5-character fragment.
 * @return the 5-character slice [6, 11) of this URL's hash
 */
public final String hosthash() {
return this.hash().substring(6, 11);
}
private static String[] testTLDs = new String[] { "com", "net", "org", "uk", "fr", "de", "es", "it" };
public static final yacyURL probablyWordURL(final String urlHash, final TreeSet<String> words) {
@ -784,7 +773,7 @@ public class yacyURL implements Serializable {
if ((word == null) || (word.length() == 0)) continue;
final String pattern = urlHash.substring(6, 11);
for (int i = 0; i < testTLDs.length; i++) {
if (pattern.equals(hosthash("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80)))
if (pattern.equals(hosthash5("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80)))
try {
return new yacyURL("http://www." + word.toLowerCase() + "." + testTLDs[i], null);
} catch (final MalformedURLException e) {

@ -458,9 +458,9 @@ public final class yacy {
startupFinishedSync.V();
}
serverLog.logConfig("SHUTDOWN", "goodbye. (this is the last line)");
//try {
// System.exit(0);
//} catch (Exception e) {} // was once stopped by de.anomic.net.ftpc$sm.checkExit(ftpc.java:1790)
try {
System.exit(0);
} catch (Exception e) {} // was once stopped by de.anomic.net.ftpc$sm.checkExit(ftpc.java:1790)
}
/**

Loading…
Cancel
Save