diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java
index 12bf1affa..b9f2a82a9 100644
--- a/htroot/WebStructurePicture_p.java
+++ b/htroot/WebStructurePicture_p.java
@@ -33,7 +33,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
@@ -117,7 +116,7 @@ public class WebStructurePicture_p {
String host = hostlist[i];
String hash = null;
try {
- hash = ASCII.String((new DigestURL("http://" + host)).hash(), 6, 6);
+ hash = new DigestURL("http://" + host).hosthash();
} catch (final MalformedURLException e) {
continue;
}
diff --git a/source/net/yacy/cora/document/id/DigestURL.java b/source/net/yacy/cora/document/id/DigestURL.java
index 31efcaad3..6b5f60d3b 100644
--- a/source/net/yacy/cora/document/id/DigestURL.java
+++ b/source/net/yacy/cora/document/id/DigestURL.java
@@ -68,7 +68,7 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
else h = "http://" + h;
}
DigestURL url = new DigestURL(h);
- return (url == null) ? null : ASCII.String(url.hash(), 6, 6);
+ return (url == null) ? null : url.hosthash();
}
/**
@@ -209,8 +209,13 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
return this.hash;
}
+ /**
+ * Extract a fragment of the url hash that can be used as a hash for the host name part of this url.
+ * WARNING : two URLs with the same host name but different protocols or ports will produce two different host hashes with this method!
+ * @return a 6-byte hash fragment
+ */
public String hosthash() {
- return ASCII.String(this.hash(), 6, 6);
+ return ASCII.String(this.hash(), 6, 6);
}
/**
@@ -333,10 +338,15 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
return hash.toString();
}
+ /**
+ * Compute a 6-byte hash fragment that can be used to identify the domain name of a URL, assuming the "http" protocol and port 80.
+ * @param host host name. Must not be null.
+ * @return 6 bytes base64 encoded String
+ */
public static final String hosthash6(final String host) {
return hosthash6("http", host, 80);
}
-
+
//private static String[] testTLDs = new String[] { "com", "net", "org", "uk", "fr", "de", "es", "it" };
public static final int domLengthEstimation(final byte[] urlHashBytes) {
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 8ae52a48f..5b66958ba 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -200,9 +200,7 @@ public final class CrawlStacker {
Set hosthashes = new HashSet();
for (final AnchorURL url: hyperlinks) {
if (url == null) continue;
- final byte[] urlhash = url.hash();
- byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
- hosthashes.add(ASCII.String(hosthash));
+ hosthashes.add(url.hosthash());
}
this.nextQueue.errorURL.removeHosts(hosthashes);
}
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index 5dea4f9b4..5ca31c85a 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -245,7 +245,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable refs = (structure == null) ? new HashMap() : structure.references;
int c;
for (final DigestURL u : lro.globalRefURLs) {
- String domain = ASCII.String(u.hash(), 6, 6);
+ String domain = u.hosthash();
if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().shallTerminate()) break;
if (!exists(domain)) {
// this must be recorded as an host with no references
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 38edaa3e0..1265086f2 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -3242,9 +3242,8 @@ public final class Switchboard extends serverSwitch {
}
// remove the document from the error-db
- byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
Set hosthashes = new HashSet();
- hosthashes.add(ASCII.String(hosthash));
+ hosthashes.add(url.hosthash());
this.crawlQueues.errorURL.removeHosts(hosthashes);
this.index.fulltext().remove(urlhash);
diff --git a/test/java/net/yacy/cora/document/id/DigestURLTest.java b/test/java/net/yacy/cora/document/id/DigestURLTest.java
index 7756376f9..5557fa76b 100644
--- a/test/java/net/yacy/cora/document/id/DigestURLTest.java
+++ b/test/java/net/yacy/cora/document/id/DigestURLTest.java
@@ -3,8 +3,11 @@ package net.yacy.cora.document.id;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
+
import junit.framework.TestCase;
import net.yacy.cora.document.encoding.ASCII;
+
+import org.junit.Assert;
import org.junit.Test;
public class DigestURLTest extends TestCase {
@@ -32,6 +35,27 @@ public class DigestURLTest extends TestCase {
}
}
+
+    /**
+     * Test for {@link DigestURL#hosthash()} : the host hash must not depend on the path, resource, query or fragment of the url.
+     */
+    @Test
+    public void testHostHash() throws MalformedURLException {
+        /* Reference value : shortest example of a valid http url (protocol + domain name) */
+        String hostHash = new DigestURL("http://example.test").hosthash();
+
+        Assert.assertEquals("With path", hostHash, new DigestURL("http://example.test/path/").hosthash());
+        Assert.assertEquals("With resource", hostHash, new DigestURL("http://example.test/path/resource.html").hosthash());
+        Assert.assertEquals("With query parameters", hostHash, new DigestURL("http://example.test/path/resource.html?key=value").hosthash());
+        Assert.assertEquals("Document with anchor identifier", hostHash, new DigestURL("http://example.test/path/resource.html#anchor").hosthash());
+
+        /* The next two asserts would be debatable but reflect the current implementation */
+        Assert.assertNotEquals("Different protocol", hostHash, new DigestURL("https://example.test").hosthash());
+        Assert.assertNotEquals("Different port", hostHash, new DigestURL("http://example.test:8080").hosthash());
+
+        Assert.assertNotEquals("With a different TLD", hostHash, new DigestURL("http://example.net").hosthash());
+
+    }
/**
* Test hash() of DigestURL and File protocol to deliver same hash for