From 0da1e6ba16e0bf3002557519fddd9951492530cd Mon Sep 17 00:00:00 2001 From: luccioman Date: Mon, 16 Jan 2017 10:18:42 +0100 Subject: [PATCH] Factored code re-implementing DigestURL.hosthash() method. This ensures a consistent implementation of the URL host hash generation and makes its usages easier to find in the source code. Also added a unit test for this function. --- htroot/WebStructurePicture_p.java | 3 +-- .../net/yacy/cora/document/id/DigestURL.java | 16 ++++++++++--- source/net/yacy/crawler/CrawlStacker.java | 4 +--- .../kelondro/data/meta/URIMetadataNode.java | 2 +- .../peers/graphics/WebStructureGraph.java | 4 ++-- source/net/yacy/search/Switchboard.java | 3 +-- .../yacy/cora/document/id/DigestURLTest.java | 24 +++++++++++++++++++ 7 files changed, 43 insertions(+), 13 deletions(-) diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index 12bf1affa..b9f2a82a9 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -33,7 +33,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.RequestHeader; @@ -117,7 +116,7 @@ public class WebStructurePicture_p { String host = hostlist[i]; String hash = null; try { - hash = ASCII.String((new DigestURL("http://" + host)).hash(), 6, 6); + hash = new DigestURL("http://" + host).hosthash(); } catch (final MalformedURLException e) { continue; } diff --git a/source/net/yacy/cora/document/id/DigestURL.java b/source/net/yacy/cora/document/id/DigestURL.java index 31efcaad3..6b5f60d3b 100644 --- a/source/net/yacy/cora/document/id/DigestURL.java +++ b/source/net/yacy/cora/document/id/DigestURL.java @@ -68,7 +68,7 @@ public class DigestURL extends MultiProtocolURL implements Serializable { else h = "http://" + h; } DigestURL url = new DigestURL(h); - return (url == null) ? 
null : ASCII.String(url.hash(), 6, 6); + return (url == null) ? null : url.hosthash(); } /** @@ -209,8 +209,13 @@ public class DigestURL extends MultiProtocolURL implements Serializable { return this.hash; } + /** + *

Extracts the fragment of the URL hash that can be used as a hash for the host name part of this URL.

+ *

WARNING: two URLs with the same host name but different protocols or ports will produce two different host hashes with this method!

+ * @return a 6-byte hash fragment + */ public String hosthash() { - return ASCII.String(this.hash(), 6, 6); + return ASCII.String(this.hash(), 6, 6); } /** @@ -333,10 +338,15 @@ public class DigestURL extends MultiProtocolURL implements Serializable { return hash.toString(); } + /** + * Compute a 6-byte hash fragment that can be used to identify the domain name of an url. + * @param host host name. Must not be null. + * @return 6 bytes base64 encoded String + */ public static final String hosthash6(final String host) { return hosthash6("http", host, 80); } - + //private static String[] testTLDs = new String[] { "com", "net", "org", "uk", "fr", "de", "es", "it" }; public static final int domLengthEstimation(final byte[] urlHashBytes) { diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 8ae52a48f..5b66958ba 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -200,9 +200,7 @@ public final class CrawlStacker { Set hosthashes = new HashSet(); for (final AnchorURL url: hyperlinks) { if (url == null) continue; - final byte[] urlhash = url.hash(); - byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6); - hosthashes.add(ASCII.String(hosthash)); + hosthashes.add(url.hosthash()); } this.nextQueue.errorURL.removeHosts(hosthashes); } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 5dea4f9b4..5ca31c85a 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -245,7 +245,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable refs = (structure == null) ? 
new HashMap() : structure.references; int c; for (final DigestURL u : lro.globalRefURLs) { - String domain = ASCII.String(u.hash(), 6, 6); + String domain = u.hosthash(); if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().shallTerminate()) break; if (!exists(domain)) { // this must be recorded as an host with no references diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 38edaa3e0..1265086f2 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3242,9 +3242,8 @@ public final class Switchboard extends serverSwitch { } // remove the document from the error-db - byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6); Set hosthashes = new HashSet(); - hosthashes.add(ASCII.String(hosthash)); + hosthashes.add(url.hosthash()); this.crawlQueues.errorURL.removeHosts(hosthashes); this.index.fulltext().remove(urlhash); diff --git a/test/java/net/yacy/cora/document/id/DigestURLTest.java b/test/java/net/yacy/cora/document/id/DigestURLTest.java index 7756376f9..5557fa76b 100644 --- a/test/java/net/yacy/cora/document/id/DigestURLTest.java +++ b/test/java/net/yacy/cora/document/id/DigestURLTest.java @@ -3,8 +3,11 @@ package net.yacy.cora.document.id; import java.net.MalformedURLException; import java.util.HashSet; import java.util.Set; + import junit.framework.TestCase; import net.yacy.cora.document.encoding.ASCII; + +import org.junit.Assert; import org.junit.Test; public class DigestURLTest extends TestCase { @@ -32,6 +35,27 @@ public class DigestURLTest extends TestCase { } } + + /** + * Test for {@link DigestURL#hosthash()} : the host hash is expected to ignore path, resource, query parameters and anchor, but to differ for another protocol, port or host name. + */ + @Test + public void testHostHash() throws MalformedURLException { + /* Reference value : shortest example of a valid http URL (protocol + domain name) */ + String hostHash = new DigestURL("http://example.test").hosthash(); + + Assert.assertEquals("With path", hostHash, new DigestURL("http://example.test/path/").hosthash()); + 
Assert.assertEquals("With resource", hostHash, new DigestURL("http://example.test/path/resource.html").hosthash()); + Assert.assertEquals("With query parameters", hostHash, new DigestURL("http://example.test/path/resource.html?key=value").hosthash()); + Assert.assertEquals("Document with anchor identifier", hostHash, new DigestURL("http://example.test/path/resource.html#anchor").hosthash()); + + /* The next two asserts would be debatable but reflect the current implementation */ + Assert.assertNotEquals("Different protocol", hostHash, new DigestURL("https://example.test").hosthash()); + Assert.assertNotEquals("Different port", hostHash, new DigestURL("http://example.test:8080").hosthash()); + + Assert.assertNotEquals("With a different TLD", hostHash, new DigestURL("http://example.net").hosthash()); + + } /** * Test hash() of DigestURL and File protocol to deliver same hash for