Refactored code that re-implemented the DigestURL.hosthash() method.

This ensures a consistent implementation of the URL host hash generation
and makes its usages easier to find in the source code.

Also added a unit test for this function.
pull/105/head
luccioman 8 years ago
parent 86adfef30f
commit 0da1e6ba16

@ -33,7 +33,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
@ -117,7 +116,7 @@ public class WebStructurePicture_p {
String host = hostlist[i];
String hash = null;
try {
hash = ASCII.String((new DigestURL("http://" + host)).hash(), 6, 6);
hash = new DigestURL("http://" + host).hosthash();
} catch (final MalformedURLException e) {
continue;
}

@ -68,7 +68,7 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
else h = "http://" + h;
}
DigestURL url = new DigestURL(h);
return (url == null) ? null : ASCII.String(url.hash(), 6, 6);
return (url == null) ? null : url.hosthash();
}
/**
@ -209,8 +209,13 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
return this.hash;
}
/**
 * <p>Extract a fragment of the url hash that can be used as a hash for the host name part of this url.</p>
 * <p><strong>WARNING : two URLs with the same host name but different protocols or ports will produce two different host hashes with this method!</strong></p>
 * @return a 6-byte hash fragment, taken from the url hash starting at offset 6
 */
public String hosthash() {
    // single return: a second statement after the return would be unreachable code
    return ASCII.String(this.hash(), 6, 6);
}
/**
@ -333,10 +338,15 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
return hash.toString();
}
/**
 * Convenience variant computing the 6-byte host hash fragment of a host name,
 * delegating to the three-argument variant with the "http" protocol and port 80.
 * @param host host name. Must not be null.
 * @return 6 bytes base64 encoded String
 */
public static final String hosthash6(final String host) {
    final String defaultProtocol = "http";
    final int defaultPort = 80;
    return hosthash6(defaultProtocol, host, defaultPort);
}
//private static String[] testTLDs = new String[] { "com", "net", "org", "uk", "fr", "de", "es", "it" };
public static final int domLengthEstimation(final byte[] urlHashBytes) {

@ -200,9 +200,7 @@ public final class CrawlStacker {
Set<String> hosthashes = new HashSet<String>();
for (final AnchorURL url: hyperlinks) {
if (url == null) continue;
final byte[] urlhash = url.hash();
byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
hosthashes.add(ASCII.String(hosthash));
hosthashes.add(url.hosthash());
}
this.nextQueue.errorURL.removeHosts(hosthashes);
}

@ -245,7 +245,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable<URIMe
/**
 * Get the host hash of this document.
 * The value stored in the {@code CollectionSchema.host_id_s} field is preferred;
 * when that field is not set, the host hash is derived from the document url.
 * @return the host hash of this document
 */
public String hosthash() {
    final String stored = (String) this.getFieldValue(CollectionSchema.host_id_s.getSolrFieldName());
    if (stored != null) {
        return stored;
    }
    // no stored host id: compute it once, through the shared DigestURL helper
    return this.url.hosthash();
}

@ -619,14 +619,14 @@ public class WebStructureGraph {
protected void learnrefs(final LearnObject lro) {
final DigestURL url = lro.url;
final String sourceHosthash = ASCII.String(url.hash(), 6, 6);
final String sourceHosthash = url.hosthash();
// parse the new reference string and join it with the stored references
final StructureEntry structure = outgoingReferences(sourceHosthash);
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
int c;
for (final DigestURL u : lro.globalRefURLs) {
String domain = ASCII.String(u.hash(), 6, 6);
String domain = u.hosthash();
if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().shallTerminate()) break;
if (!exists(domain)) {
// this must be recorded as an host with no references

@ -3242,9 +3242,8 @@ public final class Switchboard extends serverSwitch {
}
// remove the document from the error-db
byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
Set<String> hosthashes = new HashSet<String>();
hosthashes.add(ASCII.String(hosthash));
hosthashes.add(url.hosthash());
this.crawlQueues.errorURL.removeHosts(hosthashes);
this.index.fulltext().remove(urlhash);

@ -3,8 +3,11 @@ package net.yacy.cora.document.id;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import junit.framework.TestCase;
import net.yacy.cora.document.encoding.ASCII;
import org.junit.Assert;
import org.junit.Test;
public class DigestURLTest extends TestCase {
@ -32,6 +35,27 @@ public class DigestURLTest extends TestCase {
}
}
/**
 * Test for {@link DigestURL#hosthash()} : the host hash must stay the same
 * for urls differing only in path, resource, query or anchor, and must
 * differ when the protocol, port or host name differ.
 */
@Test
public void testHostHash() throws MalformedURLException {
    /* Shortest example valid http url : protocol + domain name */
    final String hostHash = new DigestURL("http://example.test").hosthash();

    /* Each assertion uses a distinct url so that the case named in its message is really exercised */
    Assert.assertEquals("With path", hostHash, new DigestURL("http://example.test/path/").hosthash());
    Assert.assertEquals("With resource", hostHash, new DigestURL("http://example.test/path/resource.html").hosthash());
    Assert.assertEquals("With query parameters", hostHash, new DigestURL("http://example.test/path/resource.html?param1=value1&param2=value2").hosthash());
    Assert.assertEquals("Document with anchor identifier", hostHash, new DigestURL("http://example.test/path/resource.html#anchor").hosthash());

    /* The next two asserts would be debatable but reflect the current implementation */
    Assert.assertNotEquals("Different protocol", hostHash, new DigestURL("https://example.test").hosthash());
    Assert.assertNotEquals("Different port", hostHash, new DigestURL("http://example.test:8080").hosthash());

    Assert.assertNotEquals("With a different TLD", hostHash, new DigestURL("http://example.net").hosthash());
}
/**
* Test hash() of DigestURL and File protocol to deliver same hash for

Loading…
Cancel
Save