Updated the generic top-level known domains list.

Using current IANA reference list at
https://www.iana.org/domains/root/db

The URL hashes generated for these domains stay the same, but performance
is greatly improved because URL hash computation requires a DNS resolution
request whenever the TLD part of the host name is unknown.

Mean hash computation time, measured on 1541 sample URLs (one per TLD)
on a computer with a DSL connection: about 230 ms before the change,
and only about 20 ms after it.
pull/144/head
luccioman 7 years ago
parent 938d8a9731
commit ac209cac2e

@ -221,7 +221,7 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
/**
* calculated YaCy-Hash of this URI
*
* @note needs DNS lookup to check if the addresses domain is local
* @note needs DNS lookup to check if the addresses domain is local (when the top-level domain part in the host name is unknown)
* that causes that this method may be very slow
*
* @return hash

@ -478,21 +478,28 @@ public class Domains {
"ZW=Zimbabwe",
"YT=Mayotte"
};
private static final String[] TLD_Generic = {
"COM=US Commercial",
private static final String[] TLD_Sponsored = {
"AERO=The air-transport industry",
"ARPA=operationally-critical infrastructural identifier spaces",
"BIZ=Business",
"COOP=cooperative associations",
"INFO=",
"JOBS=human resource managers",
"MOBI=mobile products and services",
"MUSEUM=Museums",
"NAME=Individuals",
"PRO=Credentialed professionals",
"TEL=Published contact data",
"TRAVEL=The travel industry",
"INT=International",
};
private static final String[] TLD_Infrastructure = {
"ARPA=operationally-critical infrastructural identifier spaces",
};
private static final String[] TLD_GenericRestricted = {
"BIZ=Business",
"NAME=Individuals",
"PRO=Credentialed professionals",
};
private static final String[] TLD_OpenNIC = {
// domains from the OpenNIC project, http://www.opennicproject.org, see also http://wiki.opennic.glue/OpenNICNamespaces
"GLUE=OpenNIC Internal Architectural use",
"BBS=OpenNIC Bulletin Board System servers",
@ -658,7 +665,10 @@ public class Domains {
ccSLD_TLD.addAll(Arrays.asList(ccSLD_TLD_list));
}
private static Map<String, Integer> TLDID = new ConcurrentHashMap<String, Integer>(32);
/**
* Maps top-level domains (lower case) to TLD category identifiers.
*/
private static Map<String, Integer> TLDID = new ConcurrentHashMap<String, Integer>();
//private static HashMap<String, String> TLDName = new HashMap<String, String>();
private static void insertTLDProps(final String[] TLDList, final int id) {
@ -697,8 +707,21 @@ public class Domains {
insertTLDProps(TLD_MiddleEastWestAsia, TLD_MiddleEastWestAsia_ID);
insertTLDProps(TLD_NorthAmericaOceania, TLD_NorthAmericaOceania_ID);
insertTLDProps(TLD_Africa, TLD_Africa_ID);
insertTLDProps(TLD_Generic, TLD_Generic_ID);
// the id=7 is used to flag local addresses
for(GenericTLD tld : GenericTLD.values()) {
TLDID.put(tld.getDomainName(), TLD_Generic_ID);
}
/*
* IANA lists the following top-level domains in other categories than 'generic' but we
* still associate them with YaCy's TLD_Generic_ID otherwise the URLs hash would
* be modified
*/
insertTLDProps(TLD_GenericRestricted, TLD_Generic_ID);
insertTLDProps(TLD_Infrastructure, TLD_Generic_ID);
insertTLDProps(TLD_Sponsored, TLD_Generic_ID);
insertTLDProps(TLD_OpenNIC, TLD_Generic_ID);
// the id=7 (TLD_Local_ID) is used to flag local addresses
}
private static KeyList globalHosts = null;
@ -1113,6 +1136,7 @@ public class Domains {
public static int getDomainID(final String host, final InetAddress hostaddress) {
if (host == null || host.isEmpty()) return TLD_Local_ID;
final int p = host.lastIndexOf('.');
// TODO (must be careful as this would change URL hash generation) : lower case the TLD part before checking its category id, as the TLDID map contains lower cased TLDs as keys
final String tld = (p > 0) ? host.substring(p + 1) : "";
final Integer i = TLDID.get(tld);
if (i != null) return i.intValue();
@ -1182,6 +1206,7 @@ public class Domains {
// check simply if the tld in the host is a known tld
final int p = host.lastIndexOf('.');
// TODO (must be careful as this would change URL hash generation) : lower case the TLD part before checking its category id, as the TLDID map contains lower cased TLDs as keys
final String tld = (p > 0) ? host.substring(p + 1) : "";
final Integer i = TLDID.get(tld);
if (i != null) return false;
@ -1195,7 +1220,8 @@ public class Domains {
private static boolean isLocal(final InetAddress a) {
final boolean
localp = noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
a == null ||
a == null || // TODO returning true here after dns resolution failed can make hash generation inconsistent on some hosts
// (hash is marked with TLD_LOCAL_ID when host name is not found within timeout, but then is marked again with TLD_Generic when the host name is found within timeout on another request)
a.isAnyLocalAddress() ||
a.isLinkLocalAddress() ||
a.isLoopbackAddress() ||
@ -1313,4 +1339,4 @@ public class Domains {
System.out.println("Intranet IP: " + b);
}
}
}
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,114 @@
// DigestURLHashPerfTest.java
// -----------------------
// part of YaCy
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.cora.document.id;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.output.NullOutputStream;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.util.FileUtils;
/**
 * Manual performance test for {@link DigestURL#hash()}.
 * <p>
 * This is a command-line tool, not an automated unit test: it reads a list of
 * URLs from a file, computes the hash of each one and prints the total, mean,
 * minimum and maximum hash generation times.
 */
public class DigestURLHashPerfTest {

    /**
     * Runs and measures the {@link DigestURL#hash()} method on a list of URLs
     * provided in a given file (one URL per line). When an output file path is
     * provided, the generated hashes are written to it (one hash per line,
     * UTF-8 encoded); otherwise they are discarded.
     * <p>
     * Only the call to {@link DigestURL#hash()} is timed: URL parsing and the
     * writing of the hash are excluded from the measurement. Malformed URLs
     * are reported on the standard error stream and skipped.
     *
     * @param args
     *            {@code args[0]} : path of the file containing the URLs
     *            (required); {@code args[1]} : path of the file to write the
     *            hashes to (optional)
     * @throws IOException
     *             when the URLs file can not be read or the output file can
     *             not be written
     */
    public static void main(final String[] args) throws IOException {
        if (args.length < 1) {
            System.out.println("Usage : java DigestURLHashPerfTest <urlsFilePath> [outputFilePath]");
            return;
        }

        final File inFile = new File(args[0]);
        final List<String> urls = FileUtils.getListArray(inFile);
        System.out.println(urls.size() + " URLs loaded from " + inFile.getAbsolutePath());

        final boolean writeHashes = args.length >= 2;

        // Without an output path, hashes go to a null stream so that the measured loop is the same in both modes
        try (OutputStream outStream = writeHashes ? new FileOutputStream(args[1]) : new NullOutputStream();
                OutputStreamWriter writer = new OutputStreamWriter(outStream, StandardCharsets.UTF_8);
                BufferedWriter out = new BufferedWriter(writer)) {
            if (writeHashes) {
                System.out.println("Writing URL hashes to " + args[1]);
            }

            long totalTime = 0;
            long minTime = Long.MAX_VALUE;
            long maxTime = 0;
            int step = 0; // number of URLs successfully hashed

            for (final String urlStr : urls) {
                try {
                    final DigestURL url = new DigestURL(urlStr);

                    // Start the clock after parsing : only hash() is measured
                    final long beginTime = System.nanoTime();
                    final byte[] hash = url.hash();
                    final long time = System.nanoTime() - beginTime;

                    minTime = Math.min(minTime, time);
                    maxTime = Math.max(maxTime, time);
                    totalTime += time;

                    out.write(ASCII.String(hash));
                    out.newLine();
                    step++;
                } catch (final MalformedURLException e) {
                    // Keep going : one bad line must not abort the whole measurement
                    System.err.println("Skipping malformed URL '" + urlStr + "' : " + e.getMessage());
                }
            }

            final long meanTime = step > 0 ? totalTime / step : 0;
            if (step == 0) {
                // Nothing was measured : do not report the Long.MAX_VALUE sentinel as a duration
                minTime = 0;
            }

            System.out.println("Hash generation total time (ms) : " + TimeUnit.NANOSECONDS.toMillis(totalTime) + " on "
                    + step + " urls.");
            System.out.println("Hash generation mean time (ms) : " + TimeUnit.NANOSECONDS.toMillis(meanTime));
            System.out.println("Hash generation min time (ms) : " + TimeUnit.NANOSECONDS.toMillis(minTime));
            System.out.println("Hash generation max time (ms) : " + TimeUnit.NANOSECONDS.toMillis(maxTime));
        } finally {
            // NOTE(review): presumably these release background resources so that the JVM can exit - confirm.
            // The nested finally guarantees the log shutdown even when Domains.close() fails.
            try {
                Domains.close();
            } finally {
                ConcurrentLog.shutdown();
            }
        }
    }
}
Loading…
Cancel
Save