diff --git a/htroot/CrawlResults.html b/htroot/CrawlResults.html index e4081076c..0fd870d24 100644 --- a/htroot/CrawlResults.html +++ b/htroot/CrawlResults.html @@ -79,20 +79,22 @@

Statistics about #[domains]# domains in this stack: - + + + + #{domains}# + + - - - - #{domains}# - - diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index e7be9c9ae..9cf912949 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -24,6 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Iterator; @@ -99,16 +100,33 @@ public class CrawlResults { // do the commands if (post.containsKey("clearlist")) sb.crawlResults.clearStack(tabletype); + if (post.containsKey("deleteentry")) { - final String hash = post.get("hash", null); - if (hash != null) { - // delete from database - sb.webIndex.removeURL(hash); + final String hash = post.get("hash", null); + if (hash != null) { + // delete from database + sb.webIndex.removeURL(hash); + } + } + + if (post.containsKey("deletedomain")) { + final String hashpart = post.get("hashpart", null); + final String domain = post.get("domain", null); + if (hashpart != null) { + // delete all urls for this domain from database + try { + sb.webIndex.deleteDomain(hashpart); + sb.crawlResults.deleteDomain(tabletype, domain, hashpart); + } catch (IOException e) { + e.printStackTrace(); } } + } + if (post.containsKey("moreIndexed")) { lines = Integer.parseInt(post.get("showIndexed", "500")); } + if (post.get("si") != null) if (post.get("si").equals("0")) showInit = false; else showInit = true; if (post.get("se") != null) @@ -126,7 +144,7 @@ public class CrawlResults { // create table if (tabletype == 0) { prop.put("table", "2"); - } else if (sb.crawlResults.getStackSize(tabletype) == 0) { + } else if (sb.crawlResults.getStackSize(tabletype) == 0 && sb.crawlResults.getDomainListSize(tabletype) == 0) { prop.put("table", "0"); } else { prop.put("table", "1"); @@ -256,7 +274,10 @@ public class CrawlResults { domain = j.next(); if (domain == null) break; prop.put("table_domains_" + cnt + "_dark", (dark) ? "1" : "0"); + prop.put("table_domains_" + cnt + "_feedbackpage", "CrawlResults.html"); + prop.put("table_domains_" + cnt + "_tabletype", tabletype); prop.put("table_domains_" + cnt + "_domain", domain); + prop.put("table_domains_" + cnt + "_hashpart", yacyURL.hosthash(domain)); prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain)); dark = !dark; cnt++; diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index dded77a71..5eb87f8ab 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -26,19 +26,44 @@ - #(urlhashsimilar)#::Sequential List of URL-Hashes:
- #{rows}# - #{cols}##[urlHash]# #{/cols}#
- #{/rows}# - #(/urlhashsimilar)# - - #(lurlexportfinished)#:: -
Finished export of #[urlcount]# URLs to file #[exportfile]#
:: - #(/lurlexportfinished)# + #(statistics)#:: + +
Statistics about top-domains in URL Database +
+
+
Show top domains from all URLs. + +
+
+
+ + #(/statistics)# - #(lurlexporterror)#:: -
Export to file #[exportfile]# failed: #[exportfailmsg]#
:: - #(/lurlexporterror)# + #(statisticslines)#:: +

Statistics about the top-#[domains]# domains in the database: +

+ DomainURLs
- + + +
DomainURLs
#[domain]# #[count]#
+ + + + + + #{domains}# + + + + + + #{/domains}# +
DomainURLs
+
+
+ + + +
+
+
#[domain]##[count]#

+ #(/statisticslines)# #(lurlexport)#::
@@ -69,6 +94,21 @@
Export to file #[exportfile]# is running .. #[urlcount]# URLs so far
:: #(/lurlexport)# + #(lurlexportfinished)#:: +
Finished export of #[urlcount]# URLs to file #[exportfile]#
:: + #(/lurlexportfinished)# + + #(lurlexporterror)#:: +
Export to file #[exportfile]# failed: #[exportfailmsg]#
:: + #(/lurlexporterror)# + + #(urlhashsimilar)#::

Sequential List of URL-Hashes:
+ #{rows}# + #{cols}##[urlHash]# #{/cols}#
+ #{/rows}# +

+ #(/urlhashsimilar)# + #(genUrlProfile)# ::No entry found for URL-hash #[urlhash]# :: @@ -98,6 +138,7 @@  delete the reference to this url at every other word where the reference exists (very extensive, but prevents unresolved references)
#(/genUrlProfile)# + #[result]# #%env/templates/footer.template%# diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index d1a7dcd1b..f87601066 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -54,41 +54,50 @@ public class IndexControlURLs_p { prop.put("result", ""); prop.put("ucount", Integer.toString(sb.webIndex.countURL())); prop.put("otherHosts", ""); + prop.put("genUrlProfile", 0); + prop.put("statistics", 1); + prop.put("statistics_lines", 100); + prop.put("statisticslines", 0); + // show export messages final indexRepositoryReference.Export export = sb.webIndex.exportURL(); if ((export != null) && (export.isAlive())) { // there is currently a running export - prop.put("lurlexportfinished", 0); + prop.put("lurlexport", 2); + prop.put("lurlexportfinished", 0); prop.put("lurlexporterror", 0); - prop.put("lurlexport", 2); - prop.put("lurlexport_exportfile", export.file().toString()); + prop.put("lurlexport_exportfile", export.file().toString()); prop.put("lurlexport_urlcount", export.count()); } else { - prop.put("lurlexport", 1); - prop.put("lurlexport_exportfile", sb.getRootPath() + "/DATA/EXPORT/" + serverDate.formatShortSecond()); - - prop.put("lurlexportfinished", 0); - prop.put("lurlexporterror", 0); - if (export == null) { - // the export is finished, or there has not been a export - prop.put("lurlexportfinished", 1); - prop.put("lurlexportfinished_exportfile", ""); - prop.put("lurlexportfinished_urlcount", 0); - } else { - // the export had errors - prop.put("lurlexporterror", 1); - prop.put("lurlexporterror_exportfile", export.file().toString()); - prop.put("lurlexporterror_exportfailmsg", export.failed()); - } - } + prop.put("lurlexport", 1); + prop.put("lurlexport_exportfile", sb.getRootPath() + "/DATA/EXPORT/" + serverDate.formatShortSecond()); + if (export == null) { + // there has never been an export + prop.put("lurlexportfinished", 0); + prop.put("lurlexporterror", 0); + } else { + // an export was running but has finished + prop.put("lurlexportfinished", 1); + prop.put("lurlexportfinished_exportfile", export.file().toString()); + prop.put("lurlexportfinished_urlcount", export.count()); + if (export.failed() == null) { + prop.put("lurlexporterror", 0); + } else { + prop.put("lurlexporterror", 1); + prop.put("lurlexporterror_exportfile", export.file().toString()); + prop.put("lurlexporterror_exportfailmsg", export.failed()); + } + } + } + if (post == null || env == null) { return prop; // nothing to do } - // default values + // post values that are set on numerous input fields with same name String urlstring = post.get("urlstring", "").trim(); String urlhash = post.get("urlhash", "").trim(); - + if (!urlstring.startsWith("http://") && !urlstring.startsWith("https://")) { urlstring = "http://" + urlstring; } @@ -141,6 +150,7 @@ public class IndexControlURLs_p { prop.put("urlhash", ""); } else { prop.putAll(genUrlProfile(sb, entry, urlhash)); + prop.put("statistics", 0); } } catch (final MalformedURLException e) { prop.putHTML("urlstring", "bad url: " + urlstring); @@ -156,6 +166,7 @@ public class IndexControlURLs_p { } else { prop.putHTML("urlstring", entry.comp().url().toNormalform(false, true)); prop.putAll(genUrlProfile(sb, entry, urlhash)); + prop.put("statistics", 0); } prop.put("lurlexport", 0); } @@ -181,6 +192,7 @@ public class IndexControlURLs_p { } i++; } + prop.put("statistics", 0); prop.put("urlhashsimilar_rows", rows); prop.put("result", result.toString()); } catch (final IOException e) { @@ -217,6 +229,45 @@ public class IndexControlURLs_p { } } + if (post.containsKey("deletedomain")) { + String hp = post.get("hashpart"); + try { + sb.webIndex.deleteDomain(hp); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + // trigger the loading of the table + post.put("statistics", ""); + } + + if (post.containsKey("statistics")) { + int count = post.getInt("lines", 100); + Iterator statsiter; + prop.put("statistics_lines", count); + int cnt = 0; + try { + statsiter = sb.webIndex.statistics(count); + boolean dark = true; + indexRepositoryReference.hostStat hs; + while (statsiter.hasNext() && cnt < count) { + hs = statsiter.next(); + prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0"); + prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname); + prop.put("statisticslines_domains_" + cnt + "lines", count); + prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash); + prop.put("statisticslines_domains_" + cnt + "_count", hs.count); + dark = !dark; + cnt++; + } + } catch (IOException e) { + e.printStackTrace(); + } + prop.put("statisticslines_domains", cnt); + prop.put("statisticslines", 1); + prop.put("lurlexport", 0); + } + // insert constants prop.putNum("ucount", sb.webIndex.countURL()); // return rewrite properties diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java index 02ab999ae..65f423157 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/FTPLoader.java @@ -193,7 +193,7 @@ public class FTPLoader { * establish a connection to the ftp server (open, login, set transfer mode) * * @param ftpClient - * @param host + * @param hostname * @param port * @return success */ diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index 5c22ec737..9f37f4910 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -108,11 +108,15 @@ public final class ResultURLs { public synchronized int getStackSize(final int stack) { final List resultStack = getStack(stack); - if(resultStack == null) { - return -1; - } + if (resultStack == null) return 0; return resultStack.size(); } + + public synchronized int getDomainListSize(final int stack) { + final kelondroMScoreCluster domains = getDomains(stack); + if (domains == null) return 0; + return domains.size(); + } public synchronized String getUrlHash(final int stack, final int pos) { return getHashNo(stack, pos, 0); @@ -191,9 +195,21 @@ public final class ResultURLs { * @return iterator of domains in reverse order (downwards) */ public Iterator domains(final int stack) { + assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; return getDomains(stack).scores(false); } + public int deleteDomain(final int stack, String host, String hosthash) { + assert hosthash.length() == 5; + int i = 0; + while (i < getStackSize(stack)) { + if (getUrlHash(stack, i).substring(6, 11).equals(hosthash)) getStack(stack).remove(i); else i++; + } + assert host != null : "host = null"; + assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; + return getDomains(stack).deleteScore(host); + } + /** * return the count of the domain * @param stack type @@ -201,6 +217,8 @@ public final class ResultURLs { * @return the number of occurrences of the domain in the stack statistics */ public int domainCount(final int stack, String domain) { + assert domain != null : "domain = null"; + assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; return getDomains(stack).getScore(domain); } @@ -247,7 +265,7 @@ public final class ResultURLs { public synchronized boolean removeStack(final int stack, final int pos) { final List resultStack = getStack(stack); - if(resultStack == null) { + if (resultStack == null) { return false; } return resultStack.remove(pos) != null; @@ -257,8 +275,11 @@ public final class ResultURLs { final List resultStack = getStack(stack); if (resultStack != null) resultStack.clear(); final kelondroMScoreCluster resultDomains = getDomains(stack); - if (resultDomains != null) resultDomains.clear(); - + if (resultDomains != null) { + // we do not clear this completely, just remove most of the less important entries + resultDomains.shrinkToMaxSize(100); + resultDomains.shrinkToMinScore(2); + } } public synchronized boolean remove(final String urlHash) { diff --git a/source/de/anomic/index/indexRepositoryReference.java b/source/de/anomic/index/indexRepositoryReference.java index fc379d01f..1c499d0f5 100644 --- a/source/de/anomic/index/indexRepositoryReference.java +++ b/source/de/anomic/index/indexRepositoryReference.java @@ -31,9 +31,12 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; +import java.util.ArrayList; import java.util.Date; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.Map; import de.anomic.data.htmlTools; import de.anomic.http.JakartaCommonsHttpClient; @@ -42,12 +45,11 @@ import de.anomic.http.httpRemoteProxyConfig; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCloneableIterator; -import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroIndex; +import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowSet; import de.anomic.kelondro.kelondroSplitTable; -import de.anomic.server.serverCodings; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyURL; @@ -55,8 +57,9 @@ public final class indexRepositoryReference { // class objects kelondroIndex urlIndexFile; - private Export exportthread = null; // will habe a export thread assigned if exporter is running - private File location = null; + private Export exportthread = null; // will have a export thread assigned if exporter is running + private File location = null; + ArrayList statsDump = null; public indexRepositoryReference(final File indexSecondaryPath) { super(); @@ -66,11 +69,13 @@ public final class indexRepositoryReference { public void clearCache() { if (urlIndexFile instanceof kelondroCache) ((kelondroCache) urlIndexFile).clearCache(); + statsDump = null; } public void clear() throws IOException { if (exportthread != null) exportthread.interrupt(); urlIndexFile.clear(); + statsDump = null; } public int size() { @@ -78,8 +83,9 @@ public final class indexRepositoryReference { } public void close() { + statsDump = null; if (urlIndexFile != null) { - urlIndexFile.close(); + urlIndexFile.close(); urlIndexFile = null; } } @@ -127,24 +133,14 @@ public final class indexRepositoryReference { } urlIndexFile.put(entry.toRowEntry(), new Date() /*entry.loaddate()*/); - } - - public synchronized indexURLReference newEntry(final String propStr) { - if (propStr == null || !propStr.startsWith("{") || !propStr.endsWith("}")) { - return null; - } - try { - return new indexURLReference(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); - } catch (final kelondroException e) { - // wrong format - return null; - } + statsDump = null; } public synchronized boolean remove(final String urlHash) { if (urlHash == null) return false; try { final kelondroRow.Entry r = urlIndexFile.remove(urlHash.getBytes()); + if (r != null) statsDump = null; return r != null; } catch (final IOException e) { return false; @@ -504,4 +500,110 @@ public final class indexRepositoryReference { } } + + public Iterator statistics(int count) throws IOException { + // prevent too heavy IO. + if (statsDump != null && count <= statsDump.size()) return statsDump.iterator(); + + HashMap map = new HashMap(); + // first collect all domains and calculate statistics about it + kelondroCloneableIterator i = this.urlIndexFile.keys(true, null); + String urlhash, hosthash; + hashStat ds; + if (i != null) while (i.hasNext()) { + urlhash = new String(i.next()); + hosthash = urlhash.substring(6,11); + ds = map.get(hosthash); + if (ds == null) { + ds = new hashStat(urlhash); + map.put(hosthash, ds); + } else { + ds.count++; + } + } + + // order elements by size + kelondroMScoreCluster s = new kelondroMScoreCluster(); + for (Map.Entry e: map.entrySet()) { + s.addScore(e.getValue().urlhash, e.getValue().count); + } + + // fetch urls from the database to determine the host in clear text + Iterator j = s.scores(false); // iterate urlhash-examples in reverse order (biggest first) + indexURLReference urlref; + count += 10; // make some more to prevent that we have to do this again after deletions too soon. + if (count < 0 || count > s.size()) count = s.size(); + statsDump = new ArrayList(); + while (j.hasNext() && count > 0) { + urlhash = j.next(); + if (urlhash == null) continue; + urlref = this.load(urlhash, null, 0); + if (urlref == null || urlref.comp() == null || urlref.comp().url() == null || urlref.comp().url().getHost() == null) continue; + if (statsDump == null) return new ArrayList().iterator(); // some other operation has destroyed the object + statsDump.add(new hostStat(urlref.comp().url().getHost(), urlhash.substring(6, 11), s.getScore(urlhash))); + count--; + } + // finally return an iterator for the result array + return (statsDump == null) ? new ArrayList().iterator() : statsDump.iterator(); + } + + public class hashStat { + public String urlhash; + public int count; + public hashStat(String urlhash) { + this.urlhash = urlhash; + this.count = 1; + } + } + + public class hostStat { + public String hostname, hosthash; + public int count; + public hostStat(String host, String urlhashfragment, int count) { + assert urlhashfragment.length() == 5; + this.hostname = host; + this.hosthash = urlhashfragment; + this.count = count; + } + } + + /** + * using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain + * here such a fragment can be used to delete all these domains at once + * @param hosthash + * @return number of deleted domains + * @throws IOException + */ + public int deleteDomain(String hosthash) throws IOException { + // first collect all url hashes that belong to the domain + assert hosthash.length() == 5; + ArrayList l = new ArrayList(); + kelondroCloneableIterator i = this.urlIndexFile.keys(true, null); + String hash; + while (i.hasNext()) { + hash = new String(i.next()); + if (hosthash.equals(hash.substring(6, 11))) l.add(hash); + } + + // then delete the urls using this list + int cnt = 0; + for (String h: l) { + if (urlIndexFile.remove(h.getBytes()) != null) cnt++; + } + + // finally remove the line with statistics + if (statsDump != null) { + Iterator hsi = statsDump.iterator(); + hostStat hs; + while (hsi.hasNext()) { + hs = hsi.next(); + if (hs.hosthash.equals(hosthash)) { + hsi.remove(); + break; + } + } + } + + return cnt; + } } diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java index dfca15cce..a31176b60 100644 --- a/source/de/anomic/kelondro/kelondroMScoreCluster.java +++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java @@ -50,6 +50,34 @@ public final class kelondroMScoreCluster { encnt = 0; } + /** + * shrink the cluster to a demanded size + * @param maxsize + */ + public void shrinkToMaxSize(int maxsize) { + if (maxsize < 0) return; + while (refkeyDB.size() > maxsize) { + // find and remove smallest objects until cluster has demanded size + refkeyDB.remove(keyrefDB.remove(keyrefDB.firstKey())); + } + } + + /** + * shrink the cluster in such a way that the smallest score is equal or greater than a given minScore + * @param minScore + */ + public void shrinkToMinScore(int minScore) { + int score; + Long key; + while (true) { + // find and remove objects where their score is smaller than the demanded minimum score + key = keyrefDB.firstKey(); + score = (int) ((key.longValue() & 0xFFFFFFFF00000000L) >> 32); + if (score >= minScore) break; + refkeyDB.remove(keyrefDB.remove(key)); + } + } + public static final String shortDateFormatString = "yyyyMMddHHmmss"; public static final SimpleDateFormat shortFormatter = new SimpleDateFormat(shortDateFormatString); public static final long minutemillis = 60000; @@ -261,13 +289,11 @@ public final class kelondroMScoreCluster { public synchronized E getMaxObject() { if (refkeyDB.size() == 0) return null; - //return getScores(1, false)[0]; return keyrefDB.get(keyrefDB.lastKey()); } public synchronized E getMinObject() { if (refkeyDB.size() == 0) return null; - //return getScores(1, true)[0]; return keyrefDB.get(keyrefDB.firstKey()); } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 2e70fc4f8..22df773bf 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -100,18 +100,18 @@ public final class plasmaWordIndex implements indexRI { private final kelondroByteOrder indexOrder = kelondroBase64Order.enhancedCoder; private final indexRAMRI dhtOutCache, dhtInCache; private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster - private final serverLog log; - indexRepositoryReference referenceURL; + private final serverLog log; + private indexRepositoryReference referenceURL; public final yacySeedDB seedDB; public yacyNewsPool newsPool; - private final File primaryRoot, secondaryRoot; + private final File primaryRoot, secondaryRoot; public IndexingStack queuePreStack; public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls; public CrawlProfile.entry defaultProxyProfile; public CrawlProfile.entry defaultRemoteProfile; public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; - private final File queuesRoot; + private final File queuesRoot; public yacyPeerActions peerActions; public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize) { @@ -362,6 +362,14 @@ public final class plasmaWordIndex implements indexRI { return this.referenceURL.entries(up, firstHash); } + public Iterator statistics(int count) throws IOException { + return this.referenceURL.statistics(count); + } + + public int deleteDomain(String urlfragment) throws IOException { + return this.referenceURL.deleteDomain(urlfragment); + } + public indexRepositoryReference.BlacklistCleaner getURLCleaner(final indexReferenceBlacklist blacklist) { return this.referenceURL.getBlacklistCleaner(blacklist); // thread is not already started after this is called! } diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java index 3bc3f26b9..2b5fee153 100644 --- a/source/de/anomic/yacy/yacyURL.java +++ b/source/de/anomic/yacy/yacyURL.java @@ -737,13 +737,13 @@ public class yacyURL implements Serializable { hash.append(kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(toNormalform(true, true))).substring(0, 5)); // 5 chars hash.append(subdomPortPath(subdom, port, rootpath)); // 1 char // form the 'global' part of the hash - hash.append(protocolHostPort(this.protocol, host, port)); // 5 chars + hash.append(hosthash(this.protocol, host, port)); // 5 chars hash.append(kelondroBase64Order.enhancedCoder.encodeByte(flagbyte)); // 1 char // return result hash return hash.toString(); } - + private static char subdomPortPath(final String subdom, final int port, final String rootpath) { return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0); } @@ -755,10 +755,25 @@ public class yacyURL implements Serializable { return (urlHash.charAt(5) == rootURLFlag0) || (urlHash.charAt(5) == rootURLFlag1); } - private static String protocolHostPort(final String protocol, final String host, final int port) { + /** + * compute a 5-byte hash fragment that can be used to identify the domain of the url + * @param protocol + * @param host + * @param port + * @return 5 bytes base64 encoded String representing the domain of the url + */ + public static final String hosthash(final String protocol, final String host, final int port) { return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5); } + public static final String hosthash(final String host) { + return hosthash("http", host, 80); + } + + public final String hosthash() { + return this.hash().substring(6, 11); + } + private static String[] testTLDs = new String[] { "com", "net", "org", "uk", "fr", "de", "es", "it" }; public static final yacyURL probablyWordURL(final String urlHash, final TreeSet words) { @@ -769,7 +784,7 @@ public class yacyURL implements Serializable { if ((word == null) || (word.length() == 0)) continue; final String pattern = urlHash.substring(6, 11); for (int i = 0; i < testTLDs.length; i++) { - if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80))) + if (pattern.equals(hosthash("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80))) try { return new yacyURL("http://www." + word.toLowerCase() + "." + testTLDs[i], null); } catch (final MalformedURLException e) {