- added domain statistic generation to IndexControlURLs_p.html servlet

- added a 'delete all' button to every result of such a domain statistic output, which causes all URLs of that domain to be deleted
- extended the stack cleaner to also clean the statistics: they are not completely destroyed, only the domains with the smallest counts are removed


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5117 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 44bc8311af
commit 77ee0765a4

@ -79,20 +79,22 @@
<p><em>Statistics about #[domains]# domains in this stack:</em>
<table cellpadding="2" cellspacing="1" >
<tr class="TableHeader">
<td align="center">
<td align="center"></td>
<td><strong>Domain</strong></td>
<td><strong>URLs</strong></td>
</tr>
#{domains}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td align="center">
<form action="#[feedbackpage]#" method="post" enctype="multipart/form-data">
<div>
<input type="hidden" name="process" value="#[tabletype]#" />
<input type="submit" name="clearlist" value="clear list" />
<input type="hidden" name="hashpart" value="#[hashpart]#" />
<input type="hidden" name="domain" value="#[domain]#" />
<input type="submit" name="deletedomain" value="delete all" />
</div>
</form>
</td>
<td><strong>Domain</strong></td>
<td><strong>URLs</strong></td>
</tr>
#{domains}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td></td>
<td><a href="http://#[domain]#/" target="_">#[domain]#</a></td>
<td>#[count]#</td>
</tr>

@ -24,6 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
@ -99,16 +100,33 @@ public class CrawlResults {
// do the commands
if (post.containsKey("clearlist")) sb.crawlResults.clearStack(tabletype);
if (post.containsKey("deleteentry")) {
final String hash = post.get("hash", null);
if (hash != null) {
// delete from database
sb.webIndex.removeURL(hash);
final String hash = post.get("hash", null);
if (hash != null) {
// delete from database
sb.webIndex.removeURL(hash);
}
}
if (post.containsKey("deletedomain")) {
    // 'delete all' button of the domain statistic: remove every url of the
    // domain from the url database and from the result-stack statistic
    final String hashpart = post.get("hashpart", null); // 5-byte host hash fragment
    final String domain = post.get("domain", null);     // host name in clear text
    if (hashpart != null) {
        // delete all urls for this domain from database
        try {
            sb.webIndex.deleteDomain(hashpart);
            sb.crawlResults.deleteDomain(tabletype, domain, hashpart);
        } catch (IOException e) {
            // deletion iterates the on-disk url index; report but continue serving
            e.printStackTrace();
        }
    }
}
if (post.containsKey("moreIndexed")) {
lines = Integer.parseInt(post.get("showIndexed", "500"));
}
if (post.get("si") != null)
if (post.get("si").equals("0")) showInit = false; else showInit = true;
if (post.get("se") != null)
@ -126,7 +144,7 @@ public class CrawlResults {
// create table
if (tabletype == 0) {
prop.put("table", "2");
} else if (sb.crawlResults.getStackSize(tabletype) == 0) {
} else if (sb.crawlResults.getStackSize(tabletype) == 0 && sb.crawlResults.getDomainListSize(tabletype) == 0) {
prop.put("table", "0");
} else {
prop.put("table", "1");
@ -256,7 +274,10 @@ public class CrawlResults {
domain = j.next();
if (domain == null) break;
prop.put("table_domains_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("table_domains_" + cnt + "_feedbackpage", "CrawlResults.html");
prop.put("table_domains_" + cnt + "_tabletype", tabletype);
prop.put("table_domains_" + cnt + "_domain", domain);
prop.put("table_domains_" + cnt + "_hashpart", yacyURL.hosthash(domain));
prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain));
dark = !dark;
cnt++;

@ -26,19 +26,44 @@
</fieldset>
</form>
#(urlhashsimilar)#::Sequential List of URL-Hashes:<br />
#{rows}#
#{cols}#<a href="/IndexControlURLs_p.html?urlhash=#[urlHash]#&amp;urlhashsearch=1" class="tt">#[urlHash]#</a> #{/cols}#<br />
#{/rows}#
#(/urlhashsimilar)#
#(lurlexportfinished)#::
<div class="commit">Finished export of #[urlcount]# URLs to file #[exportfile]#</div>::
#(/lurlexportfinished)#
#(statistics)#::
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Statistics about top-domains in URL Database</legend>
<dl>
<dt class="TableCellLight"></dt>
<dd>Show top <input type="text" name="lines" value="#[lines]#" size="6" maxlength="6" /> domains from all URLs.
<input type="submit" name="statistics" value="Generate Statistics" />
</dd>
</dl>
</fieldset>
</form>
#(/statistics)#
#(lurlexporterror)#::
<div class="error">Export to file #[exportfile]# failed: #[exportfailmsg]#</div>::
#(/lurlexporterror)#
#(statisticslines)#::
<p><em>Statistics about the top-#[domains]# domains in the database:</em>
<table cellpadding="2" cellspacing="1" >
<tr class="TableHeader">
<td align="center"></td>
<td><strong>Domain</strong></td>
<td><strong>URLs</strong></td>
</tr>
#{domains}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data">
<div>
<input type="hidden" name="hashpart" value="#[hashpart]#" />
<input type="hidden" name="lines" value="#[lines]#" />
<input type="submit" name="deletedomain" value="delete all" />
</div>
</form>
</td>
<td><a href="http://#[domain]#/" target="_">#[domain]#</a></td>
<td>#[count]#</td>
</tr>
#{/domains}#
</table></p>
#(/statisticslines)#
#(lurlexport)#::
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data">
@ -69,6 +94,21 @@
<div class="commit" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# URLs so far</div>::
#(/lurlexport)#
#(lurlexportfinished)#::
<div class="commit">Finished export of #[urlcount]# URLs to file #[exportfile]#</div>::
#(/lurlexportfinished)#
#(lurlexporterror)#::
<div class="error">Export to file #[exportfile]# failed: #[exportfailmsg]#</div>::
#(/lurlexporterror)#
#(urlhashsimilar)#::<p>Sequential List of URL-Hashes:<br />
#{rows}#
#{cols}#<a href="/IndexControlURLs_p.html?urlhash=#[urlHash]#&amp;urlhashsearch=1" class="tt">#[urlHash]#</a> #{/cols}#<br />
#{/rows}#
</p>
#(/urlhashsimilar)#
#(genUrlProfile)#
::No entry found for URL-hash #[urlhash]#
::<table>
@ -98,6 +138,7 @@
<span class="small">&nbsp;delete the reference to this url at every other word where the reference exists (very extensive, but prevents unresolved references)</span><br />
</form>
#(/genUrlProfile)#
#[result]#
#%env/templates/footer.template%#

@ -54,41 +54,50 @@ public class IndexControlURLs_p {
prop.put("result", "");
prop.put("ucount", Integer.toString(sb.webIndex.countURL()));
prop.put("otherHosts", "");
prop.put("genUrlProfile", 0);
prop.put("statistics", 1);
prop.put("statistics_lines", 100);
prop.put("statisticslines", 0);
// show export messages
final indexRepositoryReference.Export export = sb.webIndex.exportURL();
if ((export != null) && (export.isAlive())) {
// there is currently a running export
prop.put("lurlexportfinished", 0);
prop.put("lurlexport", 2);
prop.put("lurlexportfinished", 0);
prop.put("lurlexporterror", 0);
prop.put("lurlexport", 2);
prop.put("lurlexport_exportfile", export.file().toString());
prop.put("lurlexport_exportfile", export.file().toString());
prop.put("lurlexport_urlcount", export.count());
} else {
prop.put("lurlexport", 1);
prop.put("lurlexport_exportfile", sb.getRootPath() + "/DATA/EXPORT/" + serverDate.formatShortSecond());
prop.put("lurlexportfinished", 0);
prop.put("lurlexporterror", 0);
if (export == null) {
// the export is finished, or there has not been a export
prop.put("lurlexportfinished", 1);
prop.put("lurlexportfinished_exportfile", "");
prop.put("lurlexportfinished_urlcount", 0);
} else {
// the export had errors
prop.put("lurlexporterror", 1);
prop.put("lurlexporterror_exportfile", export.file().toString());
prop.put("lurlexporterror_exportfailmsg", export.failed());
}
}
prop.put("lurlexport", 1);
prop.put("lurlexport_exportfile", sb.getRootPath() + "/DATA/EXPORT/" + serverDate.formatShortSecond());
if (export == null) {
// there has never been an export
prop.put("lurlexportfinished", 0);
prop.put("lurlexporterror", 0);
} else {
// an export was running but has finished
prop.put("lurlexportfinished", 1);
prop.put("lurlexportfinished_exportfile", export.file().toString());
prop.put("lurlexportfinished_urlcount", export.count());
if (export.failed() == null) {
prop.put("lurlexporterror", 0);
} else {
prop.put("lurlexporterror", 1);
prop.put("lurlexporterror_exportfile", export.file().toString());
prop.put("lurlexporterror_exportfailmsg", export.failed());
}
}
}
if (post == null || env == null) {
return prop; // nothing to do
}
// default values
// post values that are set on numerous input fields with same name
String urlstring = post.get("urlstring", "").trim();
String urlhash = post.get("urlhash", "").trim();
if (!urlstring.startsWith("http://") &&
!urlstring.startsWith("https://")) { urlstring = "http://" + urlstring; }
@ -141,6 +150,7 @@ public class IndexControlURLs_p {
prop.put("urlhash", "");
} else {
prop.putAll(genUrlProfile(sb, entry, urlhash));
prop.put("statistics", 0);
}
} catch (final MalformedURLException e) {
prop.putHTML("urlstring", "bad url: " + urlstring);
@ -156,6 +166,7 @@ public class IndexControlURLs_p {
} else {
prop.putHTML("urlstring", entry.comp().url().toNormalform(false, true));
prop.putAll(genUrlProfile(sb, entry, urlhash));
prop.put("statistics", 0);
}
prop.put("lurlexport", 0);
}
@ -181,6 +192,7 @@ public class IndexControlURLs_p {
}
i++;
}
prop.put("statistics", 0);
prop.put("urlhashsimilar_rows", rows);
prop.put("result", result.toString());
} catch (final IOException e) {
@ -217,6 +229,45 @@ public class IndexControlURLs_p {
}
}
if (post.containsKey("deletedomain")) {
    // delete all urls that belong to the domain identified by the
    // submitted 5-byte host hash fragment
    String hp = post.get("hashpart");
    try {
        sb.webIndex.deleteDomain(hp);
    } catch (IOException e) {
        // deletion iterates the on-disk url index; report but continue serving
        e.printStackTrace();
    }
    // trigger the loading of the table
    post.put("statistics", "");
}
if (post.containsKey("statistics")) {
    // generate the top-domain statistic table for the template
    int count = post.getInt("lines", 100); // number of top domains to show
    Iterator<indexRepositoryReference.hostStat> statsiter;
    prop.put("statistics_lines", count);
    int cnt = 0;
    try {
        statsiter = sb.webIndex.statistics(count);
        boolean dark = true; // alternating row shading
        indexRepositoryReference.hostStat hs;
        while (statsiter.hasNext() && cnt < count) {
            hs = statsiter.next();
            prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0");
            prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname);
            // fix: key was missing the '_' separator ("..." + cnt + "lines"), so the
            // hidden 'lines' field of the per-domain delete form was never filled
            prop.put("statisticslines_domains_" + cnt + "_lines", count);
            prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash);
            prop.put("statisticslines_domains_" + cnt + "_count", hs.count);
            dark = !dark;
            cnt++;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    prop.put("statisticslines_domains", cnt);
    prop.put("statisticslines", 1);
    prop.put("lurlexport", 0);
}
// insert constants
prop.putNum("ucount", sb.webIndex.countURL());
// return rewrite properties

@ -193,7 +193,7 @@ public class FTPLoader {
* establish a connection to the ftp server (open, login, set transfer mode)
*
* @param ftpClient
* @param host
* @param hostname
* @param port
* @return success
*/

@ -108,11 +108,15 @@ public final class ResultURLs {
/**
 * number of entries on the given result stack
 * @param stack the stack type
 * @return the stack size; 0 if no stack exists for the given type
 *         (consistent with getDomainListSize, and with callers that
 *         compare the result against 0)
 */
public synchronized int getStackSize(final int stack) {
    // fix: the block contained two contradictory null checks (a merge
    // artifact, old 'return -1' vs new 'return 0'); keep the 'return 0'
    // semantics that callers such as the table selection rely on
    final List<String> resultStack = getStack(stack);
    if (resultStack == null) return 0;
    return resultStack.size();
}
/**
 * number of distinct domains in the statistic of the given stack
 * @param stack the stack type
 * @return the domain count; 0 if no domain statistic exists for this stack
 */
public synchronized int getDomainListSize(final int stack) {
    final kelondroMScoreCluster<String> domains = getDomains(stack);
    if (domains == null) return 0;
    return domains.size();
}
public synchronized String getUrlHash(final int stack, final int pos) {
return getHashNo(stack, pos, 0);
@ -191,9 +195,21 @@ public final class ResultURLs {
* @return iterator of domains in reverse order (downwards)
*/
public Iterator<String> domains(final int stack) {
    // iterate the domain names of the stack statistic, highest count first
    // (scores(false) delivers in reverse/descending score order)
    assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
    return getDomains(stack).scores(false);
}
/**
 * remove all urls of a host from a result stack and drop the host from
 * the domain statistic
 * @param stack the stack type
 * @param host the host name in clear text (key of the statistic)
 * @param hosthash the 5-byte host hash fragment (chars 6..10 of a url hash)
 * @return the score the host had in the statistic before deletion
 */
public int deleteDomain(final int stack, String host, String hosthash) {
    assert hosthash.length() == 5;
    int i = 0;
    // index only advances when nothing was removed, because remove(i)
    // shifts the following entries down by one
    while (i < getStackSize(stack)) {
        if (getUrlHash(stack, i).substring(6, 11).equals(hosthash)) getStack(stack).remove(i); else i++;
    }
    assert host != null : "host = null";
    assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
    return getDomains(stack).deleteScore(host);
}
/**
* return the count of the domain
* @param stack type
@ -201,6 +217,8 @@ public final class ResultURLs {
* @return the number of occurrences of the domain in the stack statistics
*/
public int domainCount(final int stack, String domain) {
    // number of urls counted for the given domain in the stack statistic
    assert domain != null : "domain = null";
    assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
    return getDomains(stack).getScore(domain);
}
@ -247,7 +265,7 @@ public final class ResultURLs {
public synchronized boolean removeStack(final int stack, final int pos) {
final List<String> resultStack = getStack(stack);
if(resultStack == null) {
if (resultStack == null) {
return false;
}
return resultStack.remove(pos) != null;
@ -257,8 +275,11 @@ public final class ResultURLs {
final List<String> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear();
final kelondroMScoreCluster<String> resultDomains = getDomains(stack);
if (resultDomains != null) resultDomains.clear();
if (resultDomains != null) {
// we do not clear this completely, just remove most of the less important entries
resultDomains.shrinkToMaxSize(100);
resultDomains.shrinkToMinScore(2);
}
}
public synchronized boolean remove(final String urlHash) {

@ -31,9 +31,12 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import de.anomic.data.htmlTools;
import de.anomic.http.JakartaCommonsHttpClient;
@ -42,12 +45,11 @@ import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.kelondro.kelondroSplitTable;
import de.anomic.server.serverCodings;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
@ -55,8 +57,9 @@ public final class indexRepositoryReference {
// class objects
kelondroIndex urlIndexFile;
private Export exportthread = null; // will habe a export thread assigned if exporter is running
private File location = null;
private Export exportthread = null; // will have a export thread assigned if exporter is running
private File location = null;
ArrayList<hostStat> statsDump = null;
public indexRepositoryReference(final File indexSecondaryPath) {
super();
@ -66,11 +69,13 @@ public final class indexRepositoryReference {
public void clearCache() {
if (urlIndexFile instanceof kelondroCache) ((kelondroCache) urlIndexFile).clearCache();
statsDump = null;
}
public void clear() throws IOException {
if (exportthread != null) exportthread.interrupt();
urlIndexFile.clear();
statsDump = null;
}
public int size() {
@ -78,8 +83,9 @@ public final class indexRepositoryReference {
}
public void close() {
statsDump = null;
if (urlIndexFile != null) {
urlIndexFile.close();
urlIndexFile.close();
urlIndexFile = null;
}
}
@ -127,24 +133,14 @@ public final class indexRepositoryReference {
}
urlIndexFile.put(entry.toRowEntry(), new Date() /*entry.loaddate()*/);
}
public synchronized indexURLReference newEntry(final String propStr) {
if (propStr == null || !propStr.startsWith("{") || !propStr.endsWith("}")) {
return null;
}
try {
return new indexURLReference(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
} catch (final kelondroException e) {
// wrong format
return null;
}
statsDump = null;
}
public synchronized boolean remove(final String urlHash) {
if (urlHash == null) return false;
try {
final kelondroRow.Entry r = urlIndexFile.remove(urlHash.getBytes());
if (r != null) statsDump = null;
return r != null;
} catch (final IOException e) {
return false;
@ -504,4 +500,110 @@ public final class indexRepositoryReference {
}
}
/**
 * compute the top domains of the url database, ordered by the number of
 * stored urls per domain (biggest first); the result is cached in statsDump
 * @param count number of top domains wanted
 * @return iterator over hostStat entries, biggest domain first
 * @throws IOException if the url index cannot be iterated
 */
public Iterator<hostStat> statistics(int count) throws IOException {
    // prevent too heavy IO.
    if (statsDump != null && count <= statsDump.size()) return statsDump.iterator();

    HashMap<String, hashStat> map = new HashMap<String, hashStat>();
    // first collect all domains and calculate statistics about it
    kelondroCloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
    String urlhash, hosthash;
    hashStat ds;
    if (i != null) while (i.hasNext()) {
        urlhash = new String(i.next());
        // chars 6..10 of a url hash are the host hash fragment (see yacyURL.hosthash())
        hosthash = urlhash.substring(6,11);
        ds = map.get(hosthash);
        if (ds == null) {
            // remember one example url hash per host; count starts at 1
            ds = new hashStat(urlhash);
            map.put(hosthash, ds);
        } else {
            ds.count++;
        }
    }
    // order elements by size
    kelondroMScoreCluster<String> s = new kelondroMScoreCluster<String>();
    for (Map.Entry<String, hashStat> e: map.entrySet()) {
        s.addScore(e.getValue().urlhash, e.getValue().count);
    }
    // fetch urls from the database to determine the host in clear text
    Iterator<String> j = s.scores(false); // iterate urlhash-examples in reverse order (biggest first)
    indexURLReference urlref;
    count += 10; // make some more to prevent that we have to do this again after deletions too soon.
    if (count < 0 || count > s.size()) count = s.size();
    statsDump = new ArrayList<hostStat>();
    while (j.hasNext() && count > 0) {
        urlhash = j.next();
        if (urlhash == null) continue;
        urlref = this.load(urlhash, null, 0);
        if (urlref == null || urlref.comp() == null || urlref.comp().url() == null || urlref.comp().url().getHost() == null) continue;
        // NOTE(review): statsDump can be nulled concurrently by clearCache()/clear();
        // this re-check avoids an NPE, but the method is not fully thread-safe
        if (statsDump == null) return new ArrayList<hostStat>().iterator(); // some other operation has destroyed the object
        statsDump.add(new hostStat(urlref.comp().url().getHost(), urlhash.substring(6, 11), s.getScore(urlhash)));
        count--;
    }
    // finally return an iterator for the result array
    return (statsDump == null) ? new ArrayList<hostStat>().iterator() : statsDump.iterator();
}
// helper for statistics(): per-host accumulator that keeps one example url
// hash (used later to resolve the host name) and the number of urls seen
public class hashStat {
    public String urlhash; // example url hash belonging to this host
    public int count;      // number of urls counted for this host

    public hashStat(String urlhash) {
        this.urlhash = urlhash;
        this.count = 1; // the constructing occurrence is the first hit
    }
}
// result element of statistics(): a host name in clear text, its 5-byte
// host hash fragment and the number of urls stored for that host
public class hostStat {
    public String hostname, hosthash;
    public int count;

    public hostStat(String host, String urlhashfragment, int count) {
        assert urlhashfragment.length() == 5;
        this.hostname = host;
        this.hosthash = urlhashfragment;
        this.count = count;
    }
}
/**
* using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain
* here such a fragment can be used to delete all these domains at once
* @param hosthash
* @return number of deleted domains
* @throws IOException
*/
public int deleteDomain(String hosthash) throws IOException {
    // first collect all url hashes that belong to the domain
    assert hosthash.length() == 5;
    ArrayList<String> l = new ArrayList<String>();
    kelondroCloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
    String hash;
    while (i.hasNext()) {
        hash = new String(i.next());
        // chars 6..10 of a url hash carry the host hash fragment
        if (hosthash.equals(hash.substring(6, 11))) l.add(hash);
    }
    // then delete the urls using this list
    int cnt = 0;
    for (String h: l) {
        if (urlIndexFile.remove(h.getBytes()) != null) cnt++;
    }
    // finally remove the line with statistics
    if (statsDump != null) {
        Iterator<hostStat> hsi = statsDump.iterator();
        hostStat hs;
        while (hsi.hasNext()) {
            hs = hsi.next();
            if (hs.hosthash.equals(hosthash)) {
                // drop the cached statistic entry so the next statistics()
                // call does not show the deleted domain
                hsi.remove();
                break;
            }
        }
    }
    return cnt;
}
}

@ -50,6 +50,34 @@ public final class kelondroMScoreCluster<E> {
encnt = 0;
}
/**
* shrink the cluster to a demanded size
* @param maxsize
*/
public void shrinkToMaxSize(int maxsize) {
    // removes the lowest-scoring entries until at most maxsize entries remain;
    // a negative maxsize is treated as a no-op
    if (maxsize < 0) return;
    while (refkeyDB.size() > maxsize) {
        // find and remove smallest objects until cluster has demanded size
        // (keyrefDB is sorted by score, so firstKey() is the smallest)
        refkeyDB.remove(keyrefDB.remove(keyrefDB.firstKey()));
    }
}
/**
* shrink the cluster in such a way that the smallest score is equal or greater than a given minScore
* @param minScore
*/
public void shrinkToMinScore(int minScore) {
    int score;
    Long key;
    // fix: loop must stop when the map is empty — keyrefDB.firstKey() throws
    // NoSuchElementException on an empty cluster, or after this loop has
    // removed every entry because all scores were below minScore
    while (keyrefDB.size() > 0) {
        // find and remove objects where their score is smaller than the demanded minimum score
        key = keyrefDB.firstKey();
        // the score is packed into the upper 32 bits of the sort key
        score = (int) ((key.longValue() & 0xFFFFFFFF00000000L) >> 32);
        if (score >= minScore) break;
        refkeyDB.remove(keyrefDB.remove(key));
    }
}
public static final String shortDateFormatString = "yyyyMMddHHmmss";
public static final SimpleDateFormat shortFormatter = new SimpleDateFormat(shortDateFormatString);
public static final long minutemillis = 60000;
@ -261,13 +289,11 @@ public final class kelondroMScoreCluster<E> {
public synchronized E getMaxObject() {
    // element with the highest score, or null if the cluster is empty
    if (refkeyDB.size() == 0) return null;
    //return getScores(1, false)[0];
    return keyrefDB.get(keyrefDB.lastKey());
}
public synchronized E getMinObject() {
    // element with the lowest score, or null if the cluster is empty
    if (refkeyDB.size() == 0) return null;
    //return getScores(1, true)[0];
    return keyrefDB.get(keyrefDB.firstKey());
}

@ -100,18 +100,18 @@ public final class plasmaWordIndex implements indexRI {
private final kelondroByteOrder indexOrder = kelondroBase64Order.enhancedCoder;
private final indexRAMRI dhtOutCache, dhtInCache;
private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
private final serverLog log;
indexRepositoryReference referenceURL;
private final serverLog log;
private indexRepositoryReference referenceURL;
public final yacySeedDB seedDB;
public yacyNewsPool newsPool;
private final File primaryRoot, secondaryRoot;
private final File primaryRoot, secondaryRoot;
public IndexingStack queuePreStack;
public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
public CrawlProfile.entry defaultProxyProfile;
public CrawlProfile.entry defaultRemoteProfile;
public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
private final File queuesRoot;
private final File queuesRoot;
public yacyPeerActions peerActions;
public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize) {
@ -362,6 +362,14 @@ public final class plasmaWordIndex implements indexRI {
return this.referenceURL.entries(up, firstHash);
}
public Iterator<indexRepositoryReference.hostStat> statistics(int count) throws IOException {
    // delegate to the url repository: top-<count> domains by stored url count
    return this.referenceURL.statistics(count);
}
public int deleteDomain(String urlfragment) throws IOException {
    // delegate to the url repository: delete all urls whose host hash matches
    // the given 5-byte fragment; returns the number of deleted urls
    return this.referenceURL.deleteDomain(urlfragment);
}
public indexRepositoryReference.BlacklistCleaner getURLCleaner(final indexReferenceBlacklist blacklist) {
return this.referenceURL.getBlacklistCleaner(blacklist); // thread is not already started after this is called!
}

@ -737,13 +737,13 @@ public class yacyURL implements Serializable {
hash.append(kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(toNormalform(true, true))).substring(0, 5)); // 5 chars
hash.append(subdomPortPath(subdom, port, rootpath)); // 1 char
// form the 'global' part of the hash
hash.append(protocolHostPort(this.protocol, host, port)); // 5 chars
hash.append(hosthash(this.protocol, host, port)); // 5 chars
hash.append(kelondroBase64Order.enhancedCoder.encodeByte(flagbyte)); // 1 char
// return result hash
return hash.toString();
}
private static char subdomPortPath(final String subdom, final int port, final String rootpath) {
    // one base64 character derived from subdomain, port and root path; forms
    // the 'local' discriminator part of a url hash
    return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0);
}
@ -755,10 +755,25 @@ public class yacyURL implements Serializable {
return (urlHash.charAt(5) == rootURLFlag0) || (urlHash.charAt(5) == rootURLFlag1);
}
private static String protocolHostPort(final String protocol, final String host, final int port) {
/**
 * compute a 5-byte hash fragment that can be used to identify the domain of the url
 * @param protocol the url protocol, e.g. "http"
 * @param host the host name
 * @param port the port number
 * @return 5 bytes base64 encoded String representing the domain of the url
 */
public static final String hosthash(final String protocol, final String host, final int port) {
    return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5);
}

// convenience variant for the default web case: protocol "http", port 80
public static final String hosthash(final String host) {
    return hosthash("http", host, 80);
}

// the host hash fragment of this url: chars 6..10 of the full url hash
public final String hosthash() {
    return this.hash().substring(6, 11);
}
private static String[] testTLDs = new String[] { "com", "net", "org", "uk", "fr", "de", "es", "it" };
public static final yacyURL probablyWordURL(final String urlHash, final TreeSet<String> words) {
@ -769,7 +784,7 @@ public class yacyURL implements Serializable {
if ((word == null) || (word.length() == 0)) continue;
final String pattern = urlHash.substring(6, 11);
for (int i = 0; i < testTLDs.length; i++) {
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80)))
if (pattern.equals(hosthash("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80)))
try {
return new yacyURL("http://www." + word.toLowerCase() + "." + testTLDs[i], null);
} catch (final MalformedURLException e) {

Loading…
Cancel
Save