redesign of reference hash (URL-hash) parameter hand-over:

pass the value as byte[], not as String. This should reduce the number of
byte[] <-> String conversions performed during time-critical tasks.
This redesign is not yet complete; more to come.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6775 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 72d8e9897b
commit 1e8e79b9ef

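For illustration only, here is a minimal JDK-only sketch of the hand-over change described above. These are not YaCy's classes; the stand-in store merely plays the role of the urlIndexFile / noticeURL structures touched in the diff below.

import java.util.Arrays;

// Sketch: hand the URL hash over as byte[] instead of String.
public class HashHandover {

    // the low-level store is keyed by raw bytes, like the urlIndexFile in the diff
    static byte[] storedKey = "AAAAAAAAAAAA".getBytes();

    // before this commit: every layer took a String and re-encoded it
    static boolean removeOld(String urlHash) {
        return Arrays.equals(storedKey, urlHash.getBytes()); // conversion on every call
    }

    // after this commit: the byte[] produced once (e.g. url.hash().getBytes())
    // travels unchanged through urlRemove -> removeByURLHash -> urlIndex.remove
    static boolean removeNew(byte[] urlHashBytes) {
        return Arrays.equals(storedKey, urlHashBytes); // no further conversion needed
    }

    public static void main(String[] args) {
        byte[] hash = "AAAAAAAAAAAA".getBytes(); // converted exactly once, at the boundary
        System.out.println(removeOld(new String(hash)) + " " + removeNew(hash));
    }
}
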
@ -113,7 +113,7 @@ public class CrawlResults {
final String hash = post.get("hash", null);
if (hash != null) {
// delete from database
sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(hash);
sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(hash.getBytes());
}
}

@ -228,7 +228,7 @@ public class Crawler_p {
// stack request
// first delete old entry, if exists
final DigestURI url = new DigestURI(crawlingStart, null);
final String urlhash = url.hash();
final byte[] urlhash = url.hash().getBytes();
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);

@ -170,7 +170,7 @@ public class IndexControlRWIs_p {
// now delete all urls if demanded
if (delurl || delurlref) {
for (i = 0; i < urlx.length; i++) {
sb.urlRemove(segment, urlx[i]);
sb.urlRemove(segment, urlx[i].getBytes());
}
}
post.remove("keyhashdeleteall");
@ -186,7 +186,7 @@ public class IndexControlRWIs_p {
}
if (delurl || delurlref) {
for (i = 0; i < urlx.length; i++) {
sb.urlRemove(segment, urlx[i]);
sb.urlRemove(segment, urlx[i].getBytes());
}
}
final Set<String> urlHashes = new HashSet<String>();
@ -313,7 +313,7 @@ public class IndexControlRWIs_p {
for (i = 0; i < urlx.length; i++) {
urlHashes.add(urlx[i]);
final URIMetadataRow e = segment.urlMetadata().load(urlx[i], null, 0);
segment.urlMetadata().remove(urlx[i]);
segment.urlMetadata().remove(urlx[i].getBytes());
if (e != null) {
url = e.metadata().url();
pw.println(url.getHost() + "/" + url.getFile());
@ -342,7 +342,7 @@ public class IndexControlRWIs_p {
for (i = 0; i<urlx.length; i++) {
urlHashes.add(urlx[i]);
final URIMetadataRow e = segment.urlMetadata().load(urlx[i], null, 0);
segment.urlMetadata().remove(urlx[i]);
segment.urlMetadata().remove(urlx[i].getBytes());
if (e != null) {
url = e.metadata().url();
pw.println(url.getHost() + "/.*");
@ -409,9 +409,9 @@ public class IndexControlRWIs_p {
prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "<br>" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "<br>" + us.substring(20)) : us));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", RankingProcess.ybr(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", RankingProcess.ybr(new String(entry.hash())));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", 1000.0 * entry.word().termFrequency());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(new String(entry.hash())));
prop.put("genUrlList_urlList_"+i+"_urlExists_date", DateFormatter.formatShortDay(new Date(entry.word().lastModified())));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext());

@ -153,7 +153,7 @@ public class IndexControlURLs_p {
} else {
urlstring = entry.metadata().url().toNormalform(false, true);
prop.put("urlstring", "");
sb.urlRemove(segment, urlhash);
sb.urlRemove(segment, urlhash.getBytes());
prop.putHTML("result", "Removed URL " + urlstring);
}
prop.put("lurlexport", 0);
@ -169,7 +169,7 @@ public class IndexControlURLs_p {
if ((urlhash == null) || (urlstring == null)) {
prop.put("result", "No input given; nothing deleted.");
} else {
sb.urlRemove(segment, urlhash);
sb.urlRemove(segment, urlhash.getBytes());
prop.putHTML("result", "Removed URL " + urlstring);
}
prop.put("lurlexport", 0);
@ -223,7 +223,7 @@ public class IndexControlURLs_p {
while (entryIt.hasNext() && i < 256) {
entry = entryIt.next();
if (entry == null) break;
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash());
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", new String(entry.hash()));
cols++;
if (cols==8) {
prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
@ -340,7 +340,7 @@ public class IndexControlURLs_p {
prop.put("genUrlProfile_loaddate", entry.loaddate().toString());
prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1);
prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "<unknown>" : le.metadata().url().toNormalform(false, true));
prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : le.hash());
prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : new String(le.hash()));
prop.put("genUrlProfile_doctype", String.valueOf(entry.doctype()));
prop.put("genUrlProfile_language", entry.language());
prop.put("genUrlProfile_size", entry.size());

@ -77,7 +77,7 @@ public class IndexCreateWWWGlobalQueue_p {
prop.putNum("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
final String urlHash = post.get("deleteEntry");
sb.crawlQueues.noticeURL.removeByURLHash(urlHash);
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
prop.put("LOCATION","");
return prop;
}

@ -134,7 +134,7 @@ public class IndexCreateWWWLocalQueue_p {
if (value != null) {
final Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
sb.crawlQueues.noticeURL.removeByURLHash(entry.url().hash());
sb.crawlQueues.noticeURL.removeByURLHash(entry.url().hash().getBytes());
}
}
}
@ -148,7 +148,7 @@ public class IndexCreateWWWLocalQueue_p {
prop.putNum("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
final String urlHash = post.get("deleteEntry");
sb.crawlQueues.noticeURL.removeByURLHash(urlHash);
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
prop.put("LOCATION","");
return prop;
}

@ -74,7 +74,7 @@ public class IndexCreateWWWRemoteQueue_p {
prop.putNum("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
final String urlHash = post.get("deleteEntry");
sb.crawlQueues.noticeURL.removeByURLHash(urlHash);
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
prop.put("LOCATION","");
return prop;
}

@ -137,7 +137,7 @@ public class QuickCrawlLink_p {
return prop;
}
final String urlhash = crawlingStartURL.hash();
final byte[] urlhash = crawlingStartURL.hash().getBytes();
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);

@ -102,7 +102,7 @@ public class yacydoc {
prop.putXML("dc_language", entry.language());
prop.putXML("yacy_loaddate", entry.loaddate().toString());
prop.putXML("yacy_referrer_hash", (le == null) ? "" : le.hash());
prop.putXML("yacy_referrer_hash", (le == null) ? "" : new String(le.hash()));
prop.putXML("yacy_referrer_url", (le == null) ? "" : le.metadata().url().toNormalform(false, true));
prop.put("yacy_size", entry.size());
prop.put("yacy_words",entry.wordCount());

@ -117,14 +117,14 @@ public final class crawlReceipt {
// generating a new loaded URL entry
final URIMetadataRow entry = URIMetadataRow.importEntry(propStr);
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "3600");
return prop;
}
final URIMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + new String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "3600");
return prop;
}
@ -132,7 +132,7 @@ public final class crawlReceipt {
// check if the entry is in our network domain
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(metadata.url());
if (urlRejectReason != null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + new String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "9999");
return prop;
}
@ -142,7 +142,7 @@ public final class crawlReceipt {
sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry);
sb.crawlResults.stack(entry, youare, iam, EventOrigin.REMOTE_RECEIPTS);
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + metadata.url().toNormalform(false, true));
if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + new String(entry.hash()) + ":" + metadata.url().toNormalform(false, true));
// ready for more
prop.put("delay", "10");

@ -124,7 +124,7 @@ public class urls {
prop.putXML("item_" + c + "_description", metadata.dc_title());
prop.put("item_" + c + "_author", metadata.dc_creator());
prop.put("item_" + c + "_pubDate", DateFormatter.formatShortSecond(entry.moddate()));
prop.put("item_" + c + "_guid", entry.hash());
prop.put("item_" + c + "_guid", new String(entry.hash()));
c++;
}
prop.put("item", c);

@ -406,7 +406,7 @@ public class yacysearch {
// delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash
indexSegment.termIndex().remove(Word.words2hashes(query[0]), delHash);
indexSegment.termIndex().remove(Word.words2hashes(query[0]), delHash.getBytes());
// make new news message with negative voting
final HashMap<String, String> map = new HashMap<String, String>();

@ -116,9 +116,9 @@ public class yacysearchitem {
prop.put("content_authorized", authenticated ? "1" : "0");
prop.put("content_authorized_recommend", (sb.peers.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0");
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", result.hash());
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", new String(result.hash()));
prop.putHTML("content_title", result.title());
prop.putXML("content_title-xml", result.title());
@ -126,12 +126,12 @@ public class yacysearchitem {
prop.putHTML("content_link", result.urlstring());
prop.put("content_display", display);
prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // aquire license for favicon url loading
prop.put("content_urlhash", result.hash());
prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(result.hash()));
prop.put("content_urlhash", new String(result.hash()));
prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(new String(result.hash())));
prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), urllength));
prop.put("content_date", Switchboard.dateString(result.modified()));
prop.put("content_date822", Switchboard.dateString822(result.modified()));
prop.put("content_ybr", RankingProcess.ybr(result.hash()));
prop.put("content_ybr", RankingProcess.ybr(new String(result.hash())));
prop.putHTML("content_size", Integer.toString(result.filesize())); // we don't use putNUM here because that number shall be usable as sorting key. To print the size, use 'sizename'
prop.putHTML("content_sizename", sizename(result.filesize()));
prop.putHTML("content_host", result.url().getHost());
@ -146,8 +146,8 @@ public class yacysearchitem {
} catch (final UnsupportedEncodingException e) {}
prop.putHTML("content_former", theQuery.queryString);
prop.put("content_rankingprops", result.word().toPropertyForm() + ", domLengthEstimated=" + DigestURI.domLengthEstimation(result.hash()) +
((DigestURI.probablyRootURL(result.hash())) ? ", probablyRootURL" : "") +
(((wordURL = DigestURI.probablyWordURL(result.hash(), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : ""));
((DigestURI.probablyRootURL(new String(result.hash()))) ? ", probablyRootURL" : "") +
(((wordURL = DigestURI.probablyWordURL(new String(result.hash()), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : ""));
final TextSnippet snippet = result.textSnippet();
final String desc = (snippet == null) ? "" : snippet.getLineMarked(theQuery.fullqueryHashes);
prop.put("content_description", desc);

@ -46,7 +46,7 @@ import de.anomic.http.client.Cache;
public class Balancer {
private static final String indexSuffix = "9.db";
private static final int EcoFSBufferSize = 200;
private static final int EcoFSBufferSize = 1000;
// class variables
private final ConcurrentHashMap<String, LinkedList<String>> domainStacks; // a map from domain name part to Lists with url hashs

@ -147,7 +147,7 @@ public class CrawlQueues {
return null;
}
public void urlRemove(final String hash) {
public void urlRemove(final byte[] hash) {
noticeURL.removeByURLHash(hash);
delegatedURL.remove(hash);
errorURL.remove(hash);

@ -180,9 +180,9 @@ public class NoticedURL {
* @param urlhash
* @return true, if the entry was removed; false if not
*/
public boolean removeByURLHash(final String urlhash) {
public boolean removeByURLHash(final byte[] urlhashBytes) {
final HashSet<String> urlHashes = new HashSet<String>();
urlHashes.add(urlhash);
urlHashes.add(new String(urlhashBytes));
try {return coreStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {return limitStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {return remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}

@ -71,7 +71,7 @@ public final class ResultURLs {
try {
final LinkedHashMap<String, InitExecEntry> resultStack = getStack(stackType);
if (resultStack != null) {
resultStack.put(e.hash(), new InitExecEntry(initiatorHash, executorHash));
resultStack.put(new String(e.hash()), new InitExecEntry(initiatorHash, executorHash));
}
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
@ -188,7 +188,7 @@ public final class ResultURLs {
EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
System.out.println("valid test:\n=======");
// add
results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
results.stack(urlRef, new String(urlRef.hash()), url.hash(), stackNo);
// size
System.out.println("size of stack:\t"+ results.getStackSize(stackNo));
} catch (final MalformedURLException e) {

@ -109,11 +109,11 @@ public class ZURL implements Iterable<ZURL.Entry> {
if (urlIndex != null) urlIndex.close();
}
public boolean remove(final String hash) {
if (hash == null) return false;
public boolean remove(final byte[] hashbytes) {
if (hashbytes == null) return false;
//System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash);
try {
urlIndex.remove(hash.getBytes());
urlIndex.remove(hashbytes);
return true;
} catch (final IOException e) {
return false;

@ -461,7 +461,7 @@ public class URLAnalysis {
HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile), 0);
System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
for (byte[] refhash: hs) {
mr.remove(new String(refhash));
mr.remove(refhash);
}
System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database");
}

@ -245,7 +245,7 @@ public class bookmarksDB {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
String urlhash = crawlingStartURL.hash();
byte[] urlhash = crawlingStartURL.hash().getBytes();
sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);

@ -133,7 +133,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
// Check if there is a more recent Entry already in the DB
URIMetadataRow oldEntry;
try {
Row.Entry oe = (urlIndexFile == null) ? null : urlIndexFile.get(entry.hash().getBytes());
Row.Entry oe = (urlIndexFile == null) ? null : urlIndexFile.get(entry.hash());
oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0);
} catch (final Exception e) {
Log.logException(e);
@ -156,10 +156,10 @@ public final class MetadataRepository implements Iterable<byte[]> {
statsDump = null;
}
public synchronized boolean remove(final String urlHash) {
if (urlHash == null) return false;
public synchronized boolean remove(final byte[] urlHashBytes) {
if (urlHashBytes == null) return false;
try {
final Row.Entry r = urlIndexFile.remove(urlHash.getBytes());
final Row.Entry r = urlIndexFile.remove(urlHashBytes);
if (r != null) statsDump = null;
return r != null;
} catch (final IOException e) {
@ -267,15 +267,15 @@ public final class MetadataRepository implements Iterable<byte[]> {
log.logInfo("URLs vorher: " + urlIndexFile.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());
final Iterator<String> eiter2 = damagedURLS.iterator();
String urlHash;
byte[] urlHashBytes;
while (eiter2.hasNext()) {
urlHash = eiter2.next();
urlHashBytes = eiter2.next().getBytes();
// trying to fix the invalid URL
String oldUrlStr = null;
try {
// getting the url data as byte array
final Row.Entry entry = urlIndexFile.get(urlHash.getBytes());
final Row.Entry entry = urlIndexFile.get(urlHashBytes);
// getting the wrong url string
oldUrlStr = entry.getColString(1, null).trim();
@ -302,15 +302,15 @@ public final class MetadataRepository implements Iterable<byte[]> {
if (res != null && res.getStatusCode() == 200) {
entry.setCol(1, newUrl.toString().getBytes());
urlIndexFile.put(entry);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + new String(urlHashBytes) + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else {
remove(urlHash);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (res == null ? "null" : res.getStatusLine()));
remove(urlHashBytes);
if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + new String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (res == null ? "null" : res.getStatusLine()));
}
}
} catch (final Exception e) {
remove(urlHash);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
remove(urlHashBytes);
if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + new String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
}
}
@ -365,27 +365,27 @@ public final class MetadataRepository implements Iterable<byte[]> {
final URIMetadataRow.Components metadata = entry.metadata();
totalSearchedUrls++;
if (metadata == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "corrupted entry for hash = " + entry.hash());
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "corrupted entry for hash = " + new String(entry.hash()));
remove(entry.hash());
continue;
}
if (metadata.url() == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + "URL == null");
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + new String(entry.hash()) + "URL == null");
remove(entry.hash());
continue;
}
if (blacklist.isListed(Blacklist.BLACKLIST_CRAWLER, metadata.url()) ||
blacklist.isListed(Blacklist.BLACKLIST_DHT, metadata.url())) {
lastBlacklistedUrl = metadata.url().toNormalform(true, true);
lastBlacklistedHash = entry.hash();
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + " " + metadata.url().toNormalform(false, true));
lastBlacklistedHash = new String(entry.hash());
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + new String(entry.hash()) + " " + metadata.url().toNormalform(false, true));
remove(entry.hash());
if (blacklistedUrls % 100 == 0) {
Log.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + lastBlacklistedUrl);
}
}
lastUrl = metadata.url().toNormalform(true, true);
lastHash = entry.hash();
lastHash = new String(entry.hash());
}
}
} catch (final RuntimeException e) {
@ -502,7 +502,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
String url;
while (i.hasNext()) {
entry = i.next();
if (this.set != null && !set.has(entry.hash().getBytes())) continue;
if (this.set != null && !set.has(entry.hash())) continue;
metadata = entry.metadata();
url = metadata.url().toNormalform(true, false);
if (!url.matches(filter)) continue;
@ -520,7 +520,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
if (metadata.dc_subject().length() > 0) pw.println("<description>" + CharacterCoding.unicode2xml(metadata.dc_subject(), true) + "</description>");
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
pw.println("<yacy:size>" + entry.size() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
pw.println("<guid isPermaLink=\"false\">" + new String(entry.hash()) + "</guid>");
pw.println("</item>");
}
count++;

@ -472,7 +472,7 @@ public final class RankingProcess extends Thread {
// accept url
//System.out.println("handing over hash " + page.hash());
this.handover.add(page.hash()); // remember that we handed over this url
this.handover.add(new String(page.hash())); // remember that we handed over this url
return page;
}
return null;

@ -145,7 +145,7 @@ public class ReferenceOrder {
int maxmaxpos = max.maxposition();
int minminpos = min.minposition();
final long r =
((256 - DigestURI.domLengthNormalized(t.metadataHash())) << ranking.coeff_domlength)
((256 - DigestURI.domLengthNormalized(t.metadataHash().getBytes())) << ranking.coeff_domlength)
+ ((ranking.coeff_ybr > 12) ? ((256 - (RankingProcess.ybr(t.metadataHash()) << 4)) << ranking.coeff_ybr) : 0)
+ ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)

@ -101,16 +101,16 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
}
}
public int hashCode() {
return urlentry.hash().hashCode();
return new String(urlentry.hash()).hashCode();
}
public boolean equals(final Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (!(obj instanceof ResultEntry)) return false;
ResultEntry other = (ResultEntry) obj;
return urlentry.hash().equals(other.urlentry.hash());
return Base64Order.enhancedCoder.equal(urlentry.hash(), other.urlentry.hash());
}
public String hash() {
public byte[] hash() {
return urlentry.hash();
}
public DigestURI url() {
@ -171,9 +171,9 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
return urlentry.toString(textSnippet.getLineRaw());
}
public int compareTo(ResultEntry o) {
return Base64Order.enhancedCoder.compare(this.urlentry.hash().getBytes(), o.urlentry.hash().getBytes());
return Base64Order.enhancedCoder.compare(this.urlentry.hash(), o.urlentry.hash());
}
public int compare(ResultEntry o1, ResultEntry o2) {
return Base64Order.enhancedCoder.compare(o1.urlentry.hash().getBytes(), o2.urlentry.hash().getBytes());
return Base64Order.enhancedCoder.compare(o1.urlentry.hash(), o2.urlentry.hash());
}
}

@ -162,7 +162,7 @@ public class ResultFetcher {
// get next entry
page = rankedCache.takeURL(true, taketimeout);
if (page == null) break;
if (failedURLs.get(page.hash()) != null) continue;
if (failedURLs.get(new String(page.hash())) != null) continue;
final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0
@ -236,7 +236,7 @@ public class ResultFetcher {
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
registerFailure(new String(page.hash()), "no text snippet for URL " + metadata.url());
if (!peers.mySeed().isVirgin())
try {
TextSnippet.failConsequences(query.getSegment(), page.word(), snippet, query.id(false));
@ -259,7 +259,7 @@ public class ResultFetcher {
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
registerFailure(new String(page.hash()), "no media snippet for URL " + metadata.url());
return null;
}
}

@ -384,7 +384,7 @@ public class Segment {
}
if (resourceb == null) {
// delete just the url entry
urlMetadata().remove(urlhash);
urlMetadata().remove(urlhash.getBytes());
return 0;
} else {
resourceContent = new ByteArrayInputStream(resourceb);
@ -403,10 +403,10 @@ public class Segment {
// delete all word references
int count = 0;
if (words != null) count = termIndex().remove(Word.words2hashes(words), urlhash);
if (words != null) count = termIndex().remove(Word.words2hashes(words), urlhash.getBytes());
// finally delete the url entry itself
urlMetadata().remove(urlhash);
urlMetadata().remove(urlhash.getBytes());
return count;
}
} catch (final ParserException e) {

@ -1031,15 +1031,15 @@ public final class Switchboard extends serverSwitch {
return this.crawlQueues.urlExists(hash);
}
public void urlRemove(final Segment segment, final String hash) {
public void urlRemove(final Segment segment, final byte[] hash) {
segment.urlMetadata().remove(hash);
crawlResults.remove(hash);
crawlResults.remove(new String(hash));
crawlQueues.urlRemove(hash);
}
public void urlRemove(final Segments.Process process, final String hash) {
public void urlRemove(final Segments.Process process, final byte[] hash) {
indexSegments.urlMetadata(process).remove(hash);
crawlResults.remove(hash);
crawlResults.remove(new String(hash));
crawlQueues.urlRemove(hash);
}

@ -579,7 +579,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public static String failConsequences(Segment indexSegment, final WordReferenceVars word, final TextSnippet snippet, final String eventID) throws IOException {
// problems with snippet fetch
final String urlHash = snippet.getUrl().hash();
final byte[] urlHash = snippet.getUrl().hash().getBytes();
final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' ');
if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
(snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||

@ -554,32 +554,32 @@ public final class yacyClient {
// get one single search result
urlEntry = URIMetadataRow.importEntry(result.get("resource" + n));
if (urlEntry == null) continue;
assert (urlEntry.hash().length() == 12) : "urlEntry.hash() = " + urlEntry.hash();
if (urlEntry.hash().length() != 12) continue; // bad url hash
assert (urlEntry.hash().length == 12) : "urlEntry.hash() = " + new String(urlEntry.hash());
if (urlEntry.hash().length != 12) continue; // bad url hash
final URIMetadataRow.Components metadata = urlEntry.metadata();
if (metadata == null) continue;
if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) {
yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
continue; // block with backlist
}
final String urlRejectReason = Switchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(metadata.url());
if (urlRejectReason != null) {
yacyCore.log.logInfo("remote search (client): rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
continue; // reject url outside of our domain
}
// save the url entry
Reference entry;
if (urlEntry.word() == null) {
yacyCore.log.logWarning("remote search (client): no word attached from peer " + target.getName() + ", version " + target.getVersion());
if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search (client): no word attached from peer " + target.getName() + ", version " + target.getVersion());
continue; // no word attached
}
// the search-result-url transports all the attributes of word indexes
entry = urlEntry.word();
if (!(entry.metadataHash().equals(urlEntry.hash()))) {
yacyCore.log.logInfo("remote search (client): url-hash " + urlEntry.hash() + " does not belong to word-attached-hash " + entry.metadataHash() + "; url = " + metadata.url() + " from peer " + target.getName());
if (!Base64Order.enhancedCoder.equal(entry.metadataHash().getBytes(), urlEntry.hash())) {
if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + entry.metadataHash() + "; url = " + metadata.url() + " from peer " + target.getName());
continue; // spammed
}
@ -597,7 +597,7 @@ public final class yacyClient {
// because they are search-specific.
// instead, they are placed in a snipped-search cache.
// System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
TextSnippet.storeToCache(wordhashes, urlEntry.hash(), urlEntry.snippet());
TextSnippet.storeToCache(wordhashes, new String(urlEntry.hash()), urlEntry.snippet());
}
// add the url entry to the word indexes
@ -611,7 +611,7 @@ public final class yacyClient {
}
// store url hash for statistics
urls[n] = urlEntry.hash();
urls[n] = new String(urlEntry.hash());
}
// store remote result to local result container
@ -846,7 +846,7 @@ public final class yacyClient {
final String salt = crypt.randomSalt();
final List<Part> post = yacyNetwork.basicRequestPost(Switchboard.getSwitchboard(), target.hash, salt);
post.add(new DefaultCharsetStringPart("process", process));
post.add(new DefaultCharsetStringPart("urlhash", ((entry == null) ? "" : entry.hash())));
post.add(new DefaultCharsetStringPart("urlhash", ((entry == null) ? "" : new String(entry.hash()))));
post.add(new DefaultCharsetStringPart("result", result));
post.add(new DefaultCharsetStringPart("reason", reason));
post.add(new DefaultCharsetStringPart("wordh", wordhashes));

@ -950,11 +950,11 @@ public class DigestURI implements Serializable {
return false;
}
public static final int domLengthEstimation(final String urlHash) {
public static final int domLengthEstimation(final byte[] urlHashBytes) {
// generates an estimation of the original domain length
assert (urlHash != null);
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
final int flagbyte = Base64Order.enhancedCoder.decodeByte(urlHash.charAt(11));
assert (urlHashBytes != null);
assert (urlHashBytes.length == 12) : "urlhash = " + new String(urlHashBytes);
final int flagbyte = Base64Order.enhancedCoder.decodeByte(urlHashBytes[11]);
final int domLengthKey = flagbyte & 3;
switch (domLengthKey) {
case 0:
@ -969,8 +969,8 @@ public class DigestURI implements Serializable {
return 20;
}
public static int domLengthNormalized(final String urlHash) {
return domLengthEstimation(urlHash) << 8 / 20;
public static int domLengthNormalized(final byte[] urlHashBytes) {
return domLengthEstimation(urlHashBytes) << 8 / 20;
}
public static final int domDomain(final String urlHash) {

@ -38,7 +38,7 @@ public interface URIMetadata {
public Row.Entry toRowEntry();
public String hash();
public byte[] hash();
public long ranking();

@ -311,7 +311,7 @@ public class URIMetadataRow implements URIMetadata {
if (metadata == null) return null;
//System.out.println("author=" + comp.author());
try {
s.append("hash=").append(hash());
s.append("hash=").append(new String(hash()));
s.append(",url=").append(crypt.simpleEncode(metadata.url().toNormalform(false, true)));
s.append(",descr=").append(crypt.simpleEncode(metadata.dc_title()));
s.append(",author=").append(crypt.simpleEncode(metadata.dc_creator()));
@ -353,12 +353,12 @@ public class URIMetadataRow implements URIMetadata {
return this.entry;
}
public String hash() {
public byte[] hash() {
// return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space
// (each byte has an 6-bit range)
// that should be enough for all web pages on the world
return new String(this.entry.getPrimaryKeyBytes());
return this.entry.getPrimaryKeyBytes();
}
public long ranking() {
@ -372,7 +372,7 @@ public class URIMetadataRow implements URIMetadata {
final Iterator<String> cl = FileUtils.strings(this.entry.getCol("comp", null));
this.comp = new Components(
(cl.hasNext()) ? cl.next() : "",
hash(),
new String(hash()),
(cl.hasNext()) ? cl.next() : "",
(cl.hasNext()) ? cl.next() : "",
(cl.hasNext()) ? cl.next() : "",

@ -132,6 +132,10 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Com
return alpha[b];
}
public final byte decodeByte(final byte b) {
return ahpla[b];
}
public final byte decodeByte(final char b) {
return ahpla[b];
}

@ -45,13 +45,13 @@ public abstract class AbstractIndex <ReferenceType extends Reference> implements
this.factory = factory;
}
public int remove(final TreeSet<byte[]> termHashes, final String urlHash) throws IOException {
public int remove(final TreeSet<byte[]> termHashes, final byte[] urlHashBytes) throws IOException {
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
final Iterator<byte[]> i = termHashes.iterator();
int c = 0;
while (i.hasNext()) {
if (remove(i.next(), urlHash)) c++;
if (remove(i.next(), urlHashBytes)) c++;
}
return c;
}

@ -105,7 +105,7 @@ public interface Index <ReferenceType extends Reference> {
* @return
* @throws IOException
*/
public boolean remove(byte[] termHash, String referenceHash) throws IOException;
public boolean remove(byte[] termHash, byte[] referenceHash) throws IOException;
/**
* remove a set of reference entries for a given word
@ -115,7 +115,7 @@ public interface Index <ReferenceType extends Reference> {
* @throws IOException
*/
public int remove(final byte[] termHash, Set<String> referenceHashes) throws IOException;
public int remove(final TreeSet<byte[]> termHashes, final String urlHash) throws IOException;
public int remove(final TreeSet<byte[]> termHashes, final byte[] urlHashBytes) throws IOException;
/**
* iterate all references from the beginning of a specific word hash

@ -268,9 +268,9 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
return removed + (reduced / this.array.rowdef().objectsize);
}
public boolean remove(byte[] termHash, String urlHash) throws IOException {
boolean removed = this.ram.remove(termHash, urlHash);
int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHash));
public boolean remove(byte[] termHash, byte[] urlHashBytes) throws IOException {
boolean removed = this.ram.remove(termHash, urlHashBytes);
int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashBytes));
this.countCache.remove(new ByteArray(termHash));
return removed || (reduced > 0);
}
@ -283,9 +283,9 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
this.urlHashes = urlHashes;
}
public RemoveRewriter(String urlHash) {
public RemoveRewriter(byte[] urlHashBytes) {
this.urlHashes = new HashSet<String>();
this.urlHashes.add(urlHash);
this.urlHashes.add(new String(urlHashBytes));
}
public ReferenceContainer<ReferenceType> rewrite(ReferenceContainer<ReferenceType> container) {

@ -328,12 +328,12 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
return cache.remove(new ByteArray(termHash));
}
public boolean remove(final byte[] termHash, final String urlHash) {
public boolean remove(final byte[] termHash, final byte[] urlHashBytes) {
assert this.cache != null;
ByteArray tha = new ByteArray(termHash);
synchronized (cache) {
final ReferenceContainer<ReferenceType> c = cache.get(tha);
if ((c != null) && (c.remove(urlHash) != null)) {
if ((c != null) && (c.remove(urlHashBytes) != null)) {
// removal successful
if (c.isEmpty()) {
delete(termHash);

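A side effect of this change, visible at many call sites above (new String(entry.hash()), Base64Order.enhancedCoder.equal, HashSet<String> filled via new String(urlhashBytes)): Java arrays use identity for equals() and hashCode(), so a byte[] hash cannot serve directly as a HashSet/HashMap key or be compared with equals(). A minimal plain-JDK sketch of the pitfall and the workaround used in this commit (not YaCy code):

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;

public class ByteArrayKeys {
    public static void main(String[] args) {
        byte[] a = "AAAAAAAAAAAA".getBytes();
        byte[] b = "AAAAAAAAAAAA".getBytes();

        System.out.println(a.equals(b));         // false: identity comparison only
        System.out.println(Arrays.equals(a, b)); // true: content comparison

        HashSet<byte[]> raw = new HashSet<>();
        raw.add(a);
        System.out.println(raw.contains(b));     // false: byte[] is unusable as a set key

        // workaround used throughout the patch: fall back to a String key where a
        // hash-based collection is still needed
        HashMap<String, String> byText = new HashMap<>();
        byText.put(new String(a), "http://example.org/");
        System.out.println(byText.containsKey(new String(b))); // true
    }
}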