diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 8f164bc6d..41df95519 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -113,7 +113,7 @@ public class CrawlResults { final String hash = post.get("hash", null); if (hash != null) { // delete from database - sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(hash); + sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(hash.getBytes()); } } diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 7c5b7d69b..e95865164 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -228,7 +228,7 @@ public class Crawler_p { // stack request // first delete old entry, if exists final DigestURI url = new DigestURI(crawlingStart, null); - final String urlhash = url.hash(); + final byte[] urlhash = url.hash().getBytes(); indexSegment.urlMetadata().remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); sb.crawlQueues.errorURL.remove(urlhash); diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index de0f7b4de..dcd1c4df3 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -170,7 +170,7 @@ public class IndexControlRWIs_p { // now delete all urls if demanded if (delurl || delurlref) { for (i = 0; i < urlx.length; i++) { - sb.urlRemove(segment, urlx[i]); + sb.urlRemove(segment, urlx[i].getBytes()); } } post.remove("keyhashdeleteall"); @@ -186,7 +186,7 @@ public class IndexControlRWIs_p { } if (delurl || delurlref) { for (i = 0; i < urlx.length; i++) { - sb.urlRemove(segment, urlx[i]); + sb.urlRemove(segment, urlx[i].getBytes()); } } final Set urlHashes = new HashSet(); @@ -313,7 +313,7 @@ public class IndexControlRWIs_p { for (i = 0; i < urlx.length; i++) { urlHashes.add(urlx[i]); final URIMetadataRow e = segment.urlMetadata().load(urlx[i], null, 0); - segment.urlMetadata().remove(urlx[i]); + segment.urlMetadata().remove(urlx[i].getBytes()); if (e != null) { url = e.metadata().url(); pw.println(url.getHost() + "/" + url.getFile()); @@ -342,7 +342,7 @@ public class IndexControlRWIs_p { for (i = 0; i 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn)); prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash())); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", RankingProcess.ybr(entry.hash())); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", RankingProcess.ybr(new String(entry.hash()))); prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", 1000.0 * entry.word().termFrequency()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(entry.hash())); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(new String(entry.hash()))); prop.put("genUrlList_urlList_"+i+"_urlExists_date", DateFormatter.formatShortDay(new Date(entry.word().lastModified()))); prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle()); prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext()); diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 896db3c62..a12c86262 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -153,7 +153,7 @@ public class IndexControlURLs_p { } else { urlstring = entry.metadata().url().toNormalform(false, true); prop.put("urlstring", ""); - sb.urlRemove(segment, urlhash); + sb.urlRemove(segment, urlhash.getBytes()); prop.putHTML("result", "Removed URL " + urlstring); } prop.put("lurlexport", 0); @@ -169,7 +169,7 @@ public class IndexControlURLs_p { if ((urlhash == null) || (urlstring == null)) { prop.put("result", "No input given; nothing deleted."); } else { - sb.urlRemove(segment, urlhash); + sb.urlRemove(segment, urlhash.getBytes()); prop.putHTML("result", "Removed URL " + urlstring); } prop.put("lurlexport", 0); @@ -223,7 +223,7 @@ public class IndexControlURLs_p { while (entryIt.hasNext() && i < 256) { entry = entryIt.next(); if (entry == null) break; - prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash()); + prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", new String(entry.hash())); cols++; if (cols==8) { prop.put("urlhashsimilar_rows_"+rows+"_cols", cols); @@ -340,7 +340,7 @@ public class IndexControlURLs_p { prop.put("genUrlProfile_loaddate", entry.loaddate().toString()); prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1); prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "" : le.metadata().url().toNormalform(false, true)); - prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : le.hash()); + prop.put("genUrlProfile_referrer_hash", (le == null) ? 
"" : new String(le.hash())); prop.put("genUrlProfile_doctype", String.valueOf(entry.doctype())); prop.put("genUrlProfile_language", entry.language()); prop.put("genUrlProfile_size", entry.size()); diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index c4d4dc05d..7677d2c89 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -77,7 +77,7 @@ public class IndexCreateWWWGlobalQueue_p { prop.putNum("info_numEntries", c); } else if (post.containsKey("deleteEntry")) { final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash); + sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); prop.put("LOCATION",""); return prop; } diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 097b39066..4f24013a8 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -134,7 +134,7 @@ public class IndexCreateWWWLocalQueue_p { if (value != null) { final Matcher matcher = compiledPattern.matcher(value); if (matcher.find()) { - sb.crawlQueues.noticeURL.removeByURLHash(entry.url().hash()); + sb.crawlQueues.noticeURL.removeByURLHash(entry.url().hash().getBytes()); } } } @@ -148,7 +148,7 @@ public class IndexCreateWWWLocalQueue_p { prop.putNum("info_numEntries", c); } else if (post.containsKey("deleteEntry")) { final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash); + sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); prop.put("LOCATION",""); return prop; } diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java index c7d1994bf..6edc377c6 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ b/htroot/IndexCreateWWWRemoteQueue_p.java @@ -74,7 +74,7 @@ public class IndexCreateWWWRemoteQueue_p { prop.putNum("info_numEntries", c); } else if (post.containsKey("deleteEntry")) { final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash); + sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); prop.put("LOCATION",""); return prop; } diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 3629430b6..8f0d94466 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -137,7 +137,7 @@ public class QuickCrawlLink_p { return prop; } - final String urlhash = crawlingStartURL.hash(); + final byte[] urlhash = crawlingStartURL.hash().getBytes(); indexSegment.urlMetadata().remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); sb.crawlQueues.errorURL.remove(urlhash); diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index 353d4ad28..24b42da8a 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -102,7 +102,7 @@ public class yacydoc { prop.putXML("dc_language", entry.language()); prop.putXML("yacy_loaddate", entry.loaddate().toString()); - prop.putXML("yacy_referrer_hash", (le == null) ? "" : le.hash()); + prop.putXML("yacy_referrer_hash", (le == null) ? "" : new String(le.hash())); prop.putXML("yacy_referrer_url", (le == null) ? 
"" : le.metadata().url().toNormalform(false, true)); prop.put("yacy_size", entry.size()); prop.put("yacy_words",entry.wordCount()); diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index bcd1c1f4d..00c08c274 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -117,14 +117,14 @@ public final class crawlReceipt { // generating a new loaded URL entry final URIMetadataRow entry = URIMetadataRow.importEntry(propStr); if (entry == null) { - log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); + if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); return prop; } final URIMetadataRow.Components metadata = entry.metadata(); if (metadata.url() == null) { - log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr); + if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + new String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); return prop; } @@ -132,7 +132,7 @@ public final class crawlReceipt { // check if the entry is in our network domain final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(metadata.url()); if (urlRejectReason != null) { - log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr); + if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + new String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "9999"); return prop; } @@ -142,7 +142,7 @@ public final class crawlReceipt { sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry); sb.crawlResults.stack(entry, youare, iam, EventOrigin.REMOTE_RECEIPTS); sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done - log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + metadata.url().toNormalform(false, true)); + if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + new String(entry.hash()) + ":" + metadata.url().toNormalform(false, true)); // ready for more prop.put("delay", "10"); diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 7d81f375a..540016f29 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -124,7 +124,7 @@ public class urls { prop.putXML("item_" + c + "_description", metadata.dc_title()); prop.put("item_" + c + "_author", metadata.dc_creator()); prop.put("item_" + c + "_pubDate", DateFormatter.formatShortSecond(entry.moddate())); - prop.put("item_" + c + "_guid", entry.hash()); + prop.put("item_" + c + "_guid", new String(entry.hash())); c++; } prop.put("item", c); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index be4c1da61..5c11bb8d0 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -406,7 +406,7 @@ public class yacysearch { // delete the index entry locally final String delHash = post.get("deleteref", ""); // urlhash - indexSegment.termIndex().remove(Word.words2hashes(query[0]), delHash); + indexSegment.termIndex().remove(Word.words2hashes(query[0]), delHash.getBytes()); // make new news message 
with negative voting final HashMap map = new HashMap(); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 562050357..899369955 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -116,9 +116,9 @@ public class yacysearchitem { prop.put("content_authorized", authenticated ? "1" : "0"); prop.put("content_authorized_recommend", (sb.peers.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0"); - prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*"); - prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*"); - prop.put("content_authorized_urlhash", result.hash()); + prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*"); + prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*"); + prop.put("content_authorized_urlhash", new String(result.hash())); prop.putHTML("content_title", result.title()); prop.putXML("content_title-xml", result.title()); @@ -126,12 +126,12 @@ public class yacysearchitem { prop.putHTML("content_link", result.urlstring()); prop.put("content_display", display); prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // aquire license for favicon url loading - prop.put("content_urlhash", result.hash()); - prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(result.hash())); + prop.put("content_urlhash", new String(result.hash())); + prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(new String(result.hash()))); prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), urllength)); prop.put("content_date", Switchboard.dateString(result.modified())); prop.put("content_date822", Switchboard.dateString822(result.modified())); - prop.put("content_ybr", RankingProcess.ybr(result.hash())); + prop.put("content_ybr", RankingProcess.ybr(new String(result.hash()))); prop.putHTML("content_size", Integer.toString(result.filesize())); // we don't use putNUM here because that number shall be usable as sorting key. 
To print the size, use 'sizename' prop.putHTML("content_sizename", sizename(result.filesize())); prop.putHTML("content_host", result.url().getHost()); @@ -146,8 +146,8 @@ public class yacysearchitem { } catch (final UnsupportedEncodingException e) {} prop.putHTML("content_former", theQuery.queryString); prop.put("content_rankingprops", result.word().toPropertyForm() + ", domLengthEstimated=" + DigestURI.domLengthEstimation(result.hash()) + - ((DigestURI.probablyRootURL(result.hash())) ? ", probablyRootURL" : "") + - (((wordURL = DigestURI.probablyWordURL(result.hash(), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : "")); + ((DigestURI.probablyRootURL(new String(result.hash()))) ? ", probablyRootURL" : "") + + (((wordURL = DigestURI.probablyWordURL(new String(result.hash()), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : "")); final TextSnippet snippet = result.textSnippet(); final String desc = (snippet == null) ? "" : snippet.getLineMarked(theQuery.fullqueryHashes); prop.put("content_description", desc); diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index f19561f56..ed7a922f6 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -46,7 +46,7 @@ import de.anomic.http.client.Cache; public class Balancer { private static final String indexSuffix = "9.db"; - private static final int EcoFSBufferSize = 200; + private static final int EcoFSBufferSize = 1000; // class variables private final ConcurrentHashMap> domainStacks; // a map from domain name part to Lists with url hashs diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 68a1a2e82..4e36be789 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -147,7 +147,7 @@ public class CrawlQueues { return null; } - public void urlRemove(final String hash) { + public void urlRemove(final byte[] hash) { noticeURL.removeByURLHash(hash); delegatedURL.remove(hash); errorURL.remove(hash); diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index bf76603b3..576b19770 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -180,9 +180,9 @@ public class NoticedURL { * @param urlhash * @return true, if the entry was removed; false if not */ - public boolean removeByURLHash(final String urlhash) { + public boolean removeByURLHash(final byte[] urlhashBytes) { final HashSet urlHashes = new HashSet(); - urlHashes.add(urlhash); + urlHashes.add(new String(urlhashBytes)); try {return coreStack.remove(urlHashes) > 0;} catch (final IOException e) {} try {return limitStack.remove(urlHashes) > 0;} catch (final IOException e) {} try {return remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {} diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index 7308b0658..3062c29e9 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -71,7 +71,7 @@ public final class ResultURLs { try { final LinkedHashMap resultStack = getStack(stackType); if (resultStack != null) { - resultStack.put(e.hash(), new InitExecEntry(initiatorHash, executorHash)); + resultStack.put(new String(e.hash()), new InitExecEntry(initiatorHash, executorHash)); } } catch (final Exception ex) { System.out.println("INTERNAL ERROR in newEntry/2: " + 
ex.toString()); @@ -188,7 +188,7 @@ public final class ResultURLs { EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING; System.out.println("valid test:\n======="); // add - results.stack(urlRef, urlRef.hash(), url.hash(), stackNo); + results.stack(urlRef, new String(urlRef.hash()), url.hash(), stackNo); // size System.out.println("size of stack:\t"+ results.getStackSize(stackNo)); } catch (final MalformedURLException e) { diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index ee256e9a7..0f9fce8c3 100755 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -109,11 +109,11 @@ public class ZURL implements Iterable { if (urlIndex != null) urlIndex.close(); } - public boolean remove(final String hash) { - if (hash == null) return false; + public boolean remove(final byte[] hashbytes) { + if (hashbytes == null) return false; //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash); try { - urlIndex.remove(hash.getBytes()); + urlIndex.remove(hashbytes); return true; } catch (final IOException e) { return false; diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index 62f2db406..361c93e10 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -461,7 +461,7 @@ public class URLAnalysis { HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile), 0); System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize); for (byte[] refhash: hs) { - mr.remove(new String(refhash)); + mr.remove(refhash); } System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database"); } diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index a2ecd31b7..b02f7bec5 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -245,7 +245,7 @@ public class bookmarksDB { // check if the crawl filter works correctly Pattern.compile(newcrawlingMustMatch); - String urlhash = crawlingStartURL.hash(); + byte[] urlhash = crawlingStartURL.hash().getBytes(); sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); sb.crawlQueues.errorURL.remove(urlhash); diff --git a/source/de/anomic/search/MetadataRepository.java b/source/de/anomic/search/MetadataRepository.java index 8c7dc201d..c13bc723c 100644 --- a/source/de/anomic/search/MetadataRepository.java +++ b/source/de/anomic/search/MetadataRepository.java @@ -133,7 +133,7 @@ public final class MetadataRepository implements Iterable { // Check if there is a more recent Entry already in the DB URIMetadataRow oldEntry; try { - Row.Entry oe = (urlIndexFile == null) ? null : urlIndexFile.get(entry.hash().getBytes()); + Row.Entry oe = (urlIndexFile == null) ? null : urlIndexFile.get(entry.hash()); oldEntry = (oe == null) ? 
null : new URIMetadataRow(oe, null, 0); } catch (final Exception e) { Log.logException(e); @@ -156,10 +156,10 @@ public final class MetadataRepository implements Iterable { statsDump = null; } - public synchronized boolean remove(final String urlHash) { - if (urlHash == null) return false; + public synchronized boolean remove(final byte[] urlHashBytes) { + if (urlHashBytes == null) return false; try { - final Row.Entry r = urlIndexFile.remove(urlHash.getBytes()); + final Row.Entry r = urlIndexFile.remove(urlHashBytes); if (r != null) statsDump = null; return r != null; } catch (final IOException e) { @@ -267,15 +267,15 @@ public final class MetadataRepository implements Iterable { log.logInfo("URLs vorher: " + urlIndexFile.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size()); final Iterator eiter2 = damagedURLS.iterator(); - String urlHash; + byte[] urlHashBytes; while (eiter2.hasNext()) { - urlHash = eiter2.next(); + urlHashBytes = eiter2.next().getBytes(); // trying to fix the invalid URL String oldUrlStr = null; try { // getting the url data as byte array - final Row.Entry entry = urlIndexFile.get(urlHash.getBytes()); + final Row.Entry entry = urlIndexFile.get(urlHashBytes); // getting the wrong url string oldUrlStr = entry.getColString(1, null).trim(); @@ -302,15 +302,15 @@ public final class MetadataRepository implements Iterable { if (res != null && res.getStatusCode() == 200) { entry.setCol(1, newUrl.toString().getBytes()); urlIndexFile.put(entry); - log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr); + if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + new String(urlHashBytes) + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr); } else { - remove(urlHash); - log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (res == null ? "null" : res.getStatusLine())); + remove(urlHashBytes); + if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + new String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (res == null ? 
"null" : res.getStatusLine())); } } } catch (final Exception e) { - remove(urlHash); - log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage()); + remove(urlHashBytes); + if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + new String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage()); } } @@ -365,27 +365,27 @@ public final class MetadataRepository implements Iterable { final URIMetadataRow.Components metadata = entry.metadata(); totalSearchedUrls++; if (metadata == null) { - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "corrupted entry for hash = " + entry.hash()); + if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "corrupted entry for hash = " + new String(entry.hash())); remove(entry.hash()); continue; } if (metadata.url() == null) { - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + "URL == null"); + if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + new String(entry.hash()) + "URL == null"); remove(entry.hash()); continue; } if (blacklist.isListed(Blacklist.BLACKLIST_CRAWLER, metadata.url()) || blacklist.isListed(Blacklist.BLACKLIST_DHT, metadata.url())) { lastBlacklistedUrl = metadata.url().toNormalform(true, true); - lastBlacklistedHash = entry.hash(); - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + " " + metadata.url().toNormalform(false, true)); + lastBlacklistedHash = new String(entry.hash()); + if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + new String(entry.hash()) + " " + metadata.url().toNormalform(false, true)); remove(entry.hash()); if (blacklistedUrls % 100 == 0) { Log.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. 
Last deleted URL-Hash: " + lastBlacklistedUrl); } } lastUrl = metadata.url().toNormalform(true, true); - lastHash = entry.hash(); + lastHash = new String(entry.hash()); } } } catch (final RuntimeException e) { @@ -502,7 +502,7 @@ public final class MetadataRepository implements Iterable { String url; while (i.hasNext()) { entry = i.next(); - if (this.set != null && !set.has(entry.hash().getBytes())) continue; + if (this.set != null && !set.has(entry.hash())) continue; metadata = entry.metadata(); url = metadata.url().toNormalform(true, false); if (!url.matches(filter)) continue; @@ -520,7 +520,7 @@ public final class MetadataRepository implements Iterable { if (metadata.dc_subject().length() > 0) pw.println("" + CharacterCoding.unicode2xml(metadata.dc_subject(), true) + ""); pw.println("" + entry.moddate().toString() + ""); pw.println("" + entry.size() + ""); - pw.println("" + entry.hash() + ""); + pw.println("" + new String(entry.hash()) + ""); pw.println(""); } count++; diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index acff75be1..c502e8224 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -472,7 +472,7 @@ public final class RankingProcess extends Thread { // accept url //System.out.println("handing over hash " + page.hash()); - this.handover.add(page.hash()); // remember that we handed over this url + this.handover.add(new String(page.hash())); // remember that we handed over this url return page; } return null; diff --git a/source/de/anomic/search/ReferenceOrder.java b/source/de/anomic/search/ReferenceOrder.java index 6f4646730..aee4910a3 100644 --- a/source/de/anomic/search/ReferenceOrder.java +++ b/source/de/anomic/search/ReferenceOrder.java @@ -145,7 +145,7 @@ public class ReferenceOrder { int maxmaxpos = max.maxposition(); int minminpos = min.minposition(); final long r = - ((256 - DigestURI.domLengthNormalized(t.metadataHash())) << ranking.coeff_domlength) + ((256 - DigestURI.domLengthNormalized(t.metadataHash().getBytes())) << ranking.coeff_domlength) + ((ranking.coeff_ybr > 12) ? ((256 - (RankingProcess.ybr(t.metadataHash()) << 4)) << ranking.coeff_ybr) : 0) + ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps) + ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength) diff --git a/source/de/anomic/search/ResultEntry.java b/source/de/anomic/search/ResultEntry.java index a2cde7941..e4dc52f3f 100644 --- a/source/de/anomic/search/ResultEntry.java +++ b/source/de/anomic/search/ResultEntry.java @@ -101,16 +101,16 @@ public class ResultEntry implements Comparable, Comparator, Comparator, Comparator post = yacyNetwork.basicRequestPost(Switchboard.getSwitchboard(), target.hash, salt); post.add(new DefaultCharsetStringPart("process", process)); - post.add(new DefaultCharsetStringPart("urlhash", ((entry == null) ? "" : entry.hash()))); + post.add(new DefaultCharsetStringPart("urlhash", ((entry == null) ? 
"" : new String(entry.hash())))); post.add(new DefaultCharsetStringPart("result", result)); post.add(new DefaultCharsetStringPart("reason", reason)); post.add(new DefaultCharsetStringPart("wordh", wordhashes)); diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index da793cdf2..33ab275af 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -950,11 +950,11 @@ public class DigestURI implements Serializable { return false; } - public static final int domLengthEstimation(final String urlHash) { + public static final int domLengthEstimation(final byte[] urlHashBytes) { // generates an estimation of the original domain length - assert (urlHash != null); - assert (urlHash.length() == 12) : "urlhash = " + urlHash; - final int flagbyte = Base64Order.enhancedCoder.decodeByte(urlHash.charAt(11)); + assert (urlHashBytes != null); + assert (urlHashBytes.length == 12) : "urlhash = " + new String(urlHashBytes); + final int flagbyte = Base64Order.enhancedCoder.decodeByte(urlHashBytes[11]); final int domLengthKey = flagbyte & 3; switch (domLengthKey) { case 0: @@ -969,8 +969,8 @@ public class DigestURI implements Serializable { return 20; } - public static int domLengthNormalized(final String urlHash) { - return domLengthEstimation(urlHash) << 8 / 20; + public static int domLengthNormalized(final byte[] urlHashBytes) { + return domLengthEstimation(urlHashBytes) << 8 / 20; } public static final int domDomain(final String urlHash) { diff --git a/source/net/yacy/kelondro/data/meta/URIMetadata.java b/source/net/yacy/kelondro/data/meta/URIMetadata.java index 098389b33..d1c7eb0b5 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadata.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadata.java @@ -38,7 +38,7 @@ public interface URIMetadata { public Row.Entry toRowEntry(); - public String hash(); + public byte[] hash(); public long ranking(); diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index b3d5c5ef9..f431da77f 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -311,7 +311,7 @@ public class URIMetadataRow implements URIMetadata { if (metadata == null) return null; //System.out.println("author=" + comp.author()); try { - s.append("hash=").append(hash()); + s.append("hash=").append(new String(hash())); s.append(",url=").append(crypt.simpleEncode(metadata.url().toNormalform(false, true))); s.append(",descr=").append(crypt.simpleEncode(metadata.dc_title())); s.append(",author=").append(crypt.simpleEncode(metadata.dc_creator())); @@ -353,12 +353,12 @@ public class URIMetadataRow implements URIMetadata { return this.entry; } - public String hash() { + public byte[] hash() { // return a url-hash, based on the md5 algorithm // the result is a String of 12 bytes within a 72-bit space // (each byte has an 6-bit range) // that should be enough for all web pages on the world - return new String(this.entry.getPrimaryKeyBytes()); + return this.entry.getPrimaryKeyBytes(); } public long ranking() { @@ -372,7 +372,7 @@ public class URIMetadataRow implements URIMetadata { final Iterator cl = FileUtils.strings(this.entry.getCol("comp", null)); this.comp = new Components( (cl.hasNext()) ? cl.next() : "", - hash(), + new String(hash()), (cl.hasNext()) ? cl.next() : "", (cl.hasNext()) ? cl.next() : "", (cl.hasNext()) ? 
cl.next() : "", diff --git a/source/net/yacy/kelondro/order/Base64Order.java b/source/net/yacy/kelondro/order/Base64Order.java index 727836446..60b2323d7 100644 --- a/source/net/yacy/kelondro/order/Base64Order.java +++ b/source/net/yacy/kelondro/order/Base64Order.java @@ -132,6 +132,10 @@ public class Base64Order extends AbstractOrder implements ByteOrder, Com return alpha[b]; } + public final byte decodeByte(final byte b) { + return ahpla[b]; + } + public final byte decodeByte(final char b) { return ahpla[b]; } diff --git a/source/net/yacy/kelondro/rwi/AbstractIndex.java b/source/net/yacy/kelondro/rwi/AbstractIndex.java index 9fc87684e..96ace1d73 100644 --- a/source/net/yacy/kelondro/rwi/AbstractIndex.java +++ b/source/net/yacy/kelondro/rwi/AbstractIndex.java @@ -45,13 +45,13 @@ public abstract class AbstractIndex implements this.factory = factory; } - public int remove(final TreeSet termHashes, final String urlHash) throws IOException { + public int remove(final TreeSet termHashes, final byte[] urlHashBytes) throws IOException { // remove the same url hashes for multiple words // this is mainly used when correcting a index after a search final Iterator i = termHashes.iterator(); int c = 0; while (i.hasNext()) { - if (remove(i.next(), urlHash)) c++; + if (remove(i.next(), urlHashBytes)) c++; } return c; } diff --git a/source/net/yacy/kelondro/rwi/Index.java b/source/net/yacy/kelondro/rwi/Index.java index 72f10ac04..10a1b365b 100644 --- a/source/net/yacy/kelondro/rwi/Index.java +++ b/source/net/yacy/kelondro/rwi/Index.java @@ -105,7 +105,7 @@ public interface Index { * @return * @throws IOException */ - public boolean remove(byte[] termHash, String referenceHash) throws IOException; + public boolean remove(byte[] termHash, byte[] referenceHash) throws IOException; /** * remove a set of reference entries for a given word @@ -115,7 +115,7 @@ public interface Index { * @throws IOException */ public int remove(final byte[] termHash, Set referenceHashes) throws IOException; - public int remove(final TreeSet termHashes, final String urlHash) throws IOException; + public int remove(final TreeSet termHashes, final byte[] urlHashBytes) throws IOException; /** * iterate all references from the beginning of a specific word hash diff --git a/source/net/yacy/kelondro/rwi/IndexCell.java b/source/net/yacy/kelondro/rwi/IndexCell.java index d6de1e39b..d2af11348 100644 --- a/source/net/yacy/kelondro/rwi/IndexCell.java +++ b/source/net/yacy/kelondro/rwi/IndexCell.java @@ -268,9 +268,9 @@ public final class IndexCell extends AbstractBu return removed + (reduced / this.array.rowdef().objectsize); } - public boolean remove(byte[] termHash, String urlHash) throws IOException { - boolean removed = this.ram.remove(termHash, urlHash); - int reduced = this.array.replace(termHash, new RemoveRewriter(urlHash)); + public boolean remove(byte[] termHash, byte[] urlHashBytes) throws IOException { + boolean removed = this.ram.remove(termHash, urlHashBytes); + int reduced = this.array.replace(termHash, new RemoveRewriter(urlHashBytes)); this.countCache.remove(new ByteArray(termHash)); return removed || (reduced > 0); } @@ -283,9 +283,9 @@ public final class IndexCell extends AbstractBu this.urlHashes = urlHashes; } - public RemoveRewriter(String urlHash) { + public RemoveRewriter(byte[] urlHashBytes) { this.urlHashes = new HashSet(); - this.urlHashes.add(urlHash); + this.urlHashes.add(new String(urlHashBytes)); } public ReferenceContainer rewrite(ReferenceContainer container) { diff --git 
a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java
index b5b053b29..52893ce88 100644
--- a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java
+++ b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java
@@ -328,12 +328,12 @@ public final class ReferenceContainerCache exte
         return cache.remove(new ByteArray(termHash));
     }
 
-    public boolean remove(final byte[] termHash, final String urlHash) {
+    public boolean remove(final byte[] termHash, final byte[] urlHashBytes) {
         assert this.cache != null;
         ByteArray tha = new ByteArray(termHash);
         synchronized (cache) {
            final ReferenceContainer c = cache.get(tha);
-            if ((c != null) && (c.remove(urlHash) != null)) {
+            if ((c != null) && (c.remove(urlHashBytes) != null)) {
                // removal successful
                if (c.isEmpty()) {
                    delete(termHash);
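
The patch above applies one calling-convention change throughout the codebase: URL hashes travel as byte[] instead of String, with callers converting once at the boundary via getBytes() and display/logging sites wrapping the bytes back with new String(hash). The MetadataRepository and ZURL hunks suggest the motivation: the underlying kelondro index already stores keys as bytes (urlIndexFile.remove(urlHash.getBytes()) becomes urlIndexFile.remove(urlHashBytes)), so passing byte[] end to end removes a conversion on every call. Below is a minimal, self-contained sketch of that before/after pattern; it is not part of the commit and not YaCy code — the class name, the backing HashMap, and the sample hash value are hypothetical, and only the remove(byte[]) shape mirrors the diff.

// Editor's sketch, not part of the commit: illustrates the String -> byte[] hash migration.
import java.util.HashMap;
import java.util.Map;

public class UrlHashMigrationSketch {

    // hypothetical stand-in for the url metadata index
    private final Map<String, String> urlIndex = new HashMap<String, String>();

    // after the change: the index-facing removal takes the raw hash bytes
    public boolean remove(final byte[] urlHashBytes) {
        if (urlHashBytes == null) return false;
        return urlIndex.remove(new String(urlHashBytes)) != null;
    }

    public static void main(final String[] args) {
        final UrlHashMigrationSketch repo = new UrlHashMigrationSketch();
        repo.urlIndex.put("HKcmIvwYA8aB", "http://example.org/"); // sample 12-character hash, hypothetical

        // before: callers passed the String hash directly, e.g. repo.remove("HKcmIvwYA8aB")
        // after: callers convert once at the boundary, as the diff does with getBytes()
        final boolean removed = repo.remove("HKcmIvwYA8aB".getBytes());
        System.out.println("removed: " + removed);
    }
}

Usage note: like the diff itself, the sketch uses the platform-default charset for getBytes() and new String(byte[]); a stricter variant would pin an explicit charset, but that is beyond what this commit changes.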