From b18a7606a09e8ab34d3ad1cbd2b483b8b5b65c9d Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 25 Apr 2010 21:37:36 +0000 Subject: [PATCH] some performance hacks and fixes after reading dump in http://forum.yacy-websuche.de/viewtopic.php?p=19920#p19920 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6837 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.java | 4 ++-- htroot/yacy/search.java | 3 ++- htroot/yacysearchitem.java | 9 ++++----- source/de/anomic/search/RankingProcess.java | 11 ++++++----- source/de/anomic/search/ReferenceOrder.java | 9 ++++----- source/de/anomic/yacy/yacySearch.java | 9 +-------- source/net/yacy/kelondro/blob/MapHeap.java | 18 +++++++++++++----- .../net/yacy/kelondro/rwi/AbstractIndex.java | 12 +++++++----- source/net/yacy/kelondro/rwi/Index.java | 3 ++- source/net/yacy/kelondro/rwi/TermSearch.java | 15 ++++++++------- 10 files changed, 49 insertions(+), 44 deletions(-) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 9b576b8d1..8ad3555d3 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -416,9 +416,9 @@ public class IndexControlRWIs_p { prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn)); prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash())); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", RankingProcess.ybr(new String(entry.hash()))); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", RankingProcess.ybr(entry.hash())); prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", 1000.0 * entry.word().termFrequency()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(new String(entry.hash()))); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(entry.hash())); prop.put("genUrlList_urlList_"+i+"_urlExists_date", DateFormatter.formatShortDay(new Date(entry.word().lastModified()))); prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle()); prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext()); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index d026e24a8..c7ff42ffd 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -33,6 +33,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.TreeMap; import java.util.TreeSet; import net.yacy.document.content.RSSMessage; @@ -224,7 +225,7 @@ public final class search { final long timer = System.currentTimeMillis(); //final Map>[] containers = sb.indexSegment.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls)); - final HashMap> incc = indexSegment.termIndex().searchConjunction(theQuery.queryHashes, QueryParams.hashes2Handles(urls)); + final TreeMap> incc = indexSegment.termIndex().searchConjunction(theQuery.queryHashes, QueryParams.hashes2Handles(urls)); 
EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.COLLECTION, incc.size(), System.currentTimeMillis() - timer), false, 30000, ProfilingGraph.maxTime); if (incc != null) { diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index b7ac00dd9..0ab8ae923 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -119,19 +119,19 @@ public class yacysearchitem { prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*"); prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*"); prop.put("content_authorized_urlhash", new String(result.hash())); - + String resulthashString = new String(result.hash()); prop.putHTML("content_title", result.title()); prop.putXML("content_title-xml", result.title()); prop.putJSON("content_title-json", result.title()); prop.putHTML("content_link", result.urlstring()); prop.put("content_display", display); prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // aquire license for favicon url loading - prop.put("content_urlhash", new String(result.hash())); - prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(new String(result.hash()))); + prop.put("content_urlhash", resulthashString); + prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(resulthashString)); 
prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), urllength)); prop.put("content_date", Switchboard.dateString(result.modified())); prop.put("content_date822", Switchboard.dateString822(result.modified())); - prop.put("content_ybr", RankingProcess.ybr(new String(result.hash()))); + prop.put("content_ybr", RankingProcess.ybr(result.hash())); prop.putHTML("content_size", Integer.toString(result.filesize())); // we don't use putNUM here because that number shall be usable as sorting key. To print the size, use 'sizename' prop.putHTML("content_sizename", sizename(result.filesize())); prop.putHTML("content_host", result.url().getHost()); @@ -140,7 +140,6 @@ public class yacysearchitem { prop.put("content_nl", (item == 0) ? 0 : 1); final TreeSet[] query = theQuery.queryWords(); - DigestURI wordURL = null; try { prop.putHTML("content_words", URLEncoder.encode(query[0].toString(),"UTF-8")); } catch (final UnsupportedEncodingException e) {} diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index edc4f7ea8..6cd2621ae 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -31,10 +31,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; import java.util.ConcurrentModificationException; -import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; @@ -72,7 +72,7 @@ public final class RankingProcess extends Thread { private final int[] flagcount; // flag counter private final HandleSet misses; // contains url-hashes that could not been found in the LURL-DB //private final int[] domZones; - private HashMap> localSearchInclusion; + private TreeMap> localSearchInclusion; private int remote_resourceSize, remote_indexCount, 
remote_peerCount; private int local_resourceSize, local_indexCount; @@ -683,14 +683,15 @@ public final class RankingProcess extends Thread { useYBR = usage; } - public static int ybr(final String urlHash) { + public static int ybr(final byte[] urlHash) { // returns the YBR value in a range of 0..15, where 0 means best ranking and 15 means worst ranking if (ybrTables == null) return 15; if (!(useYBR)) return 15; - final String domHash = urlHash.substring(6); + byte[] domhash = new byte[6]; + System.arraycopy(urlHash, 6, domhash, 0, 6); final int m = Math.min(maxYBR, ybrTables.length); for (int i = 0; i < m; i++) { - if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) { + if ((ybrTables[i] != null) && (ybrTables[i].contains(domhash))) { //System.out.println("YBR FOUND: " + urlHash + " (" + i + ")"); return i; } diff --git a/source/de/anomic/search/ReferenceOrder.java b/source/de/anomic/search/ReferenceOrder.java index a6603e81b..6cd022611 100644 --- a/source/de/anomic/search/ReferenceOrder.java +++ b/source/de/anomic/search/ReferenceOrder.java @@ -177,8 +177,8 @@ public class ReferenceOrder { } } - public int authority(final String urlHash) { - return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount); + public int authority(final byte[] urlHash) { + return (doms.getScore(new String(urlHash, 6, 6)) << 8) / (1 + this.maxdomcount); } public long cardinal(final WordReferenceVars t) { @@ -193,10 +193,9 @@ public class ReferenceOrder { //System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf); int maxmaxpos = max.maxposition(); int minminpos = min.minposition(); - String mdhb = new String(t.metadataHash()); final long r = ((256 - DigestURI.domLengthNormalized(t.metadataHash())) << ranking.coeff_domlength) - + ((ranking.coeff_ybr > 12) ? 
((256 - (RankingProcess.ybr(mdhb) << 4)) << ranking.coeff_ybr) : 0) + + ((ranking.coeff_ybr > 12) ? ((256 - (RankingProcess.ybr(t.metadataHash()) << 4)) << ranking.coeff_ybr) : 0) + ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps) + ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength) + ((maxmaxpos == minminpos ) ? 0 : (256 - (((t.minposition() - minminpos ) << 8) / (maxmaxpos - minminpos) )) << ranking.coeff_posintext) @@ -211,7 +210,7 @@ public class ReferenceOrder { + ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother) + ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + tf - + ((ranking.coeff_authority > 12) ? (authority(mdhb) << ranking.coeff_authority) : 0) + + ((ranking.coeff_authority > 12) ? (authority(t.metadataHash()) << ranking.coeff_authority) : 0) + ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0) + ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0) + ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 
255 << ranking.coeff_app_dc_creator : 0) diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 6f0e2bf9d..1c764e7ff 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -135,14 +135,7 @@ public class yacySearch extends Thread { containerCache.oneFeederTerminated(); } } - /* - public static String set2string(final TreeSet hashes) { - String wh = ""; - final Iterator iter = hashes.iterator(); - while (iter.hasNext()) { wh = wh + new String(iter.next()); } - return wh; - } - */ + public static String set2string(final HandleSet hashes) { String wh = ""; final Iterator iter = hashes.iterator(); diff --git a/source/net/yacy/kelondro/blob/MapHeap.java b/source/net/yacy/kelondro/blob/MapHeap.java index 983a17a59..e85283880 100644 --- a/source/net/yacy/kelondro/blob/MapHeap.java +++ b/source/net/yacy/kelondro/blob/MapHeap.java @@ -227,8 +227,16 @@ public class MapHeap { private String normalizeKey(String key) { if (blob == null || key == null) return key; - if (key.length() > blob.keylength()) key = key.substring(0, blob.keylength()); - while (key.length() < blob.keylength()) key += fillchar; + if (key.length() > blob.keylength()) { + return key.substring(0, blob.keylength()); + } + if (key.length() < blob.keylength()) { + byte[] k = key.getBytes(); + byte[] b = new byte[blob.keylength()]; + System.arraycopy(k, 0, b, 0, k.length); + for (int i = k.length; i < b.length; i++) b[i] = (byte) fillchar; + return new String(b); + } return key; } @@ -237,13 +245,13 @@ public class MapHeap { if (key.length > blob.keylength()) { byte[] b = new byte[blob.keylength()]; System.arraycopy(key, 0, b, 0, blob.keylength()); - key = b; + return b; } if (key.length < blob.keylength()) { byte[] b = new byte[blob.keylength()]; System.arraycopy(key, 0, b, 0, key.length); - for (int i = key.length; i < blob.keylength(); i++) b[i] = (byte) fillchar; - key = b; + for (int i = key.length; i < b.length; i++) b[i] = 
(byte) fillchar; + return b; } return key; } diff --git a/source/net/yacy/kelondro/rwi/AbstractIndex.java b/source/net/yacy/kelondro/rwi/AbstractIndex.java index d1cd203f7..9e262b58d 100644 --- a/source/net/yacy/kelondro/rwi/AbstractIndex.java +++ b/source/net/yacy/kelondro/rwi/AbstractIndex.java @@ -30,11 +30,13 @@ package net.yacy.kelondro.rwi; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; +import java.util.TreeMap; import java.util.TreeSet; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Order; public abstract class AbstractIndex implements Index { @@ -91,16 +93,16 @@ public abstract class AbstractIndex implements * @param urlselection * @return map of wordhash:indexContainer */ - public HashMap> searchConjunction(final HandleSet wordHashes, final HandleSet urlselection) { + public TreeMap> searchConjunction(final HandleSet wordHashes, final HandleSet urlselection) { // first check if there is any entry that has no match; this uses only operations in ram /* Iterator i = wordHashes.iterator(); while (i.hasNext()) { - if (!this.has(i.next())); return new HashMap>(0); + if (!this.has(i.next())); return new TreeMap>(0); } */ // retrieve entities that belong to the hashes - final HashMap> containers = new HashMap>(wordHashes.size()); + final TreeMap> containers = new TreeMap>(Base64Order.enhancedCoder); byte[] singleHash; ReferenceContainer singleContainer; final Iterator i = wordHashes.iterator(); @@ -118,7 +120,7 @@ public abstract class AbstractIndex implements } // check result - if ((singleContainer == null || singleContainer.isEmpty())) return new HashMap>(0); + if ((singleContainer == null || singleContainer.isEmpty())) return new TreeMap>(Base64Order.enhancedCoder); containers.put(singleHash, singleContainer); } @@ -136,7 +138,7 @@ public abstract class 
AbstractIndex implements * @return ReferenceContainer the join result * @throws RowSpaceExceededException */ - public ReferenceContainer searchJoin(final TreeSet wordHashes, final HandleSet urlselection, final int maxDistance) throws RowSpaceExceededException { + public ReferenceContainer searchJoin(final HandleSet wordHashes, final HandleSet urlselection, final int maxDistance) throws RowSpaceExceededException { // first check if there is any entry that has no match; // this uses only operations in ram for (byte[] wordHash: wordHashes) { diff --git a/source/net/yacy/kelondro/rwi/Index.java b/source/net/yacy/kelondro/rwi/Index.java index 5cfc1af18..aa2a9ef85 100644 --- a/source/net/yacy/kelondro/rwi/Index.java +++ b/source/net/yacy/kelondro/rwi/Index.java @@ -30,6 +30,7 @@ package net.yacy.kelondro.rwi; import java.io.IOException; import java.util.HashMap; +import java.util.TreeMap; import java.util.TreeSet; import net.yacy.kelondro.index.HandleSet; @@ -144,7 +145,7 @@ public interface Index { * @param urlselection * @return map of wordhash:indexContainer */ - public HashMap> searchConjunction(final HandleSet wordHashes, final HandleSet urlselection); + public TreeMap> searchConjunction(final HandleSet wordHashes, final HandleSet urlselection); /** * delete all references entries diff --git a/source/net/yacy/kelondro/rwi/TermSearch.java b/source/net/yacy/kelondro/rwi/TermSearch.java index 4f2bebf4e..a7fce041e 100644 --- a/source/net/yacy/kelondro/rwi/TermSearch.java +++ b/source/net/yacy/kelondro/rwi/TermSearch.java @@ -27,16 +27,17 @@ package net.yacy.kelondro.rwi; -import java.util.HashMap; +import java.util.TreeMap; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; +import net.yacy.kelondro.order.Base64Order; public class TermSearch { private final ReferenceContainer joinResult; - HashMap> inclusionContainers; + TreeMap> inclusionContainers; public TermSearch( Index base, @@ -48,16 +49,16 @@ public class 
TermSearch { this.inclusionContainers = (queryHashes.isEmpty()) ? - new HashMap>(0) : + new TreeMap>(Base64Order.enhancedCoder) : base.searchConjunction(queryHashes, urlselection); if (!inclusionContainers.isEmpty() && (inclusionContainers.size() < queryHashes.size())) - inclusionContainers = new HashMap>(0); // prevent that only a subset is returned + inclusionContainers = new TreeMap>(Base64Order.enhancedCoder); // prevent that only a subset is returned - HashMap> exclusionContainers = + TreeMap> exclusionContainers = (inclusionContainers.isEmpty()) ? - new HashMap>(0) : + new TreeMap>(Base64Order.enhancedCoder) : base.searchConjunction(excludeHashes, urlselection); // join and exclude the result @@ -72,7 +73,7 @@ public class TermSearch { return this.joinResult; } - public HashMap> inclusion() { + public TreeMap> inclusion() { return this.inclusionContainers; }