From 727feb4358cbdfa9c47c9b5b6f8b86e3d5210f91 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 21 Feb 2008 10:06:57 +0000 Subject: [PATCH] - fixed some bugs in ranking computation - introduced generalized method to organize ranked results (2 new classes) - added a post-ranking after snippet-fetch (before: only listed) using the new ranking data structures - fixed some missing data fields in RWI ranking attributes and correct hand-over between data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4498 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/search.java | 9 +- source/de/anomic/index/indexRWIEntry.java | 1 + .../de/anomic/index/indexRWIEntryOrder.java | 80 +++----- source/de/anomic/index/indexRWIRowEntry.java | 3 + source/de/anomic/index/indexRWIVarEntry.java | 176 ++++++++++++------ .../de/anomic/kelondro/kelondroSortStack.java | 147 +++++++++++++++ .../de/anomic/kelondro/kelondroSortStore.java | 135 ++++++++++++++ .../de/anomic/plasma/plasmaSearchEvent.java | 93 ++++----- .../plasma/plasmaSearchRankingProcess.java | 148 +++++++-------- yacy.network.group | 2 +- 10 files changed, 542 insertions(+), 252 deletions(-) create mode 100644 source/de/anomic/kelondro/kelondroSortStack.java create mode 100644 source/de/anomic/kelondro/kelondroSortStore.java diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index f64e0beb2..bf4c64187 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -38,6 +38,7 @@ import de.anomic.http.httpHeader; import de.anomic.index.indexContainer; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; +import de.anomic.kelondro.kelondroSortStack; import de.anomic.net.natLib; import de.anomic.plasma.plasmaProfiling; import de.anomic.plasma.plasmaSearchEvent; @@ -148,7 +149,7 @@ public final class search { int indexabstractContainercount = 0; int joincount = 0; plasmaSearchQuery theQuery = null; - ArrayList accu = null; + ArrayList.stackElement> accu = 
null; plasmaSearchEvent theSearch = null; if ((query.length() == 0) && (abstractSet != null)) { // this is _not_ a normal search, only a request for index abstracts @@ -258,10 +259,10 @@ public final class search { long timer = System.currentTimeMillis(); StringBuffer links = new StringBuffer(); String resource = null; - plasmaSearchEvent.ResultEntry entry; + kelondroSortStack.stackElement entry; for (int i = 0; i < accu.size(); i++) { - entry = (plasmaSearchEvent.ResultEntry) accu.get(i); - resource = entry.resource(); + entry = accu.get(i); + resource = entry.element.resource(); if (resource != null) { links.append("resource").append(i).append('=').append(resource).append(serverCore.CRLF_STRING); } diff --git a/source/de/anomic/index/indexRWIEntry.java b/source/de/anomic/index/indexRWIEntry.java index 47d79efe6..5dbb4e620 100644 --- a/source/de/anomic/index/indexRWIEntry.java +++ b/source/de/anomic/index/indexRWIEntry.java @@ -86,4 +86,5 @@ public interface indexRWIEntry { public boolean isOlder(indexRWIEntry other); + public int hashCode(); } \ No newline at end of file diff --git a/source/de/anomic/index/indexRWIEntryOrder.java b/source/de/anomic/index/indexRWIEntryOrder.java index e25260d00..15b4b0679 100644 --- a/source/de/anomic/index/indexRWIEntryOrder.java +++ b/source/de/anomic/index/indexRWIEntryOrder.java @@ -31,16 +31,14 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import de.anomic.kelondro.kelondroAbstractOrder; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroMScoreCluster; -import de.anomic.kelondro.kelondroOrder; import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.yacy.yacyURL; -public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondroOrder { +public class indexRWIEntryOrder { private indexRWIVarEntry min, max; private plasmaSearchRankingProfile ranking; 
private kelondroMScoreCluster doms; // collected for "authority" heuristic @@ -69,8 +67,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder mmf0.start(); // fork here minmaxfinder mmf1 = new minmaxfinder(container, middle, container.size()); mmf1.run(); // execute other fork in this thread - if (this.min == null) this.min = mmf1.entryMin; else indexRWIVarEntry.min(this.min, mmf1.entryMin); - if (this.max == null) this.max = mmf1.entryMax; else indexRWIVarEntry.max(this.max, mmf1.entryMax); + if (this.min == null) this.min = mmf1.entryMin.clone(); else this.min.min(mmf1.entryMin); + if (this.max == null) this.max = mmf1.entryMax.clone(); else this.max.max(mmf1.entryMax); Map.Entry entry; Iterator> di = mmf1.domcount().entrySet().iterator(); while (di.hasNext()) { @@ -78,8 +76,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue()); } try {mmf0.join();} catch (InterruptedException e) {} // wait for fork thread to finish - if (this.min == null) this.min = mmf0.entryMin; else indexRWIVarEntry.min(this.min, mmf0.entryMin); - if (this.max == null) this.max = mmf0.entryMax; else indexRWIVarEntry.max(this.max, mmf0.entryMax); + if (this.min == null) this.min = mmf0.entryMin.clone(); else this.min.min(mmf0.entryMin); + if (this.max == null) this.max = mmf0.entryMax.clone(); else this.max.max(mmf0.entryMax); di = mmf0.domcount().entrySet().iterator(); while (di.hasNext()) { entry = di.next(); @@ -93,8 +91,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder // run minmax in one thread minmaxfinder mmf = new minmaxfinder(container, 0, container.size()); mmf.run(); // execute without multi-threading - if (this.min == null) this.min = mmf.entryMin; else indexRWIVarEntry.min(this.min, mmf.entryMin); - if (this.max == null) this.max = mmf.entryMax; else indexRWIVarEntry.max(this.max, mmf.entryMax); + if (this.min == null) this.min = mmf.entryMin.clone(); else 
this.min.min(mmf.entryMin); + if (this.max == null) this.max = mmf.entryMax.clone(); else this.max.max(mmf.entryMax); Map.Entry entry; Iterator> di = mmf.domcount().entrySet().iterator(); while (di.hasNext()) { @@ -109,44 +107,34 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder return result; } - public kelondroOrder clone() { - return null; - } - public int authority(String urlHash) { return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount); } - - public long cardinal(byte[] key) { - return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key))); - } - - public long cardinal(indexRWIRowEntry t) { - return cardinal(new indexRWIVarEntry(t)); - } public long cardinal(indexRWIVarEntry t) { //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords); // the normalizedEntry must be a normalized indexEntry kelondroBitfield flags = t.flags(); + long tf = ((max.termFrequency() == min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-min.termFrequency())*256.0)/(max.termFrequency() - min.termFrequency())))) << ranking.coeff_termfrequency); + //System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf); long r = - ((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength) + ((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength) + ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr) - + ((t.urlcomps() == 0) ? 0 : ((256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)) - + ((t.urllength() == 0) ? 0 : ((256 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength)) - + ((t.posintext() == 0) ? 
0 : ((256 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext)) - + ((t.posofphrase() == 0) ? 0 : ((256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)) - + ((t.posinphrase() == 0) ? 0 : ((256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)) - + ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance) - + ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date) - + ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle) - + ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext) - + ( (((t.phrasesintext()- min.phrasesintext() ) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext) - + ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal) - + ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother) - + ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) - + (((int)((((t.termFrequency()- min.termFrequency() )*256.0)/ (1 + max.termFrequency()- min.termFrequency()))))<< ranking.coeff_termfrequency) - + ( authority(t.urlHash()) << ranking.coeff_authority) + + ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps) + + ((max.urllength() == min.urllength() ) ? 
0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength) + + ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext) + + ((max.posofphrase() == min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase) + + ((max.posinphrase() == min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase) + + ((max.worddistance() == min.worddistance()) ? 0 : (256 - (((t.worddistance() - min.worddistance() ) << 8) / (max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance) + + ((max.virtualAge() == min.virtualAge()) ? 0 : (((t.virtualAge() - min.virtualAge() ) << 8) / (max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date) + + ((max.wordsintitle() == min.wordsintitle()) ? 0 : (((t.wordsintitle() - min.wordsintitle() ) << 8) / (max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle) + + ((max.wordsintext() == min.wordsintext()) ? 0 : (((t.wordsintext() - min.wordsintext() ) << 8) / (max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext) + + ((max.phrasesintext() == min.phrasesintext()) ? 0 : (((t.phrasesintext()- min.phrasesintext() ) << 8) / (max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext) + + ((max.llocal() == min.llocal()) ? 0 : (((t.llocal() - min.llocal() ) << 8) / (max.llocal() - min.llocal()) ) << ranking.coeff_llocal) + + ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother) + + ((max.hitcount() == min.hitcount()) ? 
0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + + tf + + (authority(t.urlHash()) << ranking.coeff_authority) + (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)) + (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)) + (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)) @@ -163,20 +151,6 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap } - - public int compare(indexRWIVarEntry a, indexRWIVarEntry b) { - long ca = cardinal(a); - long cb = cardinal(b); - return (ca > cb) ? 1 : (ca < cb) ? -1 : 0; - } - - public String signature() { - return "rx"; - } - - public boolean wellformed(indexRWIVarEntry a) { - return true; - } public static class minmaxfinder extends Thread { @@ -208,8 +182,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++))); this.decodedEntries.add(iEntry); // find min/max - if (this.entryMin == null) this.entryMin = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.min(this.entryMin, iEntry); - if (this.entryMax == null) this.entryMax = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.max(this.entryMax, iEntry); + if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry); + if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry); // update domcount dom = iEntry.urlHash().substring(6); count = (Integer) doms.get(dom); diff --git a/source/de/anomic/index/indexRWIRowEntry.java b/source/de/anomic/index/indexRWIRowEntry.java index 61a89cfd5..476d6f2ea 100644 --- a/source/de/anomic/index/indexRWIRowEntry.java +++ b/source/de/anomic/index/indexRWIRowEntry.java @@ -269,4 
+269,7 @@ public final class indexRWIRowEntry implements indexRWIEntry { return false; } + public int hashCode() { + return this.urlHash().hashCode(); + } } \ No newline at end of file diff --git a/source/de/anomic/index/indexRWIVarEntry.java b/source/de/anomic/index/indexRWIVarEntry.java index 3be6f6cbd..fd55e5a55 100644 --- a/source/de/anomic/index/indexRWIVarEntry.java +++ b/source/de/anomic/index/indexRWIVarEntry.java @@ -27,6 +27,7 @@ package de.anomic.index; import de.anomic.kelondro.kelondroBitfield; +import de.anomic.plasma.plasmaWordIndex; public class indexRWIVarEntry implements indexRWIEntry { @@ -40,7 +41,52 @@ public class indexRWIVarEntry implements indexRWIEntry { worddistance, wordsintext, wordsintitle; public double termFrequency; - public indexRWIVarEntry(indexRWIEntry e) { + public indexRWIVarEntry(String urlHash, + int urlLength, // byte-length of complete URL + int urlComps, // number of path components + int titleLength, // length of description/length (longer are better?) 
+ int hitcount, // how often appears this word in the text + int wordcount, // total number of words + int phrasecount, // total number of phrases + int posintext, // position of word in all words + int posinphrase, // position of word in its phrase + int posofphrase, // number of the phrase where word appears + long lastmodified, // last-modified time of the document where word appears + long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short + String language, // (guessed) language of document + char doctype, // type of document + int outlinksSame, // outlinks to same domain + int outlinksOther, // outlinks to other domain + kelondroBitfield flags, // attributes to the url and to the word according the url + int worddistance, + double termfrequency + ) { + if ((language == null) || (language.length() != 2)) language = "uk"; + int mddlm = plasmaWordIndex.microDateDays(lastmodified); + int mddct = plasmaWordIndex.microDateDays(updatetime); + this.flags = flags; + this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2); + this.lastModified = lastmodified; + this.language = language; + this.urlHash = urlHash; + this.type = doctype; + this.hitcount = hitcount; + this.llocal = outlinksSame; + this.lother = outlinksOther; + this.phrasesintext = outlinksOther; + this.posintext = posintext; + this.posinphrase = posinphrase; + this.posofphrase = posofphrase; + this.urlcomps = urlComps; + this.urllength = urlLength; + this.virtualAge = mddlm; + this.worddistance = worddistance; + this.wordsintext = wordcount; + this.wordsintitle = titleLength; + this.termFrequency = termfrequency; + } + + public indexRWIVarEntry(indexRWIRowEntry e) { this.flags = e.flags(); this.freshUntil = e.freshUntil(); this.lastModified = e.lastModified(); @@ -60,18 +106,43 @@ public class indexRWIVarEntry implements indexRWIEntry { this.worddistance = 0; this.wordsintext = e.wordsintext(); this.wordsintitle = e.wordsintitle(); - 
this.termFrequency = 0.0; + this.termFrequency = e.termFrequency(); + } + + public indexRWIVarEntry clone() { + indexRWIVarEntry c = new indexRWIVarEntry( + this.urlHash, + this.urllength, + this.urlcomps, + this.wordsintitle, + this.hitcount, + this.wordsintext, + this.phrasesintext, + this.posintext, + this.posinphrase, + this.posofphrase, + this.lastModified, + System.currentTimeMillis(), + this.language, + this.type, + this.llocal, + this.lother, + this.flags, + this.worddistance, + this.termFrequency); + return c; } public void join(indexRWIVarEntry oe) { // combine the distance - this.worddistance = this.worddistance() + oe.worddistance() + Math.abs(this.posintext() - oe.posintext()); - this.posintext = Math.min(this.posintext(), oe.posintext()); - this.posinphrase = (this.posofphrase() == oe.posofphrase()) ? Math.min(this.posinphrase(), oe.posinphrase()) : 0; - this.posofphrase = Math.min(this.posofphrase(), oe.posofphrase()); + this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext); + this.posintext = Math.min(this.posintext, oe.posintext); + this.posinphrase = (this.posofphrase == oe.posofphrase) ? 
Math.min(this.posinphrase, oe.posinphrase) : 0; + this.posofphrase = Math.min(this.posofphrase, oe.posofphrase); // combine term frequency - this.wordsintext = this.wordsintext() + oe.wordsintext(); + this.wordsintext = this.wordsintext + oe.wordsintext; + this.termFrequency = this.termFrequency + oe.termFrequency; } public kelondroBitfield flags() { @@ -191,66 +262,65 @@ public class indexRWIVarEntry implements indexRWIEntry { return this.termFrequency; } - public static final void min(indexRWIVarEntry t, indexRWIVarEntry other) { + public final void min(indexRWIVarEntry other) { int v; long w; double d; - if (t.hitcount() > (v = other.hitcount())) t.hitcount = v; - if (t.llocal() > (v = other.llocal())) t.llocal = v; - if (t.lother() > (v = other.lother())) t.lother = v; - if (t.virtualAge() > (v = other.virtualAge())) t.virtualAge = v; - if (t.wordsintext() > (v = other.wordsintext())) t.wordsintext = v; - if (t.phrasesintext() > (v = other.phrasesintext())) t.phrasesintext = v; - if (t.posintext() > (v = other.posintext())) t.posintext = v; - if (t.posinphrase() > (v = other.posinphrase())) t.posinphrase = v; - if (t.posofphrase() > (v = other.posofphrase())) t.posofphrase = v; - if (t.worddistance() > (v = other.worddistance())) t.worddistance = v; - if (t.lastModified() > (w = other.lastModified())) t.lastModified = w; - if (t.freshUntil() > (w = other.freshUntil())) t.freshUntil = w; - if (t.urllength() > (v = other.urllength())) t.urllength = v; - if (t.urlcomps() > (v = other.urlcomps())) t.urlcomps = v; - if (t.wordsintitle() > (v = other.wordsintitle())) t.wordsintitle = v; - if (t.termFrequency > (d = other.termFrequency())) t.termFrequency = d; + if (this.hitcount > (v = other.hitcount)) this.hitcount = v; + if (this.llocal > (v = other.llocal)) this.llocal = v; + if (this.lother > (v = other.lother)) this.lother = v; + if (this.virtualAge > (v = other.virtualAge)) this.virtualAge = v; + if (this.wordsintext > (v = other.wordsintext)) this.wordsintext 
= v; + if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v; + if (this.posintext > (v = other.posintext)) this.posintext = v; + if (this.posinphrase > (v = other.posinphrase)) this.posinphrase = v; + if (this.posofphrase > (v = other.posofphrase)) this.posofphrase = v; + if (this.worddistance > (v = other.worddistance)) this.worddistance = v; + if (this.lastModified > (w = other.lastModified)) this.lastModified = w; + if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w; + if (this.urllength > (v = other.urllength)) this.urllength = v; + if (this.urlcomps > (v = other.urlcomps)) this.urlcomps = v; + if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v; + if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d; } - public static final void max(indexRWIVarEntry t, indexRWIVarEntry other) { + public final void max(indexRWIVarEntry other) { int v; long w; double d; - if (t.hitcount() < (v = other.hitcount())) t.hitcount = v; - if (t.llocal() < (v = other.llocal())) t.llocal = v; - if (t.lother() < (v = other.lother())) t.lother = v; - if (t.virtualAge() < (v = other.virtualAge())) t.virtualAge = v; - if (t.wordsintext() < (v = other.wordsintext())) t.wordsintext = v; - if (t.phrasesintext() < (v = other.phrasesintext())) t.phrasesintext = v; - if (t.posintext() < (v = other.posintext())) t.posintext = v; - if (t.posinphrase() < (v = other.posinphrase())) t.posinphrase = v; - if (t.posofphrase() < (v = other.posofphrase())) t.posofphrase = v; - if (t.worddistance() < (v = other.worddistance())) t.worddistance = v; - if (t.lastModified() < (w = other.lastModified())) t.lastModified = w; - if (t.freshUntil() < (w = other.freshUntil())) t.freshUntil = w; - if (t.urllength() < (v = other.urllength())) t.urllength = v; - if (t.urlcomps() < (v = other.urlcomps())) t.urlcomps = v; - if (t.wordsintitle() < (v = other.wordsintitle())) t.wordsintitle = v; - if (t.termFrequency < (d = other.termFrequency())) 
t.termFrequency = d; + if (this.hitcount < (v = other.hitcount)) this.hitcount = v; + if (this.llocal < (v = other.llocal)) this.llocal = v; + if (this.lother < (v = other.lother)) this.lother = v; + if (this.virtualAge < (v = other.virtualAge)) this.virtualAge = v; + if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v; + if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v; + if (this.posintext < (v = other.posintext)) this.posintext = v; + if (this.posinphrase < (v = other.posinphrase)) this.posinphrase = v; + if (this.posofphrase < (v = other.posofphrase)) this.posofphrase = v; + if (this.worddistance < (v = other.worddistance)) this.worddistance = v; + if (this.lastModified < (w = other.lastModified)) this.lastModified = w; + if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w; + if (this.urllength < (v = other.urllength)) this.urllength = v; + if (this.urlcomps < (v = other.urlcomps)) this.urlcomps = v; + if (this.wordsintitle < (v = other.wordsintitle)) this.wordsintitle = v; + if (this.termFrequency < (d = other.termFrequency)) this.termFrequency = d; } - public static void join(indexRWIVarEntry ie1, indexRWIEntry ie2) { - // returns a modified entry of the first argument + public void join(indexRWIEntry oe) { + // joins two entries into one entry // combine the distance - ie1.worddistance = ie1.worddistance + ((ie2 instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) ie2).worddistance() : 0) + Math.abs(ie1.posintext() - ie2.posintext()); - ie1.posintext = Math.min(ie1.posintext(), ie2.posintext()); - ie1.posinphrase = (ie1.posofphrase() == ie2.posofphrase()) ? Math.min(ie1.posinphrase(), ie2.posinphrase()) : 0; - ie1.posofphrase = Math.min(ie1.posofphrase(), ie2.posofphrase()); + this.worddistance = this.worddistance + ((oe instanceof indexRWIVarEntry) ? 
((indexRWIVarEntry) oe).worddistance : 0) + Math.abs(this.posintext() - oe.posintext()); + this.posintext = Math.min(this.posintext, oe.posintext()); + this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0; + this.posofphrase = Math.min(this.posofphrase, oe.posofphrase()); // combine term frequency - ie1.termFrequency = ie1.termFrequency + ie2.termFrequency(); - ie1.wordsintext = ie1.wordsintext() + ie2.wordsintext(); - } - - public void join(indexRWIEntry oe) { - join(this, oe); + this.termFrequency = this.termFrequency + oe.termFrequency(); + this.wordsintext = this.wordsintext + oe.wordsintext(); } + public int hashCode() { + return this.urlHash.hashCode(); + } } diff --git a/source/de/anomic/kelondro/kelondroSortStack.java b/source/de/anomic/kelondro/kelondroSortStack.java new file mode 100644 index 000000000..126846949 --- /dev/null +++ b/source/de/anomic/kelondro/kelondroSortStack.java @@ -0,0 +1,147 @@ +// kelondroSortStack.java +// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 20.02.2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
/**
 * A stack whose elements 'float' to the top according to a weight value:
 * the element with the smallest weight is the top of the stack, the element
 * with the largest weight is the bottom.
 *
 * <p>Elements are identified by their {@code hashCode()}. A hash code that has
 * been pushed once stays registered, so the same element cannot be pushed a
 * second time, even after it has been popped, removed or evicted.
 *
 * <p>All methods that touch the internal collections hold the instance monitor.
 *
 * @param <E> type of the stacked elements
 */
public class kelondroSortStack<E> {

    protected TreeMap<Long, E> onstack;   // elements currently on the stack, ordered by weight
    protected HashSet<Integer> instack;   // hash codes of every element that has ever been pushed
    protected int maxsize;

    /**
     * @param maxsize maximum number of entries on the stack; a value of 0 or
     *                less means that the size is not limited
     */
    public kelondroSortStack(int maxsize) {
        this.onstack = new TreeMap<Long, E>();
        this.instack = new HashSet<Integer>();
        this.maxsize = maxsize;
    }

    /** @return the number of elements currently on the stack */
    public synchronized int size() {
        return this.onstack.size();
    }

    /** Pushes the element of the given stack element with its weight. */
    public synchronized void push(stackElement se) {
        push(se.element, se.weight);
    }

    /**
     * Pushes an element unless an element with the same hash code has been
     * pushed before. If the weight is already in use, the nearest free weight
     * is taken instead. If the stack then exceeds its maximum size, the
     * elements with the largest weights are dropped.
     *
     * @param element the element to push
     * @param weight  the wanted weight; smaller weights are closer to the top
     */
    public synchronized void push(E element, long weight) {
        if (exists(element)) return;

        // put the element on the stack at a weight that is not yet occupied
        this.onstack.put(freeWeight(weight), element);

        // register it for double-check
        this.instack.add(Integer.valueOf(element.hashCode()));

        // check the maximum size of the stack and remove elements if the stack gets too large
        if (this.maxsize <= 0) return;
        while (this.onstack.size() > this.maxsize) {
            this.onstack.remove(this.onstack.lastKey());
        }
    }

    /**
     * Finds an unoccupied weight near the wanted one. The search goes upwards
     * first; it never steps past Long.MAX_VALUE (that would wrap around to
     * Long.MIN_VALUE and put the element on top of the stack) but continues
     * downwards from the wanted weight instead.
     */
    private Long freeWeight(long weight) {
        long w = weight;
        while ((w < Long.MAX_VALUE) && this.onstack.containsKey(Long.valueOf(w))) {
            w++;
        }
        if (!this.onstack.containsKey(Long.valueOf(w))) {
            return Long.valueOf(w);
        }
        // every weight from the wanted one up to Long.MAX_VALUE is taken
        w = weight;
        while (this.onstack.containsKey(Long.valueOf(w))) {
            w--;
        }
        return Long.valueOf(w);
    }

    /**
     * @return the element that is currently on top of the stack without
     *         removing it, or null if the stack is empty
     */
    public synchronized stackElement top() {
        if (this.onstack.isEmpty()) return null;
        Long w = this.onstack.firstKey();
        return new stackElement(this.onstack.get(w), w.longValue());
    }

    /**
     * @return the element that is currently on top of the stack after
     *         removing it, or null if the stack is empty
     */
    public synchronized stackElement pop() {
        if (this.onstack.isEmpty()) return null;
        Long w = this.onstack.firstKey();
        E element = this.onstack.remove(w);
        return new stackElement(element, w.longValue());
    }

    /** @return true if an element with the same hash code has ever been pushed */
    public synchronized boolean exists(E element) {
        return this.instack.contains(Integer.valueOf(element.hashCode()));
    }

    /** @return true if an element with this hash code has ever been pushed */
    public synchronized boolean exists(int hashcode) {
        return this.instack.contains(Integer.valueOf(hashcode));
    }

    /**
     * @return the first element on the stack (in weight order) with the given
     *         hash code, or null if there is none; the stack is not changed
     */
    public synchronized stackElement get(int hashcode) {
        for (Map.Entry<Long, E> entry : this.onstack.entrySet()) {
            if (entry.getValue().hashCode() == hashcode) {
                return new stackElement(entry.getValue(), entry.getKey().longValue());
            }
        }
        return null;
    }

    /**
     * Removes the first element on the stack (in weight order) with the given
     * hash code. The hash code stays registered, see {@link #exists(int)}.
     *
     * @return the removed element, or null if there is none
     */
    public synchronized stackElement remove(int hashcode) {
        Iterator<Map.Entry<Long, E>> i = this.onstack.entrySet().iterator();
        Map.Entry<Long, E> entry;
        while (i.hasNext()) {
            entry = i.next();
            if (entry.getValue().hashCode() == hashcode) {
                stackElement se = new stackElement(entry.getValue(), entry.getKey().longValue());
                i.remove(); // remove through the iterator instead of modifying the map behind its back
                return se;
            }
        }
        return null;
    }

    /**
     * @return true if an element with that weight would be on the bottom of
     *         the stack after inserting; on an empty stack this is always true
     */
    public synchronized boolean bottom(long weight) {
        // lastKey() throws a NoSuchElementException on an empty map
        if (this.onstack.isEmpty()) return true;
        return weight > this.onstack.lastKey().longValue();
    }

    /** An element together with the weight it has (or had) on the stack. */
    public class stackElement {
        public long weight;
        public E element;
        public stackElement(E element, long weight) {
            this.element = element;
            this.weight = weight;
        }
    }
}
a/source/de/anomic/kelondro/kelondroSortStore.java b/source/de/anomic/kelondro/kelondroSortStore.java new file mode 100644 index 000000000..2326a7495 --- /dev/null +++ b/source/de/anomic/kelondro/kelondroSortStore.java @@ -0,0 +1,135 @@ +// kelondroSortStore.java +// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 20.02.2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro; + +import java.util.ArrayList; +import java.util.Iterator; + +public class kelondroSortStore extends kelondroSortStack { + + // extends the sortStack in such a way that it adds a list where objects, that had + // been pulled from the stack with pop are listed. Provides access methods to address + // specific elements in the list. 
+ + private ArrayList offstack; // objects that had been on the stack but had been removed + + public kelondroSortStore(int maxsize) { + super(maxsize); + this.offstack = new ArrayList(); + } + + public int size() { + return super.onstack.size() + this.offstack.size(); + } + + public int sizeStore() { + return this.offstack.size(); + } + + public synchronized void push(E element, long weight) { + super.push(element, weight); + if (this.maxsize <= 0) return; + while ((this.onstack.size() > 0) && (super.onstack.size() + this.offstack.size() > this.maxsize)) { + this.onstack.remove(this.onstack.lastKey()); + } + } + + public synchronized stackElement pop() { + // returns the element that is currently on top of the stack + // it is removed and added to the offstack list + // this is exactly the same as element(offstack.size()) + stackElement se = super.pop(); + if (se == null) return null; + this.offstack.add(se); + return se; + } + + public synchronized stackElement element(int position) { + // returns an element from a specific position. It is either taken from the offstack, + // or removed from the onstack. + // The offstack will grow if elements are not from the offstack and present at the onstack. + if (position < this.offstack.size()) { + return this.offstack.get(position); + } + if (position >= size()) return null; // we don't have that element + while (position >= this.offstack.size()) { + Long w = this.onstack.firstKey(); + E element = this.onstack.remove(w); + stackElement se = new stackElement(element, w.longValue()); + this.offstack.add(se); + } + return this.offstack.get(position); + } + + public ArrayList list(int count) { + // returns the specific amount of entries. 
If they are not yet present in the offstack, they are shifted there from the onstack + // if count is < 0 then all elements are taken + // the returned list is not cloned from the internal list and shall not be modified in any way (read-only) + if (count < 0) { + // shift all elements + while (this.onstack.size() > 0) { + Long w = this.onstack.firstKey(); + E element = this.onstack.remove(w); + stackElement se = new stackElement(element, w.longValue()); + this.offstack.add(se); + } + return this.offstack; + } + if (size() < count) throw new RuntimeException("list(" + count + ") exceeded avaiable number of elements (" + size() + ")"); + while (this.onstack.size() < count) { + Long w = this.onstack.firstKey(); + E element = this.onstack.remove(w); + stackElement se = new stackElement(element, w.longValue()); + this.offstack.add(se); + } + return this.offstack; + } + + public stackElement get(int hashcode) { + stackElement se = super.get(hashcode); + if (se != null) return se; + Iterator j = this.offstack.iterator(); + while (j.hasNext()) { + se = j.next(); + if (se.element.hashCode() == hashcode) return se; + } + return null; + } + + public stackElement remove(int hashcode) { + stackElement se = super.remove(hashcode); + if (se != null) return se; + for (int j = 0; j < this.offstack.size(); j++) { + se = this.offstack.get(j); + if (se.element.hashCode() == hashcode) { + this.offstack.remove(j); + return se; + } + } + return null; + } +} diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 69f865a30..f5db66c65 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -38,9 +38,12 @@ import java.util.TreeSet; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIVarEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroMSetTools; 
+import de.anomic.kelondro.kelondroSortStack; +import de.anomic.kelondro.kelondroSortStore; import de.anomic.plasma.plasmaSnippetCache.MediaSnippet; import de.anomic.server.serverProfiling; import de.anomic.server.logging.serverLog; @@ -77,8 +80,7 @@ public final class plasmaSearchEvent { public TreeMap IACount; public String IAmaxcounthash, IAneardhthash; private resultWorker[] workerThreads; - private ArrayList resultList; - //private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again + private kelondroSortStore result; private HashMap failedURLs; // a mapping from a urlhash to a fail reason string TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets private long urlRetrievalAllTime; @@ -104,8 +106,7 @@ public final class plasmaSearchEvent { this.snippetComputationAllTime = 0; this.workerThreads = null; this.localSearchThread = null; - this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets - //this.resultListLock = 0; // no locked elements until now + this.result = new kelondroSortStore(-1); // this is the result, enriched with snippets, ranked and ordered by ranking this.failedURLs = new HashMap(); // a map of urls to reason strings where a worker thread tried to work on, but failed. 
// snippets do not need to match with the complete query hashes, @@ -202,7 +203,7 @@ public final class plasmaSearchEvent { ResultEntry resultEntry; yacyURL url; synchronized (rankedCache) { - while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (resultList.size() < (query.neededResults()))) { + while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (result.size() < (query.neededResults()))) { url = uentry.comp().url(); if (url == null) continue; //System.out.println("***DEBUG*** SEARCH RESULT URL=" + url.toNormalform(false, false)); @@ -213,9 +214,7 @@ public final class plasmaSearchEvent { snippetComputationAllTime += resultEntry.snippetComputationTime; // place the result to the result vector - synchronized (resultList) { - resultList.add(resultEntry); - } + result.push(resultEntry, rankedCache.getOrder().cardinal(resultEntry.word())); // add references synchronized (rankedCache) { @@ -223,7 +222,7 @@ public final class plasmaSearchEvent { } } } - serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "offline snippet fetch", resultList.size(), System.currentTimeMillis() - timer)); + serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "offline snippet fetch", result.size(), System.currentTimeMillis() - timer)); } // clean up events @@ -466,8 +465,8 @@ public final class plasmaSearchEvent { // if worker threads had been alive, but did not succeed, start them again to fetch missing links if ((query.onlineSnippetFetch) && (!event.anyWorkerAlive()) && - (event.resultList.size() < query.neededResults() + 10) && - ((event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize()) > event.resultList.size())) { + (event.result.size() < query.neededResults() + 10) && + (event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) { // set new timeout 
event.eventTime = System.currentTimeMillis(); // start worker threads to fetch urls and snippets @@ -508,7 +507,7 @@ public final class plasmaSearchEvent { while (System.currentTimeMillis() < this.timeout) { this.lastLifeSign = System.currentTimeMillis(); - if (resultList.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough + if (result.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough // get next entry page = rankedCache.bestURL(true); @@ -531,21 +530,8 @@ public final class plasmaSearchEvent { //System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring()); // place the result to the result vector - boolean d = false; - synchronized (resultList) { - doublecheck: for (int i = 0; i < resultList.size(); i++) { - if (resultList.get(i).urlcomps.url().hash().equals(resultEntry.urlcomps.url().hash())) { - d = true; - break doublecheck; - } - } - if (!d) { - resultList.add(resultEntry); - } - } - - // add references - if (!d) synchronized (rankedCache) { + if (!result.exists(resultEntry)) { + result.push(resultEntry, rankedCache.getOrder().cardinal(resultEntry.word())); rankedCache.addReferences(resultEntry); } //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url()); @@ -554,10 +540,7 @@ public final class plasmaSearchEvent { } private boolean anyResultWith(String urlhash) { - for (int i = 0; i < resultList.size(); i++) { - if (((ResultEntry) resultList.get(i)).urlentry.hash().equals(urlhash)) return true; - } - return false; + return result.exists(urlhash.hashCode()); } private boolean anyFailureWith(String urlhash) { @@ -576,6 +559,11 @@ public final class plasmaSearchEvent { public ResultEntry oneResult(int item) { // first sleep a while to give accumulation threads a chance to work + if (this.result.sizeStore() > item) { + // we have the wanted result already in the result array .. 
return that + return this.result.element(item).element; + } + if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) || (query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) { // this is a search using remote search threads. Also the local search thread is started as background process @@ -586,45 +574,28 @@ public final class plasmaSearchEvent { } // now wait until as many remote worker threads have finished, as we want to display results while ((this.primarySearchThreads != null) && (this.primarySearchThreads.length > item) && (anyWorkerAlive()) && - ((this.resultList.size() <= item) || (countFinishedRemoteSearch() <= item))) { + ((result.size() <= item) || (countFinishedRemoteSearch() <= item))) { try {Thread.sleep(100);} catch (InterruptedException e) {} } } // finally wait until enough results are there produced from the snippet fetch process - while ((anyWorkerAlive()) && (this.resultList.size() <= item)) { + while ((anyWorkerAlive()) && (result.size() <= item)) { try {Thread.sleep(100);} catch (InterruptedException e) {} } // finally, if there is something, return the result - synchronized (this.resultList) { - // check if we have enough entries - if (this.resultList.size() <= item) return null; - - // fetch the best entry from the resultList, not the entry from item position - // whenever a specific entry was switched in its position and was returned here - // a moving pointer is set to assign that item position as not changeable - int bestpick = item; //postRankingFavourite(item); - if (bestpick != item) { - // switch the elements - ResultEntry buf = (ResultEntry) this.resultList.get(bestpick); - serverLog.logInfo("SEARCH_POSTRANKING", "prefering [" + bestpick + "] " + buf.urlstring() + " over [" + item + "] " + ((ResultEntry) this.resultList.get(item)).urlstring()); - this.resultList.set(bestpick, (ResultEntry) this.resultList.get(item)); - this.resultList.set(item, buf); - } - - //this.resultListLock = item; // lock the element; be prepared to 
return it - return (ResultEntry) this.resultList.get(item); - } + if (this.result.size() <= item) return null; + return this.result.element(item).element; } - - public ArrayList completeResults(long waitingtime) { + + public ArrayList.stackElement> completeResults(long waitingtime) { long timeout = System.currentTimeMillis() + waitingtime; - while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) { + while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) { try {Thread.sleep(100);} catch (InterruptedException e) {} //System.out.println("+++DEBUG-completeResults+++ sleeping " + 200); } - return this.resultList; + return this.result.list(this.result.size()); } boolean secondarySearchStartet = false; @@ -789,7 +760,9 @@ public final class plasmaSearchEvent { if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p); } } - + public int hashCode() { + return urlentry.hash().hashCode(); + } public String hash() { return urlentry.hash(); } @@ -832,8 +805,10 @@ public final class plasmaSearchEvent { public int lapp() { return urlentry.lapp(); } - public indexRWIEntry word() { - return urlentry.word(); + public indexRWIVarEntry word() { + indexRWIEntry word = urlentry.word(); + assert word instanceof indexRWIVarEntry; + return (indexRWIVarEntry) word; } public boolean hasTextSnippet() { return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11); diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index a1283d464..64c7ef740 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -33,7 +33,6 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; -import java.util.TreeMap; import java.util.TreeSet; 
import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -45,6 +44,7 @@ import de.anomic.index.indexRWIVarEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBinSearch; import de.anomic.kelondro.kelondroMScoreCluster; +import de.anomic.kelondro.kelondroSortStack; import de.anomic.server.serverCodings; import de.anomic.server.serverFileUtils; import de.anomic.server.serverProfiling; @@ -54,15 +54,15 @@ public final class plasmaSearchRankingProcess { public static kelondroBinSearch[] ybrTables = null; // block-rank tables private static boolean useYBR = true; - private TreeMap sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String - private HashMap> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries + private kelondroSortStack stack; + private HashMap> doubleDomCache; // key = domhash (6 bytes); value = like stack private HashMap handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process private plasmaSearchQuery query; private int sortorder; private int maxentries; private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize; private indexRWIEntryOrder order; - private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) + private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic private int[] flagcount; // flag counter private TreeSet misses; // contains url-hashes that could not been found in the LURL-DB @@ -74,17 +74,17 @@ public final class plasmaSearchRankingProcess { // attention: if minEntries is too high, this method will not terminate within the maxTime // sortorder: 0 = hash, 1 = url, 2 = ranking 
this.localSearchContainerMaps = null; - this.sortedRWIEntries = new TreeMap(); - this.doubleDomCache = new HashMap>(); + this.stack = new kelondroSortStack(maxentries); + this.doubleDomCache = new HashMap>(); this.handover = new HashMap(); - this.order = null; + this.order = (query == null) ? null : new indexRWIEntryOrder(query.ranking); this.query = query; this.maxentries = maxentries; this.remote_peerCount = 0; this.remote_indexCount = 0; this.remote_resourceSize = 0; this.local_resourceSize = 0; - this.urlhashes = new HashMap(); + this.urlhashes = new HashMap(); this.ref = new kelondroMScoreCluster(); this.misses = new TreeSet(); this.wordIndex = wordIndex; @@ -93,6 +93,10 @@ public final class plasmaSearchRankingProcess { for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;} } + public long ranking(indexRWIVarEntry word) { + return order.cardinal(word); + } + public void execQuery() { long timer = System.currentTimeMillis(); @@ -150,21 +154,21 @@ public final class plasmaSearchRankingProcess { // load url if (sortorder == 0) { - this.sortedRWIEntries.put(ientry.urlHash(), ientry); - this.urlhashes.put(ientry.urlHash(), ientry.urlHash()); + this.stack.push(ientry, ientry.urlHash().hashCode()); + this.urlhashes.put(ientry.urlHash(), ientry.urlHash().hashCode()); } else { uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0); if (uentry == null) { this.misses.add(ientry.urlHash()); } else { u = uentry.comp().url().toNormalform(false, true); - this.sortedRWIEntries.put(u, ientry); - this.urlhashes.put(ientry.urlHash(), u); + this.stack.push(ientry, u.hashCode()); + this.urlhashes.put(ientry.urlHash(), u.hashCode()); } } // interrupt if we have enough - if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop; + if ((query.neededResults() > 0) && (this.misses.size() + this.stack.size() > query.neededResults())) break loop; } // end loop } @@ -182,22 +186,20 @@ public final class 
plasmaSearchRankingProcess { } long timer = System.currentTimeMillis(); - if (this.order == null) { - this.order = new indexRWIEntryOrder(query.ranking); - } + + // normalize entries ArrayList decodedEntries = this.order.normalizeWith(index); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer)); - // normalize entries and get ranking + // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); Iterator i = decodedEntries.iterator(); - indexRWIVarEntry iEntry, l; - long biggestEntry = 0; - //long s0 = System.currentTimeMillis(); + indexRWIVarEntry iEntry; Long r; while (i.hasNext()) { iEntry = i.next(); - if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; + assert (iEntry.urlHash().length() == index.row().primaryKeyLength); + //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; // increase flag counts for (int j = 0; j < 32; j++) { @@ -206,31 +208,32 @@ public final class plasmaSearchRankingProcess { // kick out entries that are too bad according to current findings r = new Long(order.cardinal(iEntry)); - if ((maxentries >= 0) && (sortedRWIEntries.size() >= maxentries) && (r.longValue() > biggestEntry)) continue; + if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue; // check constraints if (!testFlags(iEntry)) continue; + // check document domain if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue; if 
((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue; } - if ((maxentries < 0) || (sortedRWIEntries.size() < maxentries)) { + + // insert + if ((maxentries < 0) || (stack.size() < maxentries)) { + // in case that we don't have enough yet, accept any new entry if (urlhashes.containsKey(iEntry.urlHash())) continue; - while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1); - sortedRWIEntries.put(r, iEntry); + stack.push(iEntry, r); } else { - if (r.longValue() > biggestEntry) { + // if we already have enough entries, insert only such that are necessary to get a better result + if (stack.bottom(r.longValue())) { continue; } else { + // double-check if (urlhashes.containsKey(iEntry.urlHash())) continue; - l = sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey()); - urlhashes.remove(l.urlHash()); - while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1); - sortedRWIEntries.put(r, iEntry); - biggestEntry = order.cardinal(sortedRWIEntries.get(sortedRWIEntries.lastKey())); + stack.push(iEntry, r); } } @@ -271,85 +274,69 @@ public final class plasmaSearchRankingProcess { // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name - private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) { + private synchronized kelondroSortStack.stackElement bestRWI(boolean skipDoubleDom) { // returns from the current RWI list the best entry and removed this entry from the list - Object bestEntry; - TreeMap m; - indexRWIVarEntry rwi; - while (sortedRWIEntries.size() > 0) { - bestEntry = sortedRWIEntries.firstKey(); - rwi = sortedRWIEntries.remove(bestEntry); - if (!skipDoubleDom) return new Object[]{bestEntry, rwi}; + kelondroSortStack m; + kelondroSortStack.stackElement rwi; + while (stack.size() > 0) { + rwi = stack.pop(); + if (!skipDoubleDom) return rwi; // check doubledom - String domhash = 
rwi.urlHash().substring(6); + String domhash = rwi.element.urlHash().substring(6); m = this.doubleDomCache.get(domhash); if (m == null) { // first appearance of dom - m = new TreeMap(); + m = new kelondroSortStack(-1); this.doubleDomCache.put(domhash, m); - return new Object[]{bestEntry, rwi}; + return rwi; } // second appearances of dom - m.put(bestEntry, rwi); + m.push(rwi); } // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache // find best entry from all caches - Iterator> i = this.doubleDomCache.values().iterator(); - bestEntry = null; - Object o; - indexRWIVarEntry bestrwi = null; + Iterator> i = this.doubleDomCache.values().iterator(); + kelondroSortStack.stackElement bestEntry = null; + kelondroSortStack.stackElement o; while (i.hasNext()) { m = i.next(); if (m.size() == 0) continue; if (bestEntry == null) { - bestEntry = m.firstKey(); - bestrwi = m.remove(bestEntry); + bestEntry = m.top(); continue; } - o = m.firstKey(); - rwi = m.remove(o); - if (o instanceof Long) { - if (((Long) o).longValue() < ((Long) bestEntry).longValue()) { - bestEntry = o; - bestrwi = rwi; - } - } - if (o instanceof String) { - if (((String) o).compareTo((String) bestEntry) < 0) { - bestEntry = o; - bestrwi = rwi; - } + o = m.top(); + if (o.weight < bestEntry.weight) { + bestEntry = o; } } - if (bestrwi == null) return null; + if (bestEntry == null) return null; // finally remove the best entry from the doubledom cache - m = this.doubleDomCache.get(bestrwi.urlHash().substring(6)); - m.remove(bestEntry); - return new Object[]{bestEntry, bestrwi}; + m = this.doubleDomCache.get(bestEntry.element.urlHash().substring(6)); + o = m.pop(); + assert o.element.urlHash().equals(bestEntry.element.urlHash()); + return bestEntry; } public synchronized indexURLEntry bestURL(boolean skipDoubleDom) { // returns from the current RWI list the best URL entry and removed this entry from the list - while ((sortedRWIEntries.size() > 0) || (size() > 0)) { - Object[] obrwi = 
bestRWI(skipDoubleDom); - Object bestEntry = obrwi[0]; - indexRWIVarEntry ientry = (indexRWIVarEntry) obrwi[1]; - long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0; - indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking); + while ((stack.size() > 0) || (size() > 0)) { + kelondroSortStack.stackElement obrwi = bestRWI(skipDoubleDom); + indexURLEntry u = wordIndex.loadedURL.load(obrwi.element.urlHash(), obrwi.element, obrwi.weight); if (u != null) { indexURLEntry.Components comp = u.comp(); if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url return u; } - misses.add(ientry.urlHash()); + misses.add(obrwi.element.urlHash()); } return null; } public synchronized int size() { //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size(); - int c = sortedRWIEntries.size(); - Iterator> i = this.doubleDomCache.values().iterator(); + int c = stack.size(); + Iterator> i = this.doubleDomCache.values().iterator(); while (i.hasNext()) c += i.next().size(); return c; } @@ -362,7 +349,7 @@ public final class plasmaSearchRankingProcess { public int filteredCount() { // the number of index entries that are considered as result set - return this.sortedRWIEntries.size(); + return this.stack.size(); } public int getRemoteIndexCount() { @@ -385,14 +372,11 @@ public final class plasmaSearchRankingProcess { return this.local_resourceSize; } - public indexRWIEntry remove(String urlHash) { - Object r = (Long) urlhashes.get(urlHash); - if (r == null) return null; - assert sortedRWIEntries.containsKey(r); - indexRWIEntry iEntry = (indexRWIEntry) sortedRWIEntries.remove(r); + kelondroSortStack.stackElement se = stack.remove(urlHash.hashCode()); + if (se == null) return null; urlhashes.remove(urlHash); - return iEntry; + return se.element; } public Iterator miss() { diff 
--git a/yacy.network.group b/yacy.network.group index 2bf87d354..2b1d57746 100644 --- a/yacy.network.group +++ b/yacy.network.group @@ -1,7 +1,7 @@ # YaCy Network Group Definition # ----------------------------- # This is an addition to the yacy.network.unit configuration file. -# This file is adressed by the network.group.definition property in yacy.init +# This file is addressed by the network.group.definition property in yacy.init # The purpose of a group within a network is that some parts of a network may be managed independently, # while the content of the network stays private for the whole network, mostly for a special purpose. # This file needs to be configured if someone wants to participate with several peers to the network,