- fixed some bugs in ranking computation

- introduced a generalized method to organize ranked results (2 new classes)
- added a post-ranking after snippet-fetch (before: only listed) using the new ranking data structures
- fixed some missing data fields in RWI ranking attributes and corrected the hand-over between data structures

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4498 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent f4c73d8c68
commit 727feb4358

@ -38,6 +38,7 @@ import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroSortStack;
import de.anomic.net.natLib; import de.anomic.net.natLib;
import de.anomic.plasma.plasmaProfiling; import de.anomic.plasma.plasmaProfiling;
import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchEvent;
@ -148,7 +149,7 @@ public final class search {
int indexabstractContainercount = 0; int indexabstractContainercount = 0;
int joincount = 0; int joincount = 0;
plasmaSearchQuery theQuery = null; plasmaSearchQuery theQuery = null;
ArrayList<ResultEntry> accu = null; ArrayList<kelondroSortStack<ResultEntry>.stackElement> accu = null;
plasmaSearchEvent theSearch = null; plasmaSearchEvent theSearch = null;
if ((query.length() == 0) && (abstractSet != null)) { if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts // this is _not_ a normal search, only a request for index abstracts
@ -258,10 +259,10 @@ public final class search {
long timer = System.currentTimeMillis(); long timer = System.currentTimeMillis();
StringBuffer links = new StringBuffer(); StringBuffer links = new StringBuffer();
String resource = null; String resource = null;
plasmaSearchEvent.ResultEntry entry; kelondroSortStack<plasmaSearchEvent.ResultEntry>.stackElement entry;
for (int i = 0; i < accu.size(); i++) { for (int i = 0; i < accu.size(); i++) {
entry = (plasmaSearchEvent.ResultEntry) accu.get(i); entry = accu.get(i);
resource = entry.resource(); resource = entry.element.resource();
if (resource != null) { if (resource != null) {
links.append("resource").append(i).append('=').append(resource).append(serverCore.CRLF_STRING); links.append("resource").append(i).append('=').append(resource).append(serverCore.CRLF_STRING);
} }

@ -86,4 +86,5 @@ public interface indexRWIEntry {
public boolean isOlder(indexRWIEntry other); public boolean isOlder(indexRWIEntry other);
public int hashCode();
} }

@ -31,16 +31,14 @@ import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import de.anomic.kelondro.kelondroAbstractOrder;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry> implements kelondroOrder<indexRWIVarEntry> { public class indexRWIEntryOrder {
private indexRWIVarEntry min, max; private indexRWIVarEntry min, max;
private plasmaSearchRankingProfile ranking; private plasmaSearchRankingProfile ranking;
private kelondroMScoreCluster<String> doms; // collected for "authority" heuristic private kelondroMScoreCluster<String> doms; // collected for "authority" heuristic
@ -69,8 +67,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
mmf0.start(); // fork here mmf0.start(); // fork here
minmaxfinder mmf1 = new minmaxfinder(container, middle, container.size()); minmaxfinder mmf1 = new minmaxfinder(container, middle, container.size());
mmf1.run(); // execute other fork in this thread mmf1.run(); // execute other fork in this thread
if (this.min == null) this.min = mmf1.entryMin; else indexRWIVarEntry.min(this.min, mmf1.entryMin); if (this.min == null) this.min = mmf1.entryMin.clone(); else this.min.min(mmf1.entryMin);
if (this.max == null) this.max = mmf1.entryMax; else indexRWIVarEntry.max(this.max, mmf1.entryMax); if (this.max == null) this.max = mmf1.entryMax.clone(); else this.max.max(mmf1.entryMax);
Map.Entry<String, Integer> entry; Map.Entry<String, Integer> entry;
Iterator<Map.Entry<String, Integer>> di = mmf1.domcount().entrySet().iterator(); Iterator<Map.Entry<String, Integer>> di = mmf1.domcount().entrySet().iterator();
while (di.hasNext()) { while (di.hasNext()) {
@ -78,8 +76,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue()); this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue());
} }
try {mmf0.join();} catch (InterruptedException e) {} // wait for fork thread to finish try {mmf0.join();} catch (InterruptedException e) {} // wait for fork thread to finish
if (this.min == null) this.min = mmf0.entryMin; else indexRWIVarEntry.min(this.min, mmf0.entryMin); if (this.min == null) this.min = mmf0.entryMin.clone(); else this.min.min(mmf0.entryMin);
if (this.max == null) this.max = mmf0.entryMax; else indexRWIVarEntry.max(this.max, mmf0.entryMax); if (this.max == null) this.max = mmf0.entryMax.clone(); else this.max.max(mmf0.entryMax);
di = mmf0.domcount().entrySet().iterator(); di = mmf0.domcount().entrySet().iterator();
while (di.hasNext()) { while (di.hasNext()) {
entry = di.next(); entry = di.next();
@ -93,8 +91,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
// run minmax in one thread // run minmax in one thread
minmaxfinder mmf = new minmaxfinder(container, 0, container.size()); minmaxfinder mmf = new minmaxfinder(container, 0, container.size());
mmf.run(); // execute without multi-threading mmf.run(); // execute without multi-threading
if (this.min == null) this.min = mmf.entryMin; else indexRWIVarEntry.min(this.min, mmf.entryMin); if (this.min == null) this.min = mmf.entryMin.clone(); else this.min.min(mmf.entryMin);
if (this.max == null) this.max = mmf.entryMax; else indexRWIVarEntry.max(this.max, mmf.entryMax); if (this.max == null) this.max = mmf.entryMax.clone(); else this.max.max(mmf.entryMax);
Map.Entry<String, Integer> entry; Map.Entry<String, Integer> entry;
Iterator<Map.Entry<String, Integer>> di = mmf.domcount().entrySet().iterator(); Iterator<Map.Entry<String, Integer>> di = mmf.domcount().entrySet().iterator();
while (di.hasNext()) { while (di.hasNext()) {
@ -109,44 +107,34 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
return result; return result;
} }
public kelondroOrder<indexRWIVarEntry> clone() {
return null;
}
public int authority(String urlHash) { public int authority(String urlHash) {
return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount); return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount);
} }
public long cardinal(byte[] key) {
return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key)));
}
public long cardinal(indexRWIRowEntry t) {
return cardinal(new indexRWIVarEntry(t));
}
public long cardinal(indexRWIVarEntry t) { public long cardinal(indexRWIVarEntry t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords); //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry // the normalizedEntry must be a normalized indexEntry
kelondroBitfield flags = t.flags(); kelondroBitfield flags = t.flags();
long tf = ((max.termFrequency() == min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-min.termFrequency())*256.0)/(max.termFrequency() - min.termFrequency())))) << ranking.coeff_termfrequency);
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
long r = long r =
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength) ((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
+ ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr) + ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr)
+ ((t.urlcomps() == 0) ? 0 : ((256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)) + ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((t.urllength() == 0) ? 0 : ((256 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength)) + ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
+ ((t.posintext() == 0) ? 0 : ((256 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext)) + ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
+ ((t.posofphrase() == 0) ? 0 : ((256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)) + ((max.posofphrase() == min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)
+ ((t.posinphrase() == 0) ? 0 : ((256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)) + ((max.posinphrase() == min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)
+ ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance) + ((max.worddistance() == min.worddistance()) ? 0 : (256 - (((t.worddistance() - min.worddistance() ) << 8) / (max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
+ ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date) + ((max.virtualAge() == min.virtualAge()) ? 0 : (((t.virtualAge() - min.virtualAge() ) << 8) / (max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date)
+ ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle) + ((max.wordsintitle() == min.wordsintitle()) ? 0 : (((t.wordsintitle() - min.wordsintitle() ) << 8) / (max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle)
+ ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext) + ((max.wordsintext() == min.wordsintext()) ? 0 : (((t.wordsintext() - min.wordsintext() ) << 8) / (max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext)
+ ( (((t.phrasesintext()- min.phrasesintext() ) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext) + ((max.phrasesintext() == min.phrasesintext()) ? 0 : (((t.phrasesintext()- min.phrasesintext() ) << 8) / (max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext)
+ ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal) + ((max.llocal() == min.llocal()) ? 0 : (((t.llocal() - min.llocal() ) << 8) / (max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
+ ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother) + ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ (((int)((((t.termFrequency()- min.termFrequency() )*256.0)/ (1 + max.termFrequency()- min.termFrequency()))))<< ranking.coeff_termfrequency) + tf
+ ( authority(t.urlHash()) << ranking.coeff_authority) + (authority(t.urlHash()) << ranking.coeff_authority)
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)) + (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)) + (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)) + (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0))
@ -163,20 +151,6 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
} }
public int compare(indexRWIVarEntry a, indexRWIVarEntry b) {
long ca = cardinal(a);
long cb = cardinal(b);
return (ca > cb) ? 1 : (ca < cb) ? -1 : 0;
}
public String signature() {
return "rx";
}
public boolean wellformed(indexRWIVarEntry a) {
return true;
}
public static class minmaxfinder extends Thread { public static class minmaxfinder extends Thread {
@ -208,8 +182,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++))); iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++)));
this.decodedEntries.add(iEntry); this.decodedEntries.add(iEntry);
// find min/max // find min/max
if (this.entryMin == null) this.entryMin = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.min(this.entryMin, iEntry); if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.max(this.entryMax, iEntry); if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry);
// update domcount // update domcount
dom = iEntry.urlHash().substring(6); dom = iEntry.urlHash().substring(6);
count = (Integer) doms.get(dom); count = (Integer) doms.get(dom);

@ -269,4 +269,7 @@ public final class indexRWIRowEntry implements indexRWIEntry {
return false; return false;
} }
public int hashCode() {
return this.urlHash().hashCode();
}
} }

@ -27,6 +27,7 @@
package de.anomic.index; package de.anomic.index;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaWordIndex;
public class indexRWIVarEntry implements indexRWIEntry { public class indexRWIVarEntry implements indexRWIEntry {
@ -40,7 +41,52 @@ public class indexRWIVarEntry implements indexRWIEntry {
worddistance, wordsintext, wordsintitle; worddistance, wordsintext, wordsintitle;
public double termFrequency; public double termFrequency;
public indexRWIVarEntry(indexRWIEntry e) { public indexRWIVarEntry(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int titleLength, // length of description/length (longer are better?)
int hitcount, // how often appears this word in the text
int wordcount, // total number of words
int phrasecount, // total number of phrases
int posintext, // position of word in all words
int posinphrase, // position of word in its phrase
int posofphrase, // number of the phrase where word appears
long lastmodified, // last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
String language, // (guessed) language of document
char doctype, // type of document
int outlinksSame, // outlinks to same domain
int outlinksOther, // outlinks to other domain
kelondroBitfield flags, // attributes to the url and to the word according the url
int worddistance,
double termfrequency
) {
if ((language == null) || (language.length() != 2)) language = "uk";
int mddlm = plasmaWordIndex.microDateDays(lastmodified);
int mddct = plasmaWordIndex.microDateDays(updatetime);
this.flags = flags;
this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
this.lastModified = lastmodified;
this.language = language;
this.urlHash = urlHash;
this.type = doctype;
this.hitcount = hitcount;
this.llocal = outlinksSame;
this.lother = outlinksOther;
this.phrasesintext = outlinksOther;
this.posintext = posintext;
this.posinphrase = posinphrase;
this.posofphrase = posofphrase;
this.urlcomps = urlComps;
this.urllength = urlLength;
this.virtualAge = mddlm;
this.worddistance = worddistance;
this.wordsintext = wordcount;
this.wordsintitle = titleLength;
this.termFrequency = termfrequency;
}
public indexRWIVarEntry(indexRWIRowEntry e) {
this.flags = e.flags(); this.flags = e.flags();
this.freshUntil = e.freshUntil(); this.freshUntil = e.freshUntil();
this.lastModified = e.lastModified(); this.lastModified = e.lastModified();
@ -60,18 +106,43 @@ public class indexRWIVarEntry implements indexRWIEntry {
this.worddistance = 0; this.worddistance = 0;
this.wordsintext = e.wordsintext(); this.wordsintext = e.wordsintext();
this.wordsintitle = e.wordsintitle(); this.wordsintitle = e.wordsintitle();
this.termFrequency = 0.0; this.termFrequency = e.termFrequency();
}
public indexRWIVarEntry clone() {
indexRWIVarEntry c = new indexRWIVarEntry(
this.urlHash,
this.urllength,
this.urlcomps,
this.wordsintitle,
this.hitcount,
this.wordsintext,
this.phrasesintext,
this.posintext,
this.posinphrase,
this.posofphrase,
this.lastModified,
System.currentTimeMillis(),
this.language,
this.type,
this.llocal,
this.lother,
this.flags,
this.worddistance,
this.termFrequency);
return c;
} }
public void join(indexRWIVarEntry oe) { public void join(indexRWIVarEntry oe) {
// combine the distance // combine the distance
this.worddistance = this.worddistance() + oe.worddistance() + Math.abs(this.posintext() - oe.posintext()); this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
this.posintext = Math.min(this.posintext(), oe.posintext()); this.posintext = Math.min(this.posintext, oe.posintext);
this.posinphrase = (this.posofphrase() == oe.posofphrase()) ? Math.min(this.posinphrase(), oe.posinphrase()) : 0; this.posinphrase = (this.posofphrase == oe.posofphrase) ? Math.min(this.posinphrase, oe.posinphrase) : 0;
this.posofphrase = Math.min(this.posofphrase(), oe.posofphrase()); this.posofphrase = Math.min(this.posofphrase, oe.posofphrase);
// combine term frequency // combine term frequency
this.wordsintext = this.wordsintext() + oe.wordsintext(); this.wordsintext = this.wordsintext + oe.wordsintext;
this.termFrequency = this.termFrequency + oe.termFrequency;
} }
public kelondroBitfield flags() { public kelondroBitfield flags() {
@ -191,66 +262,65 @@ public class indexRWIVarEntry implements indexRWIEntry {
return this.termFrequency; return this.termFrequency;
} }
public static final void min(indexRWIVarEntry t, indexRWIVarEntry other) { public final void min(indexRWIVarEntry other) {
int v; int v;
long w; long w;
double d; double d;
if (t.hitcount() > (v = other.hitcount())) t.hitcount = v; if (this.hitcount > (v = other.hitcount)) this.hitcount = v;
if (t.llocal() > (v = other.llocal())) t.llocal = v; if (this.llocal > (v = other.llocal)) this.llocal = v;
if (t.lother() > (v = other.lother())) t.lother = v; if (this.lother > (v = other.lother)) this.lother = v;
if (t.virtualAge() > (v = other.virtualAge())) t.virtualAge = v; if (this.virtualAge > (v = other.virtualAge)) this.virtualAge = v;
if (t.wordsintext() > (v = other.wordsintext())) t.wordsintext = v; if (this.wordsintext > (v = other.wordsintext)) this.wordsintext = v;
if (t.phrasesintext() > (v = other.phrasesintext())) t.phrasesintext = v; if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v;
if (t.posintext() > (v = other.posintext())) t.posintext = v; if (this.posintext > (v = other.posintext)) this.posintext = v;
if (t.posinphrase() > (v = other.posinphrase())) t.posinphrase = v; if (this.posinphrase > (v = other.posinphrase)) this.posinphrase = v;
if (t.posofphrase() > (v = other.posofphrase())) t.posofphrase = v; if (this.posofphrase > (v = other.posofphrase)) this.posofphrase = v;
if (t.worddistance() > (v = other.worddistance())) t.worddistance = v; if (this.worddistance > (v = other.worddistance)) this.worddistance = v;
if (t.lastModified() > (w = other.lastModified())) t.lastModified = w; if (this.lastModified > (w = other.lastModified)) this.lastModified = w;
if (t.freshUntil() > (w = other.freshUntil())) t.freshUntil = w; if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w;
if (t.urllength() > (v = other.urllength())) t.urllength = v; if (this.urllength > (v = other.urllength)) this.urllength = v;
if (t.urlcomps() > (v = other.urlcomps())) t.urlcomps = v; if (this.urlcomps > (v = other.urlcomps)) this.urlcomps = v;
if (t.wordsintitle() > (v = other.wordsintitle())) t.wordsintitle = v; if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v;
if (t.termFrequency > (d = other.termFrequency())) t.termFrequency = d; if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d;
} }
public static final void max(indexRWIVarEntry t, indexRWIVarEntry other) { public final void max(indexRWIVarEntry other) {
int v; int v;
long w; long w;
double d; double d;
if (t.hitcount() < (v = other.hitcount())) t.hitcount = v; if (this.hitcount < (v = other.hitcount)) this.hitcount = v;
if (t.llocal() < (v = other.llocal())) t.llocal = v; if (this.llocal < (v = other.llocal)) this.llocal = v;
if (t.lother() < (v = other.lother())) t.lother = v; if (this.lother < (v = other.lother)) this.lother = v;
if (t.virtualAge() < (v = other.virtualAge())) t.virtualAge = v; if (this.virtualAge < (v = other.virtualAge)) this.virtualAge = v;
if (t.wordsintext() < (v = other.wordsintext())) t.wordsintext = v; if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v;
if (t.phrasesintext() < (v = other.phrasesintext())) t.phrasesintext = v; if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v;
if (t.posintext() < (v = other.posintext())) t.posintext = v; if (this.posintext < (v = other.posintext)) this.posintext = v;
if (t.posinphrase() < (v = other.posinphrase())) t.posinphrase = v; if (this.posinphrase < (v = other.posinphrase)) this.posinphrase = v;
if (t.posofphrase() < (v = other.posofphrase())) t.posofphrase = v; if (this.posofphrase < (v = other.posofphrase)) this.posofphrase = v;
if (t.worddistance() < (v = other.worddistance())) t.worddistance = v; if (this.worddistance < (v = other.worddistance)) this.worddistance = v;
if (t.lastModified() < (w = other.lastModified())) t.lastModified = w; if (this.lastModified < (w = other.lastModified)) this.lastModified = w;
if (t.freshUntil() < (w = other.freshUntil())) t.freshUntil = w; if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w;
if (t.urllength() < (v = other.urllength())) t.urllength = v; if (this.urllength < (v = other.urllength)) this.urllength = v;
if (t.urlcomps() < (v = other.urlcomps())) t.urlcomps = v; if (this.urlcomps < (v = other.urlcomps)) this.urlcomps = v;
if (t.wordsintitle() < (v = other.wordsintitle())) t.wordsintitle = v; if (this.wordsintitle < (v = other.wordsintitle)) this.wordsintitle = v;
if (t.termFrequency < (d = other.termFrequency())) t.termFrequency = d; if (this.termFrequency < (d = other.termFrequency)) this.termFrequency = d;
} }
public static void join(indexRWIVarEntry ie1, indexRWIEntry ie2) { public void join(indexRWIEntry oe) {
// returns a modified entry of the first argument // joins two entries into one entry
// combine the distance // combine the distance
ie1.worddistance = ie1.worddistance + ((ie2 instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) ie2).worddistance() : 0) + Math.abs(ie1.posintext() - ie2.posintext()); this.worddistance = this.worddistance + ((oe instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) oe).worddistance : 0) + Math.abs(this.posintext() - oe.posintext());
ie1.posintext = Math.min(ie1.posintext(), ie2.posintext()); this.posintext = Math.min(this.posintext, oe.posintext());
ie1.posinphrase = (ie1.posofphrase() == ie2.posofphrase()) ? Math.min(ie1.posinphrase(), ie2.posinphrase()) : 0; this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0;
ie1.posofphrase = Math.min(ie1.posofphrase(), ie2.posofphrase()); this.posofphrase = Math.min(this.posofphrase, oe.posofphrase());
// combine term frequency // combine term frequency
ie1.termFrequency = ie1.termFrequency + ie2.termFrequency(); this.termFrequency = this.termFrequency + oe.termFrequency();
ie1.wordsintext = ie1.wordsintext() + ie2.wordsintext(); this.wordsintext = this.wordsintext + oe.wordsintext();
}
public void join(indexRWIEntry oe) {
join(this, oe);
} }
public int hashCode() {
return this.urlHash.hashCode();
}
} }

@ -0,0 +1,147 @@
// kelondroSortStack.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.02.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
public class kelondroSortStack<E> {

    // Implements a stack where elements 'float' on top of the stack according to a
    // weight value: the element with the SMALLEST weight is always on top.
    // Objects pushed on the stack must implement the hashCode() method to provide
    // a handle for a double-check against repeated insertion of the same element.

    protected TreeMap<Long, E> onstack;  // weight -> element for all objects currently within the stack
    protected HashSet<Integer> instack;  // hash codes of elements that are now, or ever were, on the stack
    protected int maxsize;               // maximum number of entries; <= 0 means unlimited

    /**
     * @param maxsize the maximum number of entries in the stack;
     *                if this is set to -1 (or any value <= 0), the size is unlimited
     */
    public kelondroSortStack(int maxsize) {
        this.onstack = new TreeMap<Long, E>();
        this.instack = new HashSet<Integer>();
        this.maxsize = maxsize;
    }

    public int size() {
        return this.onstack.size();
    }

    public synchronized void push(stackElement se) {
        push(se.element, se.weight);
    }

    public synchronized void push(E element, long weight) {
        // each element is accepted only once; repeated insertions (same hashCode) are ignored
        if (exists(element)) return;
        // manipulate the weight in such a way that it causes no key conflict in the TreeMap
        Long w = Long.valueOf(weight); // Long.valueOf instead of new Long: allows JDK value caching
        while (this.onstack.containsKey(w)) w = Long.valueOf(w.longValue() + 1);
        // put the element on the stack
        this.onstack.put(w, element);
        // register it for the double-check; hash codes are never removed again, so evicted
        // or popped elements cannot re-enter the stack (intended double-check behavior)
        this.instack.add(Integer.valueOf(element.hashCode()));
        // check maximum size of the stack and remove elements (largest weights first) if the stack gets too large
        if (this.maxsize <= 0) return;
        while ((this.onstack.size() > 0) && (this.onstack.size() > this.maxsize)) {
            this.onstack.remove(this.onstack.lastKey());
        }
    }

    public synchronized stackElement top() {
        // returns the element that is currently on top of the stack
        // (smallest weight) without removing it; null if the stack is empty
        if (this.onstack.size() == 0) return null;
        Long w = this.onstack.firstKey();
        E element = this.onstack.get(w);
        return new stackElement(element, w.longValue());
    }

    public synchronized stackElement pop() {
        // removes and returns the element that is currently on top of the
        // stack (smallest weight); null if the stack is empty
        if (this.onstack.size() == 0) return null;
        Long w = this.onstack.firstKey();
        E element = this.onstack.remove(w);
        return new stackElement(element, w.longValue());
    }

    public boolean exists(E element) {
        // uses the hashCode of the element to find out if the element is now, or ever was, on the stack
        return this.instack.contains(Integer.valueOf(element.hashCode()));
    }

    public boolean exists(int hashcode) {
        // uses the hashCode handle to find out if the element is now, or ever was, on the stack
        return this.instack.contains(Integer.valueOf(hashcode));
    }

    public synchronized stackElement get(int hashcode) {
        // linear search for the on-stack element with the given hashCode; returns null if not present
        Iterator<Map.Entry<Long, E>> i = this.onstack.entrySet().iterator();
        Map.Entry<Long, E> entry;
        while (i.hasNext()) {
            entry = i.next();
            if (entry.getValue().hashCode() == hashcode) return new stackElement(entry.getValue(), entry.getKey().longValue());
        }
        return null;
    }

    public synchronized stackElement remove(int hashcode) {
        // linear search for the on-stack element with the given hashCode;
        // removes and returns it, or returns null if not present
        Iterator<Map.Entry<Long, E>> i = this.onstack.entrySet().iterator();
        Map.Entry<Long, E> entry;
        while (i.hasNext()) {
            entry = i.next();
            if (entry.getValue().hashCode() == hashcode) {
                stackElement se = new stackElement(entry.getValue(), entry.getKey().longValue());
                i.remove(); // remove through the iterator; iteration stops right after
                return se;
            }
        }
        return null;
    }

    public synchronized boolean bottom(long weight) {
        // returns true if an element with that weight would land on the bottom of the stack after insertion
        // bugfix: lastKey() throws NoSuchElementException on an empty map; an empty stack has
        // no bottom (the new element would be on top), so return false in that case
        if (this.onstack.size() == 0) return false;
        return weight > this.onstack.lastKey().longValue();
    }

    public class stackElement {
        // carrier for an (element, weight) pair as returned by top/pop/get/remove
        public long weight;
        public E element;
        public stackElement(E element, long weight) {
            this.element = element;
            this.weight = weight;
        }
    }
}

@ -0,0 +1,135 @@
// kelondroSortStore.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.02.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.util.ArrayList;
import java.util.Iterator;
public class kelondroSortStore<E> extends kelondroSortStack<E> {

    // extends the sortStack in such a way that it adds a list where objects, that had
    // been pulled from the stack with pop are listed. Provides access methods to address
    // specific elements in the list.

    private ArrayList<stackElement> offstack; // objects that had been on the stack but had been removed

    public kelondroSortStore(int maxsize) {
        // maxsize limits stack + store together; a value <= 0 means unbounded
        super(maxsize);
        this.offstack = new ArrayList<stackElement>();
    }

    public int size() {
        // total number of elements: those still on the stack plus those popped to the store
        return super.onstack.size() + this.offstack.size();
    }

    public int sizeStore() {
        // number of elements that have already been popped from the stack into the store
        return this.offstack.size();
    }

    public synchronized void push(E element, long weight) {
        super.push(element, weight);
        if (this.maxsize <= 0) return;
        // enforce maxsize over stack AND store together by dropping the
        // worst (largest-weight) entries that are still on the stack
        while ((this.onstack.size() > 0) && (super.onstack.size() + this.offstack.size() > this.maxsize)) {
            this.onstack.remove(this.onstack.lastKey());
        }
    }

    public synchronized stackElement pop() {
        // returns the element that is currently on top of the stack
        // it is removed and added to the offstack list
        // this is exactly the same as element(offstack.size())
        stackElement se = super.pop();
        if (se == null) return null;
        this.offstack.add(se);
        return se;
    }

    public synchronized stackElement element(int position) {
        // returns an element from a specific position. It is either taken from the offstack,
        // or removed from the onstack.
        // The offstack will grow if elements are not from the offstack and present at the onstack.
        if (position < this.offstack.size()) {
            return this.offstack.get(position);
        }
        if (position >= size()) return null; // we don't have that element
        while (position >= this.offstack.size()) shiftOnce();
        return this.offstack.get(position);
    }

    public ArrayList<stackElement> list(int count) {
        // returns the specific amount of entries. If they are not yet present in the offstack, they are shifted there from the onstack
        // if count is < 0 then all elements are taken
        // the returned list is not cloned from the internal list and shall not be modified in any way (read-only)
        if (count < 0) {
            // shift all elements
            while (this.onstack.size() > 0) shiftOnce();
            return this.offstack;
        }
        if (size() < count) throw new RuntimeException("list(" + count + ") exceeded avaiable number of elements (" + size() + ")");
        // bug fix: the shift condition must compare the OFFstack size against count
        // (was this.onstack.size() < count, which drained either everything or nothing)
        while (this.offstack.size() < count) shiftOnce();
        return this.offstack;
    }

    private void shiftOnce() {
        // moves the element with the smallest weight (first key of the sorted
        // onstack map) from the onstack to the end of the offstack list
        Long w = this.onstack.firstKey();
        E element = this.onstack.remove(w);
        this.offstack.add(new stackElement(element, w.longValue()));
    }

    public stackElement get(int hashcode) {
        // looks up an element by the hashCode() of its payload,
        // first on the stack, then in the popped store; null if absent
        stackElement se = super.get(hashcode);
        if (se != null) return se;
        Iterator<stackElement> j = this.offstack.iterator();
        while (j.hasNext()) {
            se = j.next();
            if (se.element.hashCode() == hashcode) return se;
        }
        return null;
    }

    public stackElement remove(int hashcode) {
        // removes an element by the hashCode() of its payload,
        // first from the stack, then from the popped store; null if absent
        stackElement se = super.remove(hashcode);
        if (se != null) return se;
        for (int j = 0; j < this.offstack.size(); j++) {
            se = this.offstack.get(j);
            if (se.element.hashCode() == hashcode) {
                this.offstack.remove(j);
                return se;
            }
        }
        return null;
    }
}

@ -38,9 +38,12 @@ import java.util.TreeSet;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIVarEntry;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroSortStack;
import de.anomic.kelondro.kelondroSortStore;
import de.anomic.plasma.plasmaSnippetCache.MediaSnippet; import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
import de.anomic.server.serverProfiling; import de.anomic.server.serverProfiling;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
@ -77,8 +80,7 @@ public final class plasmaSearchEvent {
public TreeMap<String, Integer> IACount; public TreeMap<String, Integer> IACount;
public String IAmaxcounthash, IAneardhthash; public String IAmaxcounthash, IAneardhthash;
private resultWorker[] workerThreads; private resultWorker[] workerThreads;
private ArrayList<ResultEntry> resultList; private kelondroSortStore<ResultEntry> result;
//private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again
private HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string private HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
private long urlRetrievalAllTime; private long urlRetrievalAllTime;
@ -104,8 +106,7 @@ public final class plasmaSearchEvent {
this.snippetComputationAllTime = 0; this.snippetComputationAllTime = 0;
this.workerThreads = null; this.workerThreads = null;
this.localSearchThread = null; this.localSearchThread = null;
this.resultList = new ArrayList<ResultEntry>(10); // this is the result set which is filled up with search results, enriched with snippets this.result = new kelondroSortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
//this.resultListLock = 0; // no locked elements until now
this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed. this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
// snippets do not need to match with the complete query hashes, // snippets do not need to match with the complete query hashes,
@ -202,7 +203,7 @@ public final class plasmaSearchEvent {
ResultEntry resultEntry; ResultEntry resultEntry;
yacyURL url; yacyURL url;
synchronized (rankedCache) { synchronized (rankedCache) {
while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (resultList.size() < (query.neededResults()))) { while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (result.size() < (query.neededResults()))) {
url = uentry.comp().url(); url = uentry.comp().url();
if (url == null) continue; if (url == null) continue;
//System.out.println("***DEBUG*** SEARCH RESULT URL=" + url.toNormalform(false, false)); //System.out.println("***DEBUG*** SEARCH RESULT URL=" + url.toNormalform(false, false));
@ -213,9 +214,7 @@ public final class plasmaSearchEvent {
snippetComputationAllTime += resultEntry.snippetComputationTime; snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector // place the result to the result vector
synchronized (resultList) { result.push(resultEntry, rankedCache.getOrder().cardinal(resultEntry.word()));
resultList.add(resultEntry);
}
// add references // add references
synchronized (rankedCache) { synchronized (rankedCache) {
@ -223,7 +222,7 @@ public final class plasmaSearchEvent {
} }
} }
} }
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "offline snippet fetch", resultList.size(), System.currentTimeMillis() - timer)); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "offline snippet fetch", result.size(), System.currentTimeMillis() - timer));
} }
// clean up events // clean up events
@ -466,8 +465,8 @@ public final class plasmaSearchEvent {
// if worker threads had been alive, but did not succeed, start them again to fetch missing links // if worker threads had been alive, but did not succeed, start them again to fetch missing links
if ((query.onlineSnippetFetch) && if ((query.onlineSnippetFetch) &&
(!event.anyWorkerAlive()) && (!event.anyWorkerAlive()) &&
(event.resultList.size() < query.neededResults() + 10) && (event.result.size() < query.neededResults() + 10) &&
((event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize()) > event.resultList.size())) { (event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) {
// set new timeout // set new timeout
event.eventTime = System.currentTimeMillis(); event.eventTime = System.currentTimeMillis();
// start worker threads to fetch urls and snippets // start worker threads to fetch urls and snippets
@ -508,7 +507,7 @@ public final class plasmaSearchEvent {
while (System.currentTimeMillis() < this.timeout) { while (System.currentTimeMillis() < this.timeout) {
this.lastLifeSign = System.currentTimeMillis(); this.lastLifeSign = System.currentTimeMillis();
if (resultList.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough if (result.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough
// get next entry // get next entry
page = rankedCache.bestURL(true); page = rankedCache.bestURL(true);
@ -531,21 +530,8 @@ public final class plasmaSearchEvent {
//System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring()); //System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring());
// place the result to the result vector // place the result to the result vector
boolean d = false; if (!result.exists(resultEntry)) {
synchronized (resultList) { result.push(resultEntry, rankedCache.getOrder().cardinal(resultEntry.word()));
doublecheck: for (int i = 0; i < resultList.size(); i++) {
if (resultList.get(i).urlcomps.url().hash().equals(resultEntry.urlcomps.url().hash())) {
d = true;
break doublecheck;
}
}
if (!d) {
resultList.add(resultEntry);
}
}
// add references
if (!d) synchronized (rankedCache) {
rankedCache.addReferences(resultEntry); rankedCache.addReferences(resultEntry);
} }
//System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url()); //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
@ -554,10 +540,7 @@ public final class plasmaSearchEvent {
} }
private boolean anyResultWith(String urlhash) { private boolean anyResultWith(String urlhash) {
for (int i = 0; i < resultList.size(); i++) { return result.exists(urlhash.hashCode());
if (((ResultEntry) resultList.get(i)).urlentry.hash().equals(urlhash)) return true;
}
return false;
} }
private boolean anyFailureWith(String urlhash) { private boolean anyFailureWith(String urlhash) {
@ -576,6 +559,11 @@ public final class plasmaSearchEvent {
public ResultEntry oneResult(int item) { public ResultEntry oneResult(int item) {
// first sleep a while to give accumulation threads a chance to work // first sleep a while to give accumulation threads a chance to work
if (this.result.sizeStore() > item) {
// we have the wanted result already in the result array .. return that
return this.result.element(item).element;
}
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) || if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) { (query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
// this is a search using remote search threads. Also the local search thread is started as background process // this is a search using remote search threads. Also the local search thread is started as background process
@ -586,45 +574,28 @@ public final class plasmaSearchEvent {
} }
// now wait until as many remote worker threads have finished, as we want to display results // now wait until as many remote worker threads have finished, as we want to display results
while ((this.primarySearchThreads != null) && (this.primarySearchThreads.length > item) && (anyWorkerAlive()) && while ((this.primarySearchThreads != null) && (this.primarySearchThreads.length > item) && (anyWorkerAlive()) &&
((this.resultList.size() <= item) || (countFinishedRemoteSearch() <= item))) { ((result.size() <= item) || (countFinishedRemoteSearch() <= item))) {
try {Thread.sleep(100);} catch (InterruptedException e) {} try {Thread.sleep(100);} catch (InterruptedException e) {}
} }
} }
// finally wait until enough results are there produced from the snippet fetch process // finally wait until enough results are there produced from the snippet fetch process
while ((anyWorkerAlive()) && (this.resultList.size() <= item)) { while ((anyWorkerAlive()) && (result.size() <= item)) {
try {Thread.sleep(100);} catch (InterruptedException e) {} try {Thread.sleep(100);} catch (InterruptedException e) {}
} }
// finally, if there is something, return the result // finally, if there is something, return the result
synchronized (this.resultList) { if (this.result.size() <= item) return null;
// check if we have enough entries return this.result.element(item).element;
if (this.resultList.size() <= item) return null;
// fetch the best entry from the resultList, not the entry from item position
// whenever a specific entry was switched in its position and was returned here
// a moving pointer is set to assign that item position as not changeable
int bestpick = item; //postRankingFavourite(item);
if (bestpick != item) {
// switch the elements
ResultEntry buf = (ResultEntry) this.resultList.get(bestpick);
serverLog.logInfo("SEARCH_POSTRANKING", "prefering [" + bestpick + "] " + buf.urlstring() + " over [" + item + "] " + ((ResultEntry) this.resultList.get(item)).urlstring());
this.resultList.set(bestpick, (ResultEntry) this.resultList.get(item));
this.resultList.set(item, buf);
}
//this.resultListLock = item; // lock the element; be prepared to return it
return (ResultEntry) this.resultList.get(item);
}
} }
public ArrayList<ResultEntry> completeResults(long waitingtime) { public ArrayList<kelondroSortStack<ResultEntry>.stackElement> completeResults(long waitingtime) {
long timeout = System.currentTimeMillis() + waitingtime; long timeout = System.currentTimeMillis() + waitingtime;
while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) { while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
try {Thread.sleep(100);} catch (InterruptedException e) {} try {Thread.sleep(100);} catch (InterruptedException e) {}
//System.out.println("+++DEBUG-completeResults+++ sleeping " + 200); //System.out.println("+++DEBUG-completeResults+++ sleeping " + 200);
} }
return this.resultList; return this.result.list(this.result.size());
} }
boolean secondarySearchStartet = false; boolean secondarySearchStartet = false;
@ -789,7 +760,9 @@ public final class plasmaSearchEvent {
if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p); if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p);
} }
} }
public int hashCode() {
return urlentry.hash().hashCode();
}
public String hash() { public String hash() {
return urlentry.hash(); return urlentry.hash();
} }
@ -832,8 +805,10 @@ public final class plasmaSearchEvent {
public int lapp() { public int lapp() {
return urlentry.lapp(); return urlentry.lapp();
} }
public indexRWIEntry word() { public indexRWIVarEntry word() {
return urlentry.word(); indexRWIEntry word = urlentry.word();
assert word instanceof indexRWIVarEntry;
return (indexRWIVarEntry) word;
} }
public boolean hasTextSnippet() { public boolean hasTextSnippet() {
return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11); return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11);

@ -33,7 +33,6 @@ import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -45,6 +44,7 @@ import de.anomic.index.indexRWIVarEntry;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBinSearch; import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroSortStack;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.server.serverProfiling; import de.anomic.server.serverProfiling;
@ -54,15 +54,15 @@ public final class plasmaSearchRankingProcess {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true; private static boolean useYBR = true;
private TreeMap<Object, indexRWIVarEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String private kelondroSortStack<indexRWIVarEntry> stack;
private HashMap<String, TreeMap<Object, indexRWIVarEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries private HashMap<String, kelondroSortStack<indexRWIVarEntry>> doubleDomCache; // key = domhash (6 bytes); value = like stack
private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
private plasmaSearchQuery query; private plasmaSearchQuery query;
private int sortorder; private int sortorder;
private int maxentries; private int maxentries;
private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize; private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
private indexRWIEntryOrder order; private indexRWIEntryOrder order;
private HashMap<String, Object> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private HashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private kelondroMScoreCluster<String> ref; // reference score computation for the commonSense heuristic private kelondroMScoreCluster<String> ref; // reference score computation for the commonSense heuristic
private int[] flagcount; // flag counter private int[] flagcount; // flag counter
private TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB private TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
@ -74,17 +74,17 @@ public final class plasmaSearchRankingProcess {
// attention: if minEntries is too high, this method will not terminate within the maxTime // attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking // sortorder: 0 = hash, 1 = url, 2 = ranking
this.localSearchContainerMaps = null; this.localSearchContainerMaps = null;
this.sortedRWIEntries = new TreeMap<Object, indexRWIVarEntry>(); this.stack = new kelondroSortStack<indexRWIVarEntry>(maxentries);
this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIVarEntry>>(); this.doubleDomCache = new HashMap<String, kelondroSortStack<indexRWIVarEntry>>();
this.handover = new HashMap<String, String>(); this.handover = new HashMap<String, String>();
this.order = null; this.order = (query == null) ? null : new indexRWIEntryOrder(query.ranking);
this.query = query; this.query = query;
this.maxentries = maxentries; this.maxentries = maxentries;
this.remote_peerCount = 0; this.remote_peerCount = 0;
this.remote_indexCount = 0; this.remote_indexCount = 0;
this.remote_resourceSize = 0; this.remote_resourceSize = 0;
this.local_resourceSize = 0; this.local_resourceSize = 0;
this.urlhashes = new HashMap<String, Object>(); this.urlhashes = new HashMap<String, Integer>();
this.ref = new kelondroMScoreCluster<String>(); this.ref = new kelondroMScoreCluster<String>();
this.misses = new TreeSet<String>(); this.misses = new TreeSet<String>();
this.wordIndex = wordIndex; this.wordIndex = wordIndex;
@ -93,6 +93,10 @@ public final class plasmaSearchRankingProcess {
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;} for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
} }
public long ranking(indexRWIVarEntry word) {
return order.cardinal(word);
}
public void execQuery() { public void execQuery() {
long timer = System.currentTimeMillis(); long timer = System.currentTimeMillis();
@ -150,21 +154,21 @@ public final class plasmaSearchRankingProcess {
// load url // load url
if (sortorder == 0) { if (sortorder == 0) {
this.sortedRWIEntries.put(ientry.urlHash(), ientry); this.stack.push(ientry, ientry.urlHash().hashCode());
this.urlhashes.put(ientry.urlHash(), ientry.urlHash()); this.urlhashes.put(ientry.urlHash(), ientry.urlHash().hashCode());
} else { } else {
uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0); uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) { if (uentry == null) {
this.misses.add(ientry.urlHash()); this.misses.add(ientry.urlHash());
} else { } else {
u = uentry.comp().url().toNormalform(false, true); u = uentry.comp().url().toNormalform(false, true);
this.sortedRWIEntries.put(u, ientry); this.stack.push(ientry, u.hashCode());
this.urlhashes.put(ientry.urlHash(), u); this.urlhashes.put(ientry.urlHash(), u.hashCode());
} }
} }
// interrupt if we have enough // interrupt if we have enough
if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop; if ((query.neededResults() > 0) && (this.misses.size() + this.stack.size() > query.neededResults())) break loop;
} // end loop } // end loop
} }
@ -182,22 +186,20 @@ public final class plasmaSearchRankingProcess {
} }
long timer = System.currentTimeMillis(); long timer = System.currentTimeMillis();
if (this.order == null) {
this.order = new indexRWIEntryOrder(query.ranking); // normalize entries
}
ArrayList<indexRWIVarEntry> decodedEntries = this.order.normalizeWith(index); ArrayList<indexRWIVarEntry> decodedEntries = this.order.normalizeWith(index);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer)); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer));
// normalize entries and get ranking // iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis(); timer = System.currentTimeMillis();
Iterator<indexRWIVarEntry> i = decodedEntries.iterator(); Iterator<indexRWIVarEntry> i = decodedEntries.iterator();
indexRWIVarEntry iEntry, l; indexRWIVarEntry iEntry;
long biggestEntry = 0;
//long s0 = System.currentTimeMillis();
Long r; Long r;
while (i.hasNext()) { while (i.hasNext()) {
iEntry = i.next(); iEntry = i.next();
if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; assert (iEntry.urlHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
// increase flag counts // increase flag counts
for (int j = 0; j < 32; j++) { for (int j = 0; j < 32; j++) {
@ -206,31 +208,32 @@ public final class plasmaSearchRankingProcess {
// kick out entries that are too bad according to current findings // kick out entries that are too bad according to current findings
r = new Long(order.cardinal(iEntry)); r = new Long(order.cardinal(iEntry));
if ((maxentries >= 0) && (sortedRWIEntries.size() >= maxentries) && (r.longValue() > biggestEntry)) continue; if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;
// check constraints // check constraints
if (!testFlags(iEntry)) continue; if (!testFlags(iEntry)) continue;
// check document domain
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
} }
if ((maxentries < 0) || (sortedRWIEntries.size() < maxentries)) {
// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
if (urlhashes.containsKey(iEntry.urlHash())) continue; if (urlhashes.containsKey(iEntry.urlHash())) continue;
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1); stack.push(iEntry, r);
sortedRWIEntries.put(r, iEntry);
} else { } else {
if (r.longValue() > biggestEntry) { // if we already have enough entries, insert only such that are necessary to get a better result
if (stack.bottom(r.longValue())) {
continue; continue;
} else { } else {
// double-check
if (urlhashes.containsKey(iEntry.urlHash())) continue; if (urlhashes.containsKey(iEntry.urlHash())) continue;
l = sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey()); stack.push(iEntry, r);
urlhashes.remove(l.urlHash());
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
sortedRWIEntries.put(r, iEntry);
biggestEntry = order.cardinal(sortedRWIEntries.get(sortedRWIEntries.lastKey()));
} }
} }
@ -271,85 +274,69 @@ public final class plasmaSearchRankingProcess {
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) { private synchronized kelondroSortStack<indexRWIVarEntry>.stackElement bestRWI(boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removed this entry from the list // returns from the current RWI list the best entry and removed this entry from the list
Object bestEntry; kelondroSortStack<indexRWIVarEntry> m;
TreeMap<Object, indexRWIVarEntry> m; kelondroSortStack<indexRWIVarEntry>.stackElement rwi;
indexRWIVarEntry rwi; while (stack.size() > 0) {
while (sortedRWIEntries.size() > 0) { rwi = stack.pop();
bestEntry = sortedRWIEntries.firstKey(); if (!skipDoubleDom) return rwi;
rwi = sortedRWIEntries.remove(bestEntry);
if (!skipDoubleDom) return new Object[]{bestEntry, rwi};
// check doubledom // check doubledom
String domhash = rwi.urlHash().substring(6); String domhash = rwi.element.urlHash().substring(6);
m = this.doubleDomCache.get(domhash); m = this.doubleDomCache.get(domhash);
if (m == null) { if (m == null) {
// first appearance of dom // first appearance of dom
m = new TreeMap<Object, indexRWIVarEntry>(); m = new kelondroSortStack<indexRWIVarEntry>(-1);
this.doubleDomCache.put(domhash, m); this.doubleDomCache.put(domhash, m);
return new Object[]{bestEntry, rwi}; return rwi;
} }
// second appearances of dom // second appearances of dom
m.put(bestEntry, rwi); m.push(rwi);
} }
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
// find best entry from all caches // find best entry from all caches
Iterator<TreeMap<Object, indexRWIVarEntry>> i = this.doubleDomCache.values().iterator(); Iterator<kelondroSortStack<indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
bestEntry = null; kelondroSortStack<indexRWIVarEntry>.stackElement bestEntry = null;
Object o; kelondroSortStack<indexRWIVarEntry>.stackElement o;
indexRWIVarEntry bestrwi = null;
while (i.hasNext()) { while (i.hasNext()) {
m = i.next(); m = i.next();
if (m.size() == 0) continue; if (m.size() == 0) continue;
if (bestEntry == null) { if (bestEntry == null) {
bestEntry = m.firstKey(); bestEntry = m.top();
bestrwi = m.remove(bestEntry);
continue; continue;
} }
o = m.firstKey(); o = m.top();
rwi = m.remove(o); if (o.weight < bestEntry.weight) {
if (o instanceof Long) { bestEntry = o;
if (((Long) o).longValue() < ((Long) bestEntry).longValue()) {
bestEntry = o;
bestrwi = rwi;
}
}
if (o instanceof String) {
if (((String) o).compareTo((String) bestEntry) < 0) {
bestEntry = o;
bestrwi = rwi;
}
} }
} }
if (bestrwi == null) return null; if (bestEntry == null) return null;
// finally remove the best entry from the doubledom cache // finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(bestrwi.urlHash().substring(6)); m = this.doubleDomCache.get(bestEntry.element.urlHash().substring(6));
m.remove(bestEntry); o = m.pop();
return new Object[]{bestEntry, bestrwi}; assert o.element.urlHash().equals(bestEntry.element.urlHash());
return bestEntry;
} }
public synchronized indexURLEntry bestURL(boolean skipDoubleDom) { public synchronized indexURLEntry bestURL(boolean skipDoubleDom) {
// returns from the current RWI list the best URL entry and removed this entry from the list // returns from the current RWI list the best URL entry and removed this entry from the list
while ((sortedRWIEntries.size() > 0) || (size() > 0)) { while ((stack.size() > 0) || (size() > 0)) {
Object[] obrwi = bestRWI(skipDoubleDom); kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom);
Object bestEntry = obrwi[0]; indexURLEntry u = wordIndex.loadedURL.load(obrwi.element.urlHash(), obrwi.element, obrwi.weight);
indexRWIVarEntry ientry = (indexRWIVarEntry) obrwi[1];
long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0;
indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking);
if (u != null) { if (u != null) {
indexURLEntry.Components comp = u.comp(); indexURLEntry.Components comp = u.comp();
if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url
return u; return u;
} }
misses.add(ientry.urlHash()); misses.add(obrwi.element.urlHash());
} }
return null; return null;
} }
public synchronized int size() { public synchronized int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size(); //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = sortedRWIEntries.size(); int c = stack.size();
Iterator<TreeMap<Object, indexRWIVarEntry>> i = this.doubleDomCache.values().iterator(); Iterator<kelondroSortStack<indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
while (i.hasNext()) c += i.next().size(); while (i.hasNext()) c += i.next().size();
return c; return c;
} }
@ -362,7 +349,7 @@ public final class plasmaSearchRankingProcess {
public int filteredCount() { public int filteredCount() {
// the number of index entries that are considered as result set // the number of index entries that are considered as result set
return this.sortedRWIEntries.size(); return this.stack.size();
} }
public int getRemoteIndexCount() { public int getRemoteIndexCount() {
@ -385,14 +372,11 @@ public final class plasmaSearchRankingProcess {
return this.local_resourceSize; return this.local_resourceSize;
} }
public indexRWIEntry remove(String urlHash) { public indexRWIEntry remove(String urlHash) {
Object r = (Long) urlhashes.get(urlHash); kelondroSortStack<indexRWIVarEntry>.stackElement se = stack.remove(urlHash.hashCode());
if (r == null) return null; if (se == null) return null;
assert sortedRWIEntries.containsKey(r);
indexRWIEntry iEntry = (indexRWIEntry) sortedRWIEntries.remove(r);
urlhashes.remove(urlHash); urlhashes.remove(urlHash);
return iEntry; return se.element;
} }
public Iterator<String> miss() { public Iterator<String> miss() {

@ -1,7 +1,7 @@
# YaCy Network Group Definition # YaCy Network Group Definition
# ----------------------------- # -----------------------------
# This is an addition to the yacy.network.unit configuration file. # This is an addition to the yacy.network.unit configuration file.
# This file is adressed by the network.group.definition property in yacy.init # This file is addressed by the network.group.definition property in yacy.init
# The purpose of a group within a network is that some parts of a network may be managed independently, # The purpose of a group within a network is that some parts of a network may be managed independently,
# while the content of the network stays private for the whole network, mostly for a special purpose. # while the content of the network stays private for the whole network, mostly for a special purpose.
# This file needs to be configured if someone wants to participate with several peers to the network, # This file needs to be configured if someone wants to participate with several peers to the network,

Loading…
Cancel
Save