added normalization to search attributes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1528 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 7bd61ab0e5
commit 0bc2aaeb42

@ -64,6 +64,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
private plasmaCrawlLURL urlStore;
private plasmaSnippetCache snippetCache;
private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results
private int rcGlobalCount;
private plasmaSearchProfile profileLocal, profileGlobal;
private yacySearch[] searchThreads;
@ -75,6 +76,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.snippetCache = snippetCache;
this.rcLocal = new plasmaWordIndexEntryContainer(null);
this.rcGlobal = new plasmaWordIndexEntryContainer(null);
this.rcGlobalCount = 0;
if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
this.profileLocal = new plasmaSearchProfile(4 * query.maximumTime / 10, query.wantedResults);
this.profileGlobal = new plasmaSearchProfile(6 * query.maximumTime / 10, query.wantedResults);
@ -118,7 +120,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
plasmaSearchResult result = order();
result.globalContributions = globalContributions;
result.localContributions = rcLocal.size();
flushResults();
flushGlobalResults(); // make these values available for immediate next search
// flush results in a separate thread
this.start(); // start to flush results
@ -184,7 +186,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// wait until wanted delay passed or wanted result appeared
while (System.currentTimeMillis() < timeout) {
// check if all threads have been finished or results so far are enough
if (rcGlobal.size() >= profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) * 3) break; // we have enough
if (rcGlobal.size() >= profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) * 5) break; // we have enough
if (yacySearch.remainingWaiting(searchThreads) == 0) break; // we cannot expect more
// wait a little time ..
try {Thread.sleep(100);} catch (InterruptedException e) {}
@ -264,10 +266,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// this method waits until all threads are finished
int remaining;
int allcount = 0;
long starttime = System.currentTimeMillis();
while ((searchThreads != null) && ((remaining = yacySearch.remainingWaiting(searchThreads)) > 0)) {
allcount += flushResults();
flushGlobalResults();
// wait a little bit before trying again
try {Thread.sleep(3000);} catch (InterruptedException e) {}
@ -279,7 +280,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
log.logFine("FINISHED FLUSH RESULTS PROCESS for query " + query.hashes(","));
}
serverLog.logFine("PLASMA", "FINISHED FLUSHING " + allcount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords);
serverLog.logFine("PLASMA", "FINISHED FLUSHING " + rcGlobalCount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords);
// finally delete the temporary index
rcGlobal = null;
@ -287,7 +288,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
flushThreads.remove(this);
}
public int flushResults() {
public void flushGlobalResults() {
// flush the rcGlobal as much as is there so far
// this must be called sometime after search results had been computed
int count = 0;
@ -306,7 +307,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
rcGlobal.clear();
}
}
return count;
rcGlobalCount += count;
}
}

@ -56,6 +56,7 @@ public final class plasmaSearchPreOrder {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true;
private plasmaWordIndexEntry entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query;
@ -96,6 +97,8 @@ public final class plasmaSearchPreOrder {
}
/**
 * Creates an empty pre-ordering for the given query.
 * The accumulator map starts out empty and the normalization bounds are unset.
 *
 * @param query the search query; addEntry reads its order array when computing a ranking
 */
public plasmaSearchPreOrder(plasmaSearchQuery query) {
    this.query = query;
    this.pageAcc = new TreeMap();
    // the bounds are filled from the first entry that addContainer sees
    this.entryMin = null;
    this.entryMax = null;
}
@ -116,6 +119,7 @@ public final class plasmaSearchPreOrder {
return (plasmaWordIndexEntry) pageAcc.remove(top);
}
/*
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
Iterator i = container.entries();
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
@ -142,9 +146,48 @@ public final class plasmaSearchPreOrder {
ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry);
}
*/
/**
 * Feeds the entries of a container into this pre-ordering in two passes.
 * The first pass widens entryMin/entryMax with every entry it visits; the
 * second pass normalizes exactly those entries against the bounds and hands
 * them to addEntry.
 *
 * Only the first pass honours the time limit; the second pass always
 * processes every entry that the first pass counted.
 *
 * @param container the container whose entries are added
 * @param maxTime   time budget in milliseconds for the first pass; a negative
 *                  value means no limit
 */
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
    final long deadline = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;

    // pass 1: collect min/max over the visited entries to obtain the normalization limits
    Iterator it = container.entries();
    int scanned = 0;
    while (it.hasNext() && System.currentTimeMillis() <= deadline) {
        final plasmaWordIndexEntry candidate = (plasmaWordIndexEntry) it.next();
        if (entryMin == null) {
            entryMin = (plasmaWordIndexEntry) candidate.clone();
        } else {
            entryMin.min(candidate);
        }
        if (entryMax == null) {
            entryMax = (plasmaWordIndexEntry) candidate.clone();
        } else {
            entryMax.max(candidate);
        }
        scanned++;
    }

    // pass 2: re-iterate from the start and normalize the same number of entries
    it = container.entries();
    int remaining = scanned;
    while (remaining-- > 0) {
        final plasmaWordIndexEntry candidate = (plasmaWordIndexEntry) it.next();
        // normalize() is invoked on the object the iterator returned, then the entry is ranked
        candidate.normalize(entryMin, entryMax);
        addEntry(candidate);
    }
}
/**
 * Computes a ranking number for an index entry and stores the entry in pageAcc.
 * The map key is the 16-digit hex encoding of the ranking followed by the
 * entry's URL hash.
 *
 * The first three positions of query.order contribute quality, virtual age or
 * block rank, shifted left by 12, 8 and 4 bits respectively. Text position,
 * word distance and hit count are then added with fixed shifts; a value of 0
 * in any of those three attributes contributes nothing.
 *
 * @param indexEntry the entry to rank and store
 */
public void addEntry(plasmaWordIndexEntry indexEntry) {
    long ranking = 0;

    // ordered criteria: an earlier position in query.order gets a larger shift
    for (int i = 0; i < 3; i++) {
        final int shift = 4 * (3 - i);
        if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) {
            ranking += indexEntry.getQuality() << shift;
        } else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) {
            ranking += indexEntry.getVirtualAge() << shift;
        } else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) {
            ranking += ybr_p(indexEntry.getUrlHash()) << shift;
        }
    }

    // fixed criteria: position and distance are inverted (255 - value), hit count is not
    if (indexEntry.posintext() != 0) {
        ranking += (255 - indexEntry.posintext()) << 11;
    }
    if (indexEntry.worddistance() != 0) {
        ranking += (255 - indexEntry.worddistance()) << 10;
    }
    if (indexEntry.hitcount() != 0) {
        ranking += indexEntry.hitcount() << 9;
    }

    final String key = serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash();
    pageAcc.put(key, indexEntry);
}
/**
 * Returns the inverted and scaled block rank of a URL, used as a ranking summand.
 * The result is 16 * (16 - ybr(urlHash)).
 *
 * The method body previously held two consecutive return statements; the
 * second one was unreachable, which Java rejects at compile time. Only the
 * scaled form is kept.
 * NOTE(review): presumably ybr() yields 0..16, so this gives 0..256 - confirm against ybr().
 *
 * @param urlHash the URL hash to look up
 * @return the inverted block rank multiplied by 16
 */
public static int ybr_p(String urlHash) {
    return 16 * (16 - ybr(urlHash));
}
public static int ybr(String urlHash) {

@ -56,7 +56,7 @@ import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
// import de.anomic.server.logging.serverLog;
public final class plasmaWordIndexEntry {
public final class plasmaWordIndexEntry implements Cloneable {
// a wordEntry can be filled in either of two ways:
// by the discrete values of the entry
@ -297,6 +297,10 @@ public final class plasmaWordIndexEntry {
this.localflag = pr.getProperty("f", ""+LT_LOCAL).charAt(0);
}
/**
 * Creates a copy of this entry by passing its external form to the
 * constructor, rather than by calling super.clone().
 *
 * @return a new plasmaWordIndexEntry built from toExternalForm()
 */
public Object clone() {
    return new plasmaWordIndexEntry(toExternalForm());
}
public String toEncodedForm() {
// attention: this does NOT integrate the URL hash into the encoding
// if you need a complete dump, use toExternalForm()
@ -348,6 +352,42 @@ public final class plasmaWordIndexEntry {
this.wordcount = (this.wordcount + oe.wordcount) / 2;
}
/**
 * Lowers each numeric attribute of this entry to the smaller of its own value
 * and the corresponding value of the other entry (attribute-wise minimum).
 *
 * @param other the entry to compare against; it is not modified
 */
public void min(plasmaWordIndexEntry other) {
    this.hitcount     = Math.min(this.hitcount,     other.hitcount);
    this.wordcount    = Math.min(this.wordcount,    other.wordcount);
    this.phrasecount  = Math.min(this.phrasecount,  other.phrasecount);
    this.posintext    = Math.min(this.posintext,    other.posintext);
    this.posinphrase  = Math.min(this.posinphrase,  other.posinphrase);
    this.posofphrase  = Math.min(this.posofphrase,  other.posofphrase);
    this.worddistance = Math.min(this.worddistance, other.worddistance);
    this.lastModified = Math.min(this.lastModified, other.lastModified);
    this.quality      = Math.min(this.quality,      other.quality);
}
/**
 * Raises each numeric attribute of this entry to the larger of its own value
 * and the corresponding value of the other entry (attribute-wise maximum).
 *
 * @param other the entry to compare against; it is not modified
 */
public void max(plasmaWordIndexEntry other) {
    this.hitcount     = Math.max(this.hitcount,     other.hitcount);
    this.wordcount    = Math.max(this.wordcount,    other.wordcount);
    this.phrasecount  = Math.max(this.phrasecount,  other.phrasecount);
    this.posintext    = Math.max(this.posintext,    other.posintext);
    this.posinphrase  = Math.max(this.posinphrase,  other.posinphrase);
    this.posofphrase  = Math.max(this.posofphrase,  other.posofphrase);
    this.worddistance = Math.max(this.worddistance, other.worddistance);
    this.lastModified = Math.max(this.lastModified, other.lastModified);
    this.quality      = Math.max(this.quality,      other.quality);
}
/**
 * Rescales every numeric attribute of this entry relative to the given bounds.
 * An attribute that is 0 stays 0; any other value v becomes
 * 1 + 255 * (v - min) / (1 + max - min), evaluated with integer division.
 * This entry is modified in place.
 *
 * @param min entry holding the lower bound of every attribute
 * @param max entry holding the upper bound of every attribute
 */
public void normalize(plasmaWordIndexEntry min, plasmaWordIndexEntry max) {
    // zero-valued attributes are left untouched, so the bounds are only read for non-zero values
    if (this.hitcount != 0) {
        this.hitcount = 1 + 255 * (this.hitcount - min.hitcount) / (1 + max.hitcount - min.hitcount);
    }
    if (this.wordcount != 0) {
        this.wordcount = 1 + 255 * (this.wordcount - min.wordcount) / (1 + max.wordcount - min.wordcount);
    }
    if (this.phrasecount != 0) {
        this.phrasecount = 1 + 255 * (this.phrasecount - min.phrasecount) / (1 + max.phrasecount - min.phrasecount);
    }
    if (this.posintext != 0) {
        this.posintext = 1 + 255 * (this.posintext - min.posintext) / (1 + max.posintext - min.posintext);
    }
    if (this.posinphrase != 0) {
        this.posinphrase = 1 + 255 * (this.posinphrase - min.posinphrase) / (1 + max.posinphrase - min.posinphrase);
    }
    if (this.posofphrase != 0) {
        this.posofphrase = 1 + 255 * (this.posofphrase - min.posofphrase) / (1 + max.posofphrase - min.posofphrase);
    }
    if (this.worddistance != 0) {
        this.worddistance = 1 + 255 * (this.worddistance - min.worddistance) / (1 + max.worddistance - min.worddistance);
    }
    if (this.lastModified != 0) {
        this.lastModified = 1 + 255 * (this.lastModified - min.lastModified) / (1 + max.lastModified - min.lastModified);
    }
    if (this.quality != 0) {
        this.quality = 1 + 255 * (this.quality - min.quality) / (1 + max.quality - min.quality);
    }
}
/** @return the URL hash stored in this entry */
public String getUrlHash() {
    return this.urlHash;
}
/** @return the quality value stored in this entry */
public int getQuality() {
    return this.quality;
}
/**
 * Returns the result of plasmaWordIndex.microDateDays applied to lastModified.
 * NOTE(review): normalize() rescales lastModified in place; confirm that
 * microDateDays is still meaningful for an entry that has been normalized.
 *
 * @return the value computed by plasmaWordIndex.microDateDays(lastModified)
 */
public int getVirtualAge() {
    return plasmaWordIndex.microDateDays(this.lastModified);
}

Loading…
Cancel
Save