// REVIEW NOTES (annotation only; the patch text below is unchanged).
//
// Git-format unified diff touching three files under source/de/anomic/plasma/:
//
// 1. plasmaSearchEvent.java
//    - adds an int field rcGlobalCount, set to 0 in the constructor.
//    - replaces `public int flushResults()` with `public void flushGlobalResults()`;
//      instead of returning its local count, the method now adds it to rcGlobalCount.
//    - the two call sites visible in this patch are switched to flushGlobalResults(); the local
//      `allcount` accumulator is removed and the "FINISHED FLUSHING" log line prints rcGlobalCount.
//    - the wait loop's "we have enough" threshold changes from 3x to 5x
//      profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT).
//
// 2. plasmaSearchPreOrder.java
//    - adds fields entryMin / entryMax, set to null in the constructor.
//    - opens a block comment right before the existing addContainer(...) and closes it after the
//      old ranking code, so that code stays in the file but is disabled.
//    - adds a new two-pass addContainer(container, maxTime). A negative maxTime means no time limit.
//      Pass one walks container.entries() until the limit is exceeded, folding every entry into
//      entryMin / entryMax and counting it. Pass two re-iterates the first `count` entries and calls
//      entry.normalize(entryMin, entryMax) followed by addEntry(entry).
//    - adds addEntry(indexEntry): builds a long ranking from query.order[0..2] (quality, virtual age
//      or ybr_p, each shifted left by 4 * (3 - i)) plus posintext, worddistance and hitcount terms,
//      and stores the entry in pageAcc under serverCodings.encodeHex(ranking, 16) + url hash.
//    - ybr_p changes from `16 - ybr(urlHash)` to `16 * (16 - ybr(urlHash))`.
//
// 3. plasmaWordIndexEntry.java
//    - the class now implements Cloneable; clone() returns
//      new plasmaWordIndexEntry(this.toExternalForm()).
//    - min(other) / max(other) lower / raise hitcount, wordcount, phrasecount, posintext,
//      posinphrase, posofphrase, worddistance, lastModified and quality towards `other`.
//    - normalize(min, max) rewrites each of those fields in place: 0 stays 0, any other value v
//      becomes 1 + 255 * (v - min) / (1 + max - min).
//
// NOTE(review): in this copy the patch's line breaks are collapsed into spaces (hunk headers and
//   hunk bodies share physical lines); it presumably has to be re-wrapped before a patch tool
//   will accept it.
// NOTE(review): entryMin / entryMax are only reset in the constructor, so they carry over between
//   addContainer calls on the same instance; entries added by different calls are then normalized
//   against different ranges - confirm this is intended.
// NOTE(review): normalize() overwrites the fields of the entries handed out by container.entries();
//   whether those are copies or the container's own objects is not visible here - verify.
// NOTE(review): getVirtualAge() is microDateDays(lastModified), and normalize() replaces
//   lastModified before addEntry reads it for ORDER_DATE - confirm the normalized value is what
//   should be ranked.
// NOTE(review): the locking around `rcGlobalCount += count` is not visible in the hunk;
//   flushGlobalResults() is invoked both right before this.start() and from the loop that waits
//   for the search threads - check for a race on the counter.
//
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 684ffdff5..161bb23b6 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -64,6 +64,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { private plasmaCrawlLURL urlStore; private plasmaSnippetCache snippetCache; private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results + private int rcGlobalCount; private plasmaSearchProfile profileLocal, profileGlobal; private yacySearch[] searchThreads; @@ -75,6 +76,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.snippetCache = snippetCache; this.rcLocal = new plasmaWordIndexEntryContainer(null); this.rcGlobal = new plasmaWordIndexEntryContainer(null); + this.rcGlobalCount = 0; if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) { this.profileLocal = new plasmaSearchProfile(4 * query.maximumTime / 10, query.wantedResults); this.profileGlobal = new plasmaSearchProfile(6 * query.maximumTime / 10, query.wantedResults); @@ -118,7 +120,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { plasmaSearchResult result = order(); result.globalContributions = globalContributions; result.localContributions = rcLocal.size(); - flushResults(); + flushGlobalResults(); // make these values available for immediate next search // flush results in a separate thread this.start(); // start to flush results @@ -184,7 +186,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // wait until wanted delay passed or wanted result appeared while (System.currentTimeMillis() < timeout) { // check if all threads have been finished or results so far are enough - if (rcGlobal.size() >= profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) * 3) break; // we have enough + if (rcGlobal.size() >= 
profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) * 5) break; // we have enough if (yacySearch.remainingWaiting(searchThreads) == 0) break; // we cannot expect more // wait a little time .. try {Thread.sleep(100);} catch (InterruptedException e) {} @@ -264,10 +266,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // this method waits until all threads are finished int remaining; - int allcount = 0; long starttime = System.currentTimeMillis(); while ((searchThreads != null) && ((remaining = yacySearch.remainingWaiting(searchThreads)) > 0)) { - allcount += flushResults(); + flushGlobalResults(); // wait a little bit before trying again try {Thread.sleep(3000);} catch (InterruptedException e) {} @@ -279,7 +280,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { log.logFine("FINISHED FLUSH RESULTS PROCESS for query " + query.hashes(",")); } - serverLog.logFine("PLASMA", "FINISHED FLUSHING " + allcount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords); + serverLog.logFine("PLASMA", "FINISHED FLUSHING " + rcGlobalCount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords); // finally delete the temporary index rcGlobal = null; @@ -287,7 +288,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { flushThreads.remove(this); } - public int flushResults() { + public void flushGlobalResults() { // flush the rcGlobal as much as is there so far // this must be called sometime after search results had been computed int count = 0; @@ -306,7 +307,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { rcGlobal.clear(); } } - return count; + rcGlobalCount += count; } } diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 90ebb0af6..ba1a759f6 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -56,6 +56,7 @@ public 
final class plasmaSearchPreOrder { public static kelondroBinSearch[] ybrTables = null; // block-rank tables private static boolean useYBR = true; + private plasmaWordIndexEntry entryMin, entryMax; private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private plasmaSearchQuery query; @@ -96,6 +97,8 @@ public final class plasmaSearchPreOrder { } public plasmaSearchPreOrder(plasmaSearchQuery query) { + entryMin = null; + entryMax = null; this.pageAcc = new TreeMap(); this.query = query; } @@ -116,6 +119,7 @@ public final class plasmaSearchPreOrder { return (plasmaWordIndexEntry) pageAcc.remove(top); } + /* public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) { Iterator i = container.entries(); long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; @@ -142,9 +146,48 @@ public final class plasmaSearchPreOrder { ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance()); pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry); } + */ + + public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) { + long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; + plasmaWordIndexEntry entry; + + // first pass: find min/max to obtain limits for normalization + Iterator i = container.entries(); + int count = 0; + while (i.hasNext()) { + if (System.currentTimeMillis() > limitTime) break; + entry = (plasmaWordIndexEntry) i.next(); + if (entryMin == null) entryMin = (plasmaWordIndexEntry) entry.clone(); else entryMin.min(entry); + if (entryMax == null) entryMax = (plasmaWordIndexEntry) entry.clone(); else entryMax.max(entry); + count++; + } + + // second pass: normalize entries + i = container.entries(); + for (int j = 0; j < count; j++) { + entry = (plasmaWordIndexEntry) i.next(); + entry.normalize(entryMin, entryMax); + addEntry(entry); + } + } + + public void addEntry(plasmaWordIndexEntry indexEntry) { + long ranking = 0; + + for (int i = 0; i < 3; i++) { + if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality() << (4 * (3 - i)); + else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += indexEntry.getVirtualAge() << (4 * (3 - i)); + else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += ybr_p(indexEntry.getUrlHash()) << (4 * (3 - i)); + } + ranking += (indexEntry.posintext() == 0) ? 0 : (255 - indexEntry.posintext()) << 11; + ranking += (indexEntry.worddistance() == 0) ? 0 : (255 - indexEntry.worddistance()) << 10; + ranking += (indexEntry.hitcount() == 0) ? 
0 : indexEntry.hitcount() << 9; + pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry); + } public static int ybr_p(String urlHash) { - return 16 - ybr(urlHash); + return 16 * (16 - ybr(urlHash)); } public static int ybr(String urlHash) { diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 9cec2ac34..4d023f0bf 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -56,7 +56,7 @@ import de.anomic.server.serverCodings; import de.anomic.yacy.yacySeedDB; // import de.anomic.server.logging.serverLog; -public final class plasmaWordIndexEntry { +public final class plasmaWordIndexEntry implements Cloneable { // an wordEntry can be filled in either of two ways: // by the discrete values of the entry @@ -297,6 +297,10 @@ public final class plasmaWordIndexEntry { this.localflag = pr.getProperty("f", ""+LT_LOCAL).charAt(0); } + public Object clone() { + return new plasmaWordIndexEntry(this.toExternalForm()); + } + public String toEncodedForm() { // attention: this integrates NOT the URL hash into the encoding // if you need a complete dump, use toExternalForm() @@ -348,6 +352,42 @@ public final class plasmaWordIndexEntry { this.wordcount = (this.wordcount + oe.wordcount) / 2; } + public void min(plasmaWordIndexEntry other) { + if (this.hitcount > other.hitcount) this.hitcount = other.hitcount; + if (this.wordcount > other.wordcount) this.wordcount = other.wordcount; + if (this.phrasecount > other.phrasecount) this.phrasecount = other.phrasecount; + if (this.posintext > other.posintext) this.posintext = other.posintext; + if (this.posinphrase > other.posinphrase) this.posinphrase = other.posinphrase; + if (this.posofphrase > other.posofphrase) this.posofphrase = other.posofphrase; + if (this.worddistance > other.worddistance) this.worddistance = other.worddistance; + if (this.lastModified > other.lastModified) 
this.lastModified = other.lastModified; + if (this.quality > other.quality) this.quality = other.quality; + } + + public void max(plasmaWordIndexEntry other) { + if (this.hitcount < other.hitcount) this.hitcount = other.hitcount; + if (this.wordcount < other.wordcount) this.wordcount = other.wordcount; + if (this.phrasecount < other.phrasecount) this.phrasecount = other.phrasecount; + if (this.posintext < other.posintext) this.posintext = other.posintext; + if (this.posinphrase < other.posinphrase) this.posinphrase = other.posinphrase; + if (this.posofphrase < other.posofphrase) this.posofphrase = other.posofphrase; + if (this.worddistance < other.worddistance) this.worddistance = other.worddistance; + if (this.lastModified < other.lastModified) this.lastModified = other.lastModified; + if (this.quality < other.quality) this.quality = other.quality; + } + + public void normalize(plasmaWordIndexEntry min, plasmaWordIndexEntry max) { + this.hitcount = (this.hitcount == 0) ? 0 : 1 + 255 * (this.hitcount - min.hitcount ) / (1 + max.hitcount - min.hitcount); + this.wordcount = (this.wordcount == 0) ? 0 : 1 + 255 * (this.wordcount - min.wordcount ) / (1 + max.wordcount - min.wordcount); + this.phrasecount = (this.phrasecount == 0) ? 0 : 1 + 255 * (this.phrasecount - min.phrasecount ) / (1 + max.phrasecount - min.phrasecount); + this.posintext = (this.posintext == 0) ? 0 : 1 + 255 * (this.posintext - min.posintext ) / (1 + max.posintext - min.posintext); + this.posinphrase = (this.posinphrase == 0) ? 0 : 1 + 255 * (this.posinphrase - min.posinphrase ) / (1 + max.posinphrase - min.posinphrase); + this.posofphrase = (this.posofphrase == 0) ? 0 : 1 + 255 * (this.posofphrase - min.posofphrase ) / (1 + max.posofphrase - min.posofphrase); + this.worddistance = (this.worddistance == 0) ? 0 : 1 + 255 * (this.worddistance - min.worddistance) / (1 + max.worddistance - min.worddistance); + this.lastModified = (this.lastModified == 0) ? 
0 : 1 + 255 * (this.lastModified - min.lastModified) / (1 + max.lastModified - min.lastModified); + this.quality = (this.quality == 0) ? 0 : 1 + 255 * (this.quality - min.quality ) / (1 + max.quality - min.quality); + } + public String getUrlHash() { return urlHash; } public int getQuality() { return quality; } public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); }