enhancements to search result preparation

- added detailed count on remote search results
- enhanced search sequence during remote searches (doing local search in sequence)
- strict adherence to timeout limits

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2497 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 5c1bb53d2a
commit 8a0e35618b

@ -1,10 +1,11 @@
version=#[version]#
uptime=#[uptime]#
count=#[linkcount]#
total=#[totalcount]#
fwhop=#[fwhop]#
fwsrc=#[fwsrc]#
fwrec=#[fwrec]#
searchtime=#[searchtime]#
references=#[references]#
#[links]#
joincount=#[joincount]#
count=#[linkcount]#
#[links]#
#[indexcount]#

@ -48,7 +48,11 @@
// if the shell's current path is htroot/yacy
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntryAttribute;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchEvent;
@ -117,18 +121,33 @@ public final class search {
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
plasmaSearchTimingProfile remoteTiming = null;
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
plasmaSearchResult acc = null;
int idxc = 0;
idxc = theSearch.localSearch();
acc = theSearch.order();
// result is a List of urlEntry elements
if ((idxc == 0) || (acc == null)) {
prop.put("totalcount", "0");
Set containers = theSearch.localSearchContainers();
indexContainer localResults = theSearch.localSearchJoin(containers);
int joincount = localResults.size();
plasmaSearchResult acc = theSearch.order(localResults);
// set statistic details of search result
prop.put("joincount", Integer.toString(joincount));
if (containers == null) {
prop.put("indexcount", "");
} else {
Iterator ci = containers.iterator();
StringBuffer indexcount = new StringBuffer();
while (ci.hasNext()) {
indexContainer container = (indexContainer) ci.next();
indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString);
}
prop.put("indexcount", new String(indexcount));
}
if ((joincount == 0) || (acc == null)) {
prop.put("links", "");
prop.put("linkcount", "0");
prop.put("references", "");
} else {
prop.put("totalcount", Integer.toString(acc.sizeOrdered()));
// result is a List of urlEntry elements
int i = 0;
StringBuffer links = new StringBuffer();
String resource = "";
@ -147,12 +166,12 @@ public final class search {
resource = urlentry.toString();
}
if (resource != null) {
links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString);
links.append("resource").append(i).append('=').append(resource).append(serverCore.crlfString);
i++;
}
}
}
prop.put("links", links.toString());
prop.put("links", new String(links));
prop.put("linkcount", Integer.toString(i));
// prepare reference hints
@ -161,17 +180,15 @@ public final class search {
for (int j = 0; j < ws.length; j++)
refstr.append(",").append((String) ws[j]);
prop.put("references", (refstr.length() > 0) ? refstr.substring(1) : refstr.toString());
// add information about forward peers
prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result
prop.put("fwsrc", ""); // peers that helped to construct this result
prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations)
}
// add information about forward peers
prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result
prop.put("fwsrc", ""); // peers that helped to construct this result
prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations)
// log
yacyCore.log.logInfo("EXIT HASH SEARCH: " + squery.queryHashes + " - " + idxc + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp1) / 1000) + " seconds");
yacyCore.log.logInfo("EXIT HASH SEARCH: " + squery.queryHashes + " - " + joincount + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp1) / 1000) + " seconds");
prop.put("searchtime", Long.toString(System.currentTimeMillis() - timestamp));

@ -49,7 +49,6 @@ import java.io.IOException;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverInstantThread;
import de.anomic.yacy.yacySearch;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
@ -67,7 +66,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
private plasmaWordIndex wordIndex;
private plasmaCrawlLURL urlStore;
private plasmaSnippetCache snippetCache;
private indexContainer rcLocal, rcGlobal; // caches for results
private indexContainer rcGlobal; // cache for results
private int rcGlobalCount;
private plasmaSearchTimingProfile profileLocal, profileGlobal;
private yacySearch[] searchThreads;
@ -86,7 +85,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.ranking = ranking;
this.urlStore = urlStore;
this.snippetCache = snippetCache;
this.rcLocal = new indexRowSetContainer(null);
this.rcGlobal = new indexRowSetContainer(null);
this.rcGlobalCount = 0;
this.profileLocal = localTiming;
@ -121,38 +119,50 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// remember time
long start = System.currentTimeMillis();
// first trigger a local search within a separate thread
serverInstantThread.oneTimeJob(this, "localSearch", log, 0);
// do a global search
int globalContributions = globalSearch(fetchpeers);
// the result of the fetch is then in the rcGlobal
if (fetchpeers < 10) fetchpeers = 10;
log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");
long timeout = System.currentTimeMillis() + profileGlobal.duetime();
searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);
// meanwhile do a local search
indexContainer rcLocal = localSearchJoin(localSearchContainers());
plasmaSearchResult localResult = orderLocal(rcLocal, timeout);
// catch up global results:
// wait until wanted delay passed or wanted result appeared
while (System.currentTimeMillis() < timeout) {
// check if all threads have been finished or results so far are enough
//if (rcGlobal.size() >= profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) * 5) break; // we have enough
if (yacySearch.remainingWaiting(searchThreads) == 0) break; // we cannot expect more
// wait a little time ..
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
int globalContributions = rcGlobal.size();
// finished searching
log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
// combine the result and order
plasmaSearchResult result = order();
plasmaSearchResult result = ((globalContributions == 0) && (localResult.sizeOrdered() != 0)) ? localResult : order(rcLocal);
result.globalContributions = globalContributions;
result.localContributions = rcLocal.size();
flushGlobalResults(); // make these values available for immediate next search
// flush results in a separate thread
this.start(); // start to flush results
// serverInstantThread.oneTimeJob(this, "flushResults", log, 0);
// clean up
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
} else {
localSearch();
plasmaSearchResult result = order();
indexContainer rcLocal = localSearchJoin(localSearchContainers());
plasmaSearchResult result = order(rcLocal);
result.localContributions = rcLocal.size();
// clean up
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
@ -160,9 +170,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
}
}
}
public int localSearch() {
// search for the set of hashes and return an array of urlEntry elements
public Set localSearchContainers() {
// search for the set of hashes and return the set of containers containing the search result
// retrieve entities that belong to the hashes
profileLocal.startTimer();
@ -175,48 +185,29 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_COLLECTION);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_COLLECTION, (containers == null) ? 0 : containers.size());
// since this is a conjunction we return an empty entity if any word
// is not known
return containers;
}
// Joins the per-word containers of a local search into a single container via
// indexRowSetContainer.joinContainer, and records the time spent and the size of the
// join in profileLocal under PROCESS_JOIN.
// NOTE(review): this span looks like a rendered diff in which removed and added lines sit
// side by side (two consecutive return statements, two assignments of the join result).
// It will not compile as shown; confirm which lines belong to the current revision.
public indexContainer localSearchJoin(Set containers) {
// join a search result and return the joincount (number of pages after join)
// NOTE(review): the declared return type is indexContainer, so the "joincount" wording
// above presumably describes the older int-returning variant — confirm.
// since this is a conjunction we return an empty entity if any word is not known
if (containers == null) {
// no containers at all: answer with an empty result
rcLocal = new indexRowSetContainer(null);
return 0;
return new indexRowSetContainer(null);
}
// join the result
profileLocal.startTimer();
// the join is given the PROCESS_JOIN target time from the timing profile and
// query.maxDistance as its remaining arguments
rcLocal = indexRowSetContainer.joinContainer(containers,
indexContainer rcLocal = indexRowSetContainer.joinContainer(containers,
profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_JOIN),
query.maxDistance);
// record elapsed time and number of joined entries for the PROCESS_JOIN phase
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_JOIN);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_JOIN, rcLocal.size());
return rcLocal.size();
return rcLocal;
}
/**
 * Performs the global (remote peer) fetch; the fetched entries end up in rcGlobal.
 *
 * Starts the remote search threads through yacySearch.searchHashes and then polls
 * in 100 ms steps until one of the following holds: the deadline (the profile's
 * due time plus 4000 ms) has passed, rcGlobal holds at least five times the
 * PROCESS_POSTSORT target count, or no search thread is still waiting.
 *
 * @param fetchpeers requested number of peers; values below 10 are raised to 10
 * @return the size of rcGlobal once waiting has ended
 */
public int globalSearch(int fetchpeers) {
    // never use fewer than 10 peers
    fetchpeers = Math.max(fetchpeers, 10);

    log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");

    // the deadline is fixed before the search threads are started
    long deadline = System.currentTimeMillis() + profileGlobal.duetime() + 4000;
    searchThreads = yacySearch.searchHashes(
            query.queryHashes, query.prefer, query.urlMask, query.maxDistance,
            urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist,
            snippetCache, profileGlobal, ranking);

    // keep waiting while there is time left, the results are not yet sufficient
    // and at least one thread may still deliver something
    while (System.currentTimeMillis() < deadline
            && rcGlobal.size() < profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) * 5
            && yacySearch.remainingWaiting(searchThreads) != 0) {
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            // interruption is ignored; the loop conditions are simply checked again
        }
    }

    return rcGlobal.size();
}
public plasmaSearchResult order() {
public plasmaSearchResult order(indexContainer rcLocal) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
@ -247,7 +238,66 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try {
while (preorder.hasNext()) {
if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
//if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
if (acc.sizeFetched() >= minEntries) break;
if (System.currentTimeMillis() >= postorderLimitTime) break;
entry = preorder.next();
// find the url entry
try {
page = urlStore.getEntry(entry.urlHash(), entry);
// add a result
acc.addResult(entry, page);
} catch (IOException e) {
// result was not found
}
}
} catch (kelondroException ee) {
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
}
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_URLFETCH);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_URLFETCH, acc.sizeFetched());
// start postsorting
profileLocal.startTimer();
acc.sortResults();
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_POSTSORT);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_POSTSORT, acc.sizeOrdered());
// apply filter
profileLocal.startTimer();
//acc.removeRedundant();
acc.removeDoubleDom();
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());
return acc;
}
private plasmaSearchResult orderLocal(indexContainer rcLocal, long maxtime) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
profileLocal.startTimer();
if (maxtime < 0) maxtime = 200;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking);
preorder.addContainer(rcLocal, maxtime);
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size());
// start url-fetch
maxtime = Math.max(200, maxtime - profileLocal.getYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT));
long postorderLimitTime = System.currentTimeMillis() + maxtime;
profileLocal.startTimer();
plasmaSearchResult acc = new plasmaSearchResult(query, ranking);
indexEntry entry;
plasmaCrawlLURL.Entry page;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try {
while (preorder.hasNext()) {
//if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
if (acc.sizeFetched() >= minEntries) break;
if (System.currentTimeMillis() >= postorderLimitTime) break;
entry = preorder.next();
// find the url entry
try {
@ -300,7 +350,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
log.logFine("SEARCH FLUSH: " + remaining + " PEERS STILL BUSY; ABANDONED; SEARCH WAS " + query.queryWords);
break;
}
log.logFine("FINISHED FLUSH RESULTS PROCESS for query " + query.hashes(","));
//log.logFine("FINISHED FLUSH RESULTS PROCESS for query " + query.hashes(","));
}
serverLog.logFine("PLASMA", "FINISHED FLUSHING " + rcGlobalCount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords);
@ -319,11 +369,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
synchronized (rcGlobal) {
String wordHash;
Iterator hashi = query.queryHashes.iterator();
boolean dhtCache = false;
while (hashi.hasNext()) {
wordHash = (String) hashi.next();
rcGlobal.setWordHash(wordHash);
wordIndex.addEntries(rcGlobal, System.currentTimeMillis(), false);
log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries");
dhtCache = dhtCache | wordIndex.busyCacheFlush;
wordIndex.addEntries(rcGlobal, System.currentTimeMillis(), dhtCache);
log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries to " + ((dhtCache) ? "DHT cache" : "word cache"));
}
// the rcGlobal was flushed, empty it
count += rcGlobal.size();

@ -198,8 +198,7 @@ public class yacySearch extends Thread {
searchThreads[i]= new yacySearch(wordhashes, prefer, filter, maxDist, true, targetPeers[i],
urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile);
searchThreads[i].start();
try {Thread.sleep(20);} catch (InterruptedException e) {}
//try {Thread.sleep(20);} catch (InterruptedException e) {}
}
return searchThreads;
}

Loading…
Cancel
Save