From 8a0e35618b7a216955fb8d8be3228d02c18106d1 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 6 Sep 2006 17:51:28 +0000 Subject: [PATCH] enhancements to search result preparation - added detailed count on remote search results - enhanced search sequence during remote searches (doing local search in sequence) - strict adherence to timeout limits git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2497 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/search.html | 7 +- htroot/yacy/search.java | 55 ++++-- .../de/anomic/plasma/plasmaSearchEvent.java | 164 ++++++++++++------ source/de/anomic/yacy/yacySearch.java | 3 +- 4 files changed, 149 insertions(+), 80 deletions(-) diff --git a/htroot/yacy/search.html b/htroot/yacy/search.html index ddb735807..b6e8ec7c0 100644 --- a/htroot/yacy/search.html +++ b/htroot/yacy/search.html @@ -1,10 +1,11 @@ version=#[version]# uptime=#[uptime]# -count=#[linkcount]# -total=#[totalcount]# fwhop=#[fwhop]# fwsrc=#[fwsrc]# fwrec=#[fwrec]# searchtime=#[searchtime]# references=#[references]# -#[links]# \ No newline at end of file +joincount=#[joincount]# +count=#[linkcount]# +#[links]# +#[indexcount]# \ No newline at end of file diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 19d6e8531..b32cde756 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -48,7 +48,11 @@ // if the shell's current path is htroot/yacy import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + import de.anomic.http.httpHeader; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntryAttribute; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchEvent; @@ -117,18 +121,33 @@ public final class search { plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults); plasmaSearchTimingProfile remoteTiming = null; plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, 
yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache); - plasmaSearchResult acc = null; - int idxc = 0; - idxc = theSearch.localSearch(); - acc = theSearch.order(); - - // result is a List of urlEntry elements - if ((idxc == 0) || (acc == null)) { - prop.put("totalcount", "0"); + Set containers = theSearch.localSearchContainers(); + indexContainer localResults = theSearch.localSearchJoin(containers); + int joincount = localResults.size(); + plasmaSearchResult acc = theSearch.order(localResults); + + // set statistic details of search result + prop.put("joincount", Integer.toString(joincount)); + if (containers == null) { + prop.put("indexcount", ""); + } else { + Iterator ci = containers.iterator(); + StringBuffer indexcount = new StringBuffer(); + while (ci.hasNext()) { + indexContainer container = (indexContainer) ci.next(); + indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString); + } + prop.put("indexcount", new String(indexcount)); + } + + + if ((joincount == 0) || (acc == null)) { + prop.put("links", ""); prop.put("linkcount", "0"); prop.put("references", ""); } else { - prop.put("totalcount", Integer.toString(acc.sizeOrdered())); + + // result is a List of urlEntry elements int i = 0; StringBuffer links = new StringBuffer(); String resource = ""; @@ -147,12 +166,12 @@ public final class search { resource = urlentry.toString(); } if (resource != null) { - links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString); + links.append("resource").append(i).append('=').append(resource).append(serverCore.crlfString); i++; } } } - prop.put("links", links.toString()); + prop.put("links", new String(links)); prop.put("linkcount", Integer.toString(i)); // prepare reference hints @@ -161,17 +180,15 @@ public final class search { for (int j = 0; j < ws.length; j++) refstr.append(",").append((String) ws[j]); prop.put("references", 
(refstr.length() > 0) ? refstr.substring(1) : refstr.toString()); - - // add information about forward peers - prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result - prop.put("fwsrc", ""); // peers that helped to construct this result - prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations) - - } + // add information about forward peers + prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result + prop.put("fwsrc", ""); // peers that helped to construct this result + prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations) + // log - yacyCore.log.logInfo("EXIT HASH SEARCH: " + squery.queryHashes + " - " + idxc + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp1) / 1000) + " seconds"); + yacyCore.log.logInfo("EXIT HASH SEARCH: " + squery.queryHashes + " - " + joincount + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp1) / 1000) + " seconds"); prop.put("searchtime", Long.toString(System.currentTimeMillis() - timestamp)); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 77450ae72..bb760f184 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -49,7 +49,6 @@ import java.io.IOException; import de.anomic.kelondro.kelondroException; import de.anomic.server.logging.serverLog; -import de.anomic.server.serverInstantThread; import de.anomic.yacy.yacySearch; import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; @@ -67,7 +66,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { private plasmaWordIndex wordIndex; private plasmaCrawlLURL urlStore; private plasmaSnippetCache snippetCache; - private indexContainer 
rcLocal, rcGlobal; // caches for results + private indexContainer rcGlobal; // cache for results private int rcGlobalCount; private plasmaSearchTimingProfile profileLocal, profileGlobal; private yacySearch[] searchThreads; @@ -86,7 +85,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.ranking = ranking; this.urlStore = urlStore; this.snippetCache = snippetCache; - this.rcLocal = new indexRowSetContainer(null); this.rcGlobal = new indexRowSetContainer(null); this.rcGlobalCount = 0; this.profileLocal = localTiming; @@ -121,38 +119,50 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // remember time long start = System.currentTimeMillis(); - // first trigger a local search within a separate thread - serverInstantThread.oneTimeJob(this, "localSearch", log, 0); - // do a global search - int globalContributions = globalSearch(fetchpeers); + // the result of the fetch is then in the rcGlobal + if (fetchpeers < 10) fetchpeers = 10; + + log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS"); + + long timeout = System.currentTimeMillis() + profileGlobal.duetime(); + searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); + + // meanwhile do a local search + indexContainer rcLocal = localSearchJoin(localSearchContainers()); + plasmaSearchResult localResult = orderLocal(rcLocal, timeout); + + // catch up global results: + // wait until wanted delay passed or wanted result appeared + while (System.currentTimeMillis() < timeout) { + // check if all threads have been finished or results so far are enough + //if (rcGlobal.size() >= profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) * 5) break; // 
we have enough + if (yacySearch.remainingWaiting(searchThreads) == 0) break; // we cannot expect more + // wait a little time .. + try {Thread.sleep(100);} catch (InterruptedException e) {} + } + int globalContributions = rcGlobal.size(); + + // finished searching log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); // combine the result and order - plasmaSearchResult result = order(); + plasmaSearchResult result = ((globalContributions == 0) && (localResult.sizeOrdered() != 0)) ? localResult : order(rcLocal); result.globalContributions = globalContributions; result.localContributions = rcLocal.size(); - flushGlobalResults(); // make these values available for immediate next search - + // flush results in a separate thread this.start(); // start to flush results - // serverInstantThread.oneTimeJob(this, "flushResults", log, 0); - - // clean up - rcLocal = null; // return search result log.logFine("SEARCHRESULT: " + profileLocal.reportToString()); lastEvent = this; return result; } else { - localSearch(); - plasmaSearchResult result = order(); + indexContainer rcLocal = localSearchJoin(localSearchContainers()); + plasmaSearchResult result = order(rcLocal); result.localContributions = rcLocal.size(); - // clean up - rcLocal = null; - // return search result log.logFine("SEARCHRESULT: " + profileLocal.reportToString()); lastEvent = this; @@ -160,9 +170,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { } } } - - public int localSearch() { - // search for the set of hashes and return an array of urlEntry elements + + public Set localSearchContainers() { + // search for the set of hashes and return the set of containers containing the seach result // retrieve entities that belong to the hashes profileLocal.startTimer(); @@ -175,48 +185,29 @@ public final class plasmaSearchEvent extends Thread implements Runnable { 
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_COLLECTION); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_COLLECTION, (containers == null) ? 0 : containers.size()); - // since this is a conjunction we return an empty entity if any word - // is not known + return containers; + } + + public indexContainer localSearchJoin(Set containers) { + // join a search result and return the joincount (number of pages after join) + + // since this is a conjunction we return an empty entity if any word is not known if (containers == null) { - rcLocal = new indexRowSetContainer(null); - return 0; + return new indexRowSetContainer(null); } // join the result profileLocal.startTimer(); - rcLocal = indexRowSetContainer.joinContainer(containers, + indexContainer rcLocal = indexRowSetContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_JOIN), query.maxDistance); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_JOIN); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_JOIN, rcLocal.size()); - return rcLocal.size(); - + return rcLocal; } - public int globalSearch(int fetchpeers) { - // do global fetching - // the result of the fetch is then in the rcGlobal - if (fetchpeers < 10) fetchpeers = 10; - - log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS"); - - long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000; - searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); - - // wait until wanted delay passed or wanted result appeared - while (System.currentTimeMillis() < timeout) { - // check if all threads have been finished or results so far are enough - if (rcGlobal.size() >= 
profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) * 5) break; // we have enough - if (yacySearch.remainingWaiting(searchThreads) == 0) break; // we cannot expect more - // wait a little time .. - try {Thread.sleep(100);} catch (InterruptedException e) {} - } - - return rcGlobal.size(); - } - - public plasmaSearchResult order() { + public plasmaSearchResult order(indexContainer rcLocal) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime @@ -247,7 +238,66 @@ public final class plasmaSearchEvent extends Thread implements Runnable { int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); try { while (preorder.hasNext()) { - if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break; + //if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break; + if (acc.sizeFetched() >= minEntries) break; + if (System.currentTimeMillis() >= postorderLimitTime) break; + entry = preorder.next(); + // find the url entry + try { + page = urlStore.getEntry(entry.urlHash(), entry); + // add a result + acc.addResult(entry, page); + } catch (IOException e) { + // result was not found + } + } + } catch (kelondroException ee) { + serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); + } + profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_URLFETCH); + profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_URLFETCH, acc.sizeFetched()); + + // start postsorting + profileLocal.startTimer(); + acc.sortResults(); + profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_POSTSORT); + profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_POSTSORT, acc.sizeOrdered()); + + // apply filter + profileLocal.startTimer(); + 
//acc.removeRedundant(); + acc.removeDoubleDom(); + profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER); + profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered()); + + return acc; + } + + private plasmaSearchResult orderLocal(indexContainer rcLocal, long maxtime) { + // we collect the urlhashes and construct a list with urlEntry objects + // attention: if minEntries is too high, this method will not terminate within the maxTime + + profileLocal.startTimer(); + if (maxtime < 0) maxtime = 200; + plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking); + preorder.addContainer(rcLocal, maxtime); + profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT); + profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size()); + + // start url-fetch + maxtime = Math.max(200, maxtime - profileLocal.getYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT)); + long postorderLimitTime = System.currentTimeMillis() + maxtime; + profileLocal.startTimer(); + plasmaSearchResult acc = new plasmaSearchResult(query, ranking); + + indexEntry entry; + plasmaCrawlLURL.Entry page; + int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); + try { + while (preorder.hasNext()) { + //if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break; + if (acc.sizeFetched() >= minEntries) break; + if (System.currentTimeMillis() >= postorderLimitTime) break; entry = preorder.next(); // find the url entry try { @@ -300,7 +350,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { log.logFine("SEARCH FLUSH: " + remaining + " PEERS STILL BUSY; ABANDONED; SEARCH WAS " + query.queryWords); break; } - log.logFine("FINISHED FLUSH RESULTS PROCESS for query " + query.hashes(",")); + //log.logFine("FINISHED FLUSH RESULTS PROCESS for query " + query.hashes(",")); } serverLog.logFine("PLASMA", 
"FINISHED FLUSHING " + rcGlobalCount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords); @@ -319,11 +369,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable { synchronized (rcGlobal) { String wordHash; Iterator hashi = query.queryHashes.iterator(); + boolean dhtCache = false; while (hashi.hasNext()) { wordHash = (String) hashi.next(); rcGlobal.setWordHash(wordHash); - wordIndex.addEntries(rcGlobal, System.currentTimeMillis(), false); - log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries"); + dhtCache = dhtCache | wordIndex.busyCacheFlush; + wordIndex.addEntries(rcGlobal, System.currentTimeMillis(), dhtCache); + log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries to " + ((dhtCache) ? "DHT cache" : "word cache")); } // the rcGlobal was flushed, empty it count += rcGlobal.size(); diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index e4fc35f41..08fbd905b 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -198,8 +198,7 @@ public class yacySearch extends Thread { searchThreads[i]= new yacySearch(wordhashes, prefer, filter, maxDist, true, targetPeers[i], urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile); searchThreads[i].start(); - try {Thread.sleep(20);} catch (InterruptedException e) {} - + //try {Thread.sleep(20);} catch (InterruptedException e) {} } return searchThreads; }