// plasmaSearchEvent.java // (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 10.10.2005 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedRevision: 1986 $ // $LastChangedBy: orbiter $ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package de.anomic.plasma; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySearch; public final class plasmaSearchEvent { //public static plasmaSearchEvent lastEvent = null; public static String lastEventID = ""; private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes private long eventTime; private plasmaSearchQuery query; private plasmaSearchRankingProfile ranking; private plasmaWordIndex wordIndex; private indexContainer rcLocal; // cache for local results private indexContainer rcGlobal; // cache for global results private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation private plasmaSearchProcessing profileLocal, profileGlobal; private yacySearch[] primarySearchThreads, secondarySearchThreads; private TreeMap preselectedPeerHashes; private int localcount, globalcount; private indexContainer sortedResults; private int lastglobal; private int filteredCount; private ArrayList display; // an array of url hashes of urls that had been displayed as search result after this search private plasmaSearchEvent(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, plasmaSearchProcessing localTiming, plasmaSearchProcessing remoteTiming, plasmaWordIndex wordIndex, TreeMap preselectedPeerHashes) { this.eventTime = System.currentTimeMillis(); // for lifetime check this.wordIndex = wordIndex; this.query = query; this.ranking = ranking; this.rcLocal = null; this.rcGlobal = plasmaWordIndex.emptyContainer(null, 0);; this.rcAbstracts = (query.queryHashes.size() > 1) ? new TreeMap() : null; // generate abstracts only for combined searches this.profileLocal = localTiming; this.profileGlobal = remoteTiming; this.primarySearchThreads = null; this.secondarySearchThreads = null; this.preselectedPeerHashes = preselectedPeerHashes; this.localcount = 0; this.globalcount = 0; this.sortedResults = null; this.lastglobal = 0; this.display = new ArrayList(); long start = System.currentTimeMillis(); if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) || (query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) { int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds if (fetchpeers > 50) fetchpeers = 50; if (fetchpeers < 30) fetchpeers = 30; // do a global search // the result of the fetch is then in the rcGlobal serverLog.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS"); long secondaryTimeout = System.currentTimeMillis() + profileGlobal.duetime() / 3 * 2; long primaryTimeout = System.currentTimeMillis() + profileGlobal.duetime(); primarySearchThreads = yacySearch.primaryRemoteSearches( plasmaSearchQuery.hashSet2hashString(query.queryHashes), plasmaSearchQuery.hashSet2hashString(query.excludeHashes), "", query.prefer, query.urlMask, query.maxDistance, wordIndex, rcGlobal, rcAbstracts, fetchpeers, plasmaSwitchboard.urlBlacklist, profileGlobal, ranking, query.constraint, (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes); // meanwhile do a local search Map[] searchContainerMaps = profileLocal.localSearchContainers(query, wordIndex, null); // use the search containers to fill up rcAbstracts locally /* if ((rcAbstracts != null) && (searchContainerMap != null)) { Iterator i, ci = searchContainerMap.entrySet().iterator(); Map.Entry entry; String wordhash; indexContainer container; TreeMap singleAbstract; String mypeerhash = yacyCore.seedDB.mySeed.hash; while (ci.hasNext()) { entry = (Map.Entry) ci.next(); wordhash = (String) entry.getKey(); container = (indexContainer) entry.getValue(); // collect all urlhashes from the container synchronized (rcAbstracts) { singleAbstract = (TreeMap) rcAbstracts.get(wordhash); // a mapping from url-hashes to a string of peer-hashes if (singleAbstract == null) singleAbstract = new TreeMap(); i = container.entries(); while (i.hasNext()) singleAbstract.put(((indexEntry) i.next()).urlHash(), mypeerhash); rcAbstracts.put(wordhash, singleAbstract); } } } */ // join and exlcude the local result this.rcLocal = (searchContainerMaps == null) ? plasmaWordIndex.emptyContainer(null, 0) : profileLocal.localSearchJoinExclude( searchContainerMaps[0].values(), searchContainerMaps[1].values(), (query.queryHashes.size() == 0) ? 0 : profileLocal.getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size()), query.maxDistance); // sort the local containers and truncate it to a limited count, // so following sortings together with the global results will be fast plasmaSearchPreOrder firstsort = new plasmaSearchPreOrder(query, profileLocal, ranking, rcLocal); rcLocal = firstsort.strippedContainer(200); int prefetchIndex = 0; HashSet unknownURLs = new HashSet(); String urlhash; // while we wait for the first time-out for index abstracts, we fetch urls form the url-db while ((System.currentTimeMillis() < secondaryTimeout) && (prefetchIndex < rcLocal.size())) { if (yacySearch.remainingWaiting(primarySearchThreads) == 0) break; // all threads have finished urlhash = new String(rcLocal.get(prefetchIndex).getColBytes(0)); if (wordIndex.loadedURL.load(urlhash, null) == null) unknownURLs.add(urlhash); prefetchIndex++; } // eventually wait some more time to retrieve index abstracts from primary search while (System.currentTimeMillis() < secondaryTimeout) { if (yacySearch.remainingWaiting(primarySearchThreads) == 0) break; // all threads have finished try {Thread.sleep(100);} catch (InterruptedException e) {} } // evaluate index abstracts and start a secondary search if (rcAbstracts != null) prepareSecondarySearch(); // while we wait for the second time-out for index abstracts, we fetch more urls form the url-db while ((System.currentTimeMillis() < primaryTimeout) && (prefetchIndex < rcLocal.size())) { if (yacySearch.remainingWaiting(primarySearchThreads) == 0) break; // all threads have finished urlhash = new String(rcLocal.get(prefetchIndex).getColBytes(0)); if (wordIndex.loadedURL.load(urlhash, null) == null) unknownURLs.add(urlhash); prefetchIndex++; } // when we have found some non-existing urls in the local collection, we delete them now wordIndex.removeEntriesMultiple(query.queryHashes, unknownURLs); rcLocal.removeEntriesMultiple(query.queryHashes, unknownURLs); localcount = rcLocal.size(); // catch up global results: // wait until primary timeout passed while (System.currentTimeMillis() < primaryTimeout) { if ((yacySearch.remainingWaiting(primarySearchThreads) == 0) && ((secondarySearchThreads == null) || (yacySearch.remainingWaiting(secondarySearchThreads) == 0))) break; // all threads have finished try {Thread.sleep(100);} catch (InterruptedException e) {} } // finished searching serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); } else { Map[] searchContainerMaps = profileLocal.localSearchContainers(query, wordIndex, null); rcLocal = (searchContainerMaps == null) ? plasmaWordIndex.emptyContainer(null, 0) : profileLocal.localSearchJoinExclude( searchContainerMaps[0].values(), searchContainerMaps[1].values(), (query.queryHashes.size() == 0) ? 0 : profileLocal.getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size()), query.maxDistance); this.localcount = rcLocal.size(); } // log the event serverLog.logFine("SEARCH_EVENT", "SEARCHRESULT: " + profileLocal.reportToString()); // set link for statistic //lastEvent = this; // remove old events in the event cache Iterator i = lastEvents.entrySet().iterator(); while (i.hasNext()) { if (((plasmaSearchEvent) ((Map.Entry) i.next()).getValue()).eventTime + eventLifetime < System.currentTimeMillis()) i.remove(); } // store this search to a cache so it can be re-used lastEvents.put(query.id(), this); lastEventID = query.id(); } public plasmaSearchQuery getQuery() { return query; } public plasmaSearchRankingProfile getRanking() { return ranking; } public plasmaSearchProcessing getLocalTiming() { return profileLocal; } public yacySearch[] getPrimarySearchThreads() { return primarySearchThreads; } public yacySearch[] getSecondarySearchThreads() { return secondarySearchThreads; } public int getLocalCount() { return this.localcount; } public int getGlobalCount() { return this.globalcount; } public static plasmaSearchEvent getEvent(String eventID) { return (plasmaSearchEvent) lastEvents.get(eventID); } public static plasmaSearchEvent getEvent(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, plasmaSearchProcessing localTiming, plasmaSearchProcessing remoteTiming, plasmaWordIndex wordIndex, TreeMap preselectedPeerHashes) { plasmaSearchEvent event = (plasmaSearchEvent) lastEvents.get(query.id()); if (event == null) { event = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, wordIndex, preselectedPeerHashes); } else { //re-new the event time for this event, so it is not deleted next time too early event.eventTime = System.currentTimeMillis(); } return event; } public indexContainer search() { // combine the local and global (if any) result and order if ((rcGlobal != null) && (rcGlobal.size() > 0)) { globalcount = rcGlobal.size(); if ((this.sortedResults == null) || (this.lastglobal != globalcount)) { indexContainer searchResult = plasmaWordIndex.emptyContainer(null, rcLocal.size() + rcGlobal.size()); searchResult.addAllUnique(rcLocal); searchResult.addAllUnique(rcGlobal); searchResult.sort(); searchResult.uniq(100); lastglobal = globalcount; plasmaSearchPreOrder pre = new plasmaSearchPreOrder(query, profileLocal, ranking, searchResult); this.filteredCount = pre.filteredCount(); this.sortedResults = pre.strippedContainer(200); } } else { if (this.sortedResults == null) { plasmaSearchPreOrder pre = new plasmaSearchPreOrder(query, profileLocal, ranking, rcLocal); this.filteredCount = pre.filteredCount(); this.sortedResults = pre.strippedContainer(200); } } return this.sortedResults; } public int filteredCount() { return this.filteredCount; } private void prepareSecondarySearch() { // catch up index abstracts and join them; then call peers again to submit their urls System.out.println("DEBUG-INDEXABSTRACT: " + rcAbstracts.size() + " word references catched, " + query.queryHashes.size() + " needed"); if (rcAbstracts.size() != query.queryHashes.size()) return; // secondary search not possible Iterator i = rcAbstracts.entrySet().iterator(); Map.Entry entry; while (i.hasNext()) { entry = (Map.Entry) i.next(); System.out.println("DEBUG-INDEXABSTRACT: hash " + (String) entry.getKey() + ": " + ((query.queryHashes.contains((String) entry.getKey())) ? "NEEDED" : "NOT NEEDED") + "; " + ((TreeMap) entry.getValue()).size() + " entries"); } TreeMap abstractJoin = (rcAbstracts.size() == query.queryHashes.size()) ? kelondroMSetTools.joinConstructive(rcAbstracts.values(), true) : new TreeMap(); if (abstractJoin.size() == 0) { System.out.println("DEBUG-INDEXABSTRACT: no success using index abstracts from remote peers"); } else { System.out.println("DEBUG-INDEXABSTRACT: index abstracts delivered " + abstractJoin.size() + " additional results for secondary search"); // generate query for secondary search TreeMap secondarySearchURLs = new TreeMap(); // a (peerhash:urlhash-liststring) mapping Iterator i1 = abstractJoin.entrySet().iterator(); Map.Entry entry1; String url, urls, peer, peers; String mypeerhash = yacyCore.seedDB.mySeed.hash; boolean mypeerinvolved = false; int mypeercount; while (i1.hasNext()) { entry1 = (Map.Entry) i1.next(); url = (String) entry1.getKey(); peers = (String) entry1.getValue(); System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peers); mypeercount = 0; for (int j = 0; j < peers.length(); j = j + 12) { peer = peers.substring(j, j + 12); if ((peer.equals(mypeerhash)) && (mypeercount++ > 1)) continue; //if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin urls = (String) secondarySearchURLs.get(peer); urls = (urls == null) ? url : urls + url; secondarySearchURLs.put(peer, urls); } if (mypeercount == 1) mypeerinvolved = true; } // compute words for secondary search and start the secondary searches i1 = secondarySearchURLs.entrySet().iterator(); String words; secondarySearchThreads = new yacySearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs.size()]; int c = 0; while (i1.hasNext()) { entry1 = (Map.Entry) i1.next(); peer = (String) entry1.getKey(); if (peer.equals(mypeerhash)) continue; // we dont need to ask ourself urls = (String) entry1.getValue(); words = wordsFromPeer(peer, urls); System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls); System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words); secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch( words, "", urls, wordIndex, rcGlobal, peer, plasmaSwitchboard.urlBlacklist, profileGlobal, ranking, query.constraint, preselectedPeerHashes); } } } private String wordsFromPeer(String peerhash, String urls) { Map.Entry entry; String word, peerlist, url, wordlist = ""; TreeMap urlPeerlist; int p; boolean hasURL; synchronized (rcAbstracts) { Iterator i = rcAbstracts.entrySet().iterator(); while (i.hasNext()) { entry = (Map.Entry) i.next(); word = (String) entry.getKey(); urlPeerlist = (TreeMap) entry.getValue(); hasURL = true; for (int j = 0; j < urls.length(); j = j + 12) { url = urls.substring(j, j + 12); peerlist = (String) urlPeerlist.get(url); p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash); if ((p < 0) || (p % 12 != 0)) { hasURL = false; break; } } if (hasURL) wordlist += word; } } return wordlist; } public void remove(String urlhash) { // removes the url hash reference from last search result indexRWIEntry e = this.sortedResults.remove(urlhash); assert e != null; rcLocal.remove(urlhash); } public void displayed(String urlhash, int position) { this.display.set(position, urlhash); } }