// search.java // (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // You must compile this file with // javac -classpath .:../../Classes search.java // if the shell's current path is htroot/yacy import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.document.Autotagging.Metatag; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.ISO639; import net.yacy.kelondro.util.MemoryControl; import net.yacy.peers.EventChannel; import net.yacy.peers.Network; import net.yacy.peers.Protocol; import net.yacy.peers.Seed; import net.yacy.peers.graphics.ProfilingGraph; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segment; import net.yacy.search.index.Segments; import net.yacy.search.query.AccessTracker; import net.yacy.search.query.QueryParams; import net.yacy.search.query.SearchEvent; import net.yacy.search.query.SearchEventCache; import net.yacy.search.ranking.RankingProfile; import net.yacy.search.snippet.ContentDomain; import net.yacy.search.snippet.ResultEntry; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.tools.crypt; public final class search { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; sb.remoteSearchLastAccess = System.currentTimeMillis(); final serverObjects prop = new serverObjects(); // set nice default values for error cases prop.put("searchtime", "0"); prop.put("references", ""); prop.put("joincount", "0"); prop.put("linkcount", "0"); prop.put("links", ""); prop.put("indexcount", ""); prop.put("indexabstract", ""); prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result prop.put("fwsrc", ""); // peers that helped to construct this result prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations) if (post == null || env == null) return prop; if (!Protocol.authentifyRequest(post, env)) return prop; final String client = header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); //System.out.println("yacy: search received request = " + post.toString()); final String oseed = post.get("myseed", ""); // complete seed of the requesting peer // final String youare = post.get("youare", ""); // seed hash of the target peer, used for testing network stability final String key = post.get("key", ""); // transmission key for response final String query = post.get("query", ""); // a string of word hashes that shall be searched and combined final String exclude= post.get("exclude", "");// a string of word hashes that shall not be within the search result final String urls = post.get("urls", ""); // a string of url hashes that are preselected for the search: no other may be returned final String abstracts = post.get("abstracts", ""); // a string of word hashes for abstracts that shall be generated, or 'auto' (for maxcount-word), or '' (for none) // final String fwdep = post.get("fwdep", ""); // forward depth. if "0" then peer may NOT ask another peer for more results // final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping final int count = Math.min((int) sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXCOUNT_DEFAULT, 100), post.getInt("count", 10)); // maximum number of wanted results final long maxtime = Math.min((int) sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXTIME_DEFAULT, 3000), post.getLong("time", 3000)); // maximum waiting time final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); final String prefer = post.get("prefer", ""); final String modifier = post.get("modifier", "").trim(); final String contentdom = post.get("contentdom", "text"); final String filter = post.get("filter", ".*"); // a filter on the url final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*")); // a filter on the snippet String sitehash = post.get("sitehash", ""); if (sitehash.length() == 0) sitehash = null; String authorhash = post.get("authorhash", ""); if (authorhash.length() == 0) authorhash = null; String language = post.get("language", ""); if (language == null || language.length() == 0 || !ISO639.exists(language)) { // take language from the user agent String agent = header.get("User-Agent"); if (agent == null) agent = System.getProperty("user.language"); language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent); if (language == null) language = "en"; } final int partitions = post.getInt("partitions", 30); String profile = post.get("profile", ""); // remote profile hand-over if (profile.length() > 0) profile = crypt.simpleDecode(profile, null); //final boolean includesnippet = post.get("includesnippet", "false").equals("true"); Bitfield constraint = ((post.containsKey("constraint")) && (post.get("constraint", "").length() > 0)) ? new Bitfield(4, post.get("constraint", "______")) : null; if (constraint != null) { // check bad handover parameter from older versions boolean allon = true; for (int i = 0; i < 32; i++) { if (!constraint.get(i)) {allon = false; break;} } if (allon) constraint = null; } // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time // test: // http://localhost:8090/yacy/search.html?query=4galTpdpDM5Q (search for linux) // http://localhost:8090/yacy/search.html?query=gh8DKIhGKXws (search for book) // http://localhost:8090/yacy/search.html?query=UEhMGfGv2vOE (search for kernel) // http://localhost:8090/yacy/search.html?query=ZX-LjaYo74PP (search for help) // http://localhost:8090/yacy/search.html?query=uDqIalxDfM2a (search for mail) // http://localhost:8090/yacy/search.html?query=4galTpdpDM5Qgh8DKIhGKXws&abstracts=auto (search for linux and book, generate abstract automatically) // http://localhost:8090/yacy/search.html?query=&abstracts=4galTpdpDM5Q (only abstracts for linux) if (sb.isRobinsonMode() && !sb.isPublicRobinson()) { // if we are a robinson cluster, answer only if this client is known by our network definition return prop; } // check the search tracker TreeSet trackerHandles = sb.remoteSearchTracker.get(client); if (trackerHandles == null) trackerHandles = new TreeSet(); boolean block = false; synchronized (trackerHandles) { if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) { block = true; } } if (!block) synchronized (trackerHandles) { if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 12) { block = true; } } if (!block) synchronized (trackerHandles) { if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 36) { block = true; } } if (block && Domains.isLocal(client, null)) block = false; // check isLocal here to prevent dns lookup for client if (block) { return prop; } // tell all threads to do nothing for a specific time sb.intermissionAllThreads(100); EventTracker.delete(EventTracker.EClass.SEARCH); final HandleSet abstractSet = (abstracts.length() == 0 || abstracts.equals("auto")) ? null : QueryParams.hashes2Set(abstracts); // store accessing peer Seed remoteSeed; try { remoteSeed = Seed.genRemoteSeed(oseed, key, false, client); } catch (final IOException e) { Network.log.logInfo("yacy.search: access with bad seed: " + e.getMessage()); remoteSeed = null; } if (sb.peers == null) { Network.log.logSevere("yacy.search: seed cache not initialized"); } else { sb.peers.peerActions.peerArrival(remoteSeed, true); } // prepare search final HandleSet queryhashes = QueryParams.hashes2Set(query); final HandleSet excludehashes = (exclude.length() == 0) ? new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0) : QueryParams.hashes2Set(exclude); final long timestamp = System.currentTimeMillis(); // prepare a search profile final RankingProfile rankingProfile = (profile.length() == 0) ? new RankingProfile(ContentDomain.contentdomParser(contentdom)) : new RankingProfile("", profile); // prepare an abstract result final StringBuilder indexabstract = new StringBuilder(6000); int indexabstractContainercount = 0; int joincount = 0; QueryParams theQuery = null; SearchEvent theSearch = null; ArrayList> accu = null; if (query.length() == 0 && abstractSet != null) { // this is _not_ a normal search, only a request for index abstracts final Segment indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); theQuery = new QueryParams( null, abstractSet, new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0), null, snippetPattern, null, modifier, maxdist, prefer, ContentDomain.contentdomParser(contentdom), language, new HashSet(), "", // no navigation CacheStrategy.CACHEONLY, count, 0, filter, QueryParams.Searchdom.LOCAL, -1, null, false, sitehash, null, authorhash, DigestURI.TLD_any_zone_filter, client, false, indexSegment, rankingProfile, header.get(RequestHeader.USER_AGENT, ""), false ); Network.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); final long timer = System.currentTimeMillis(); //final Map>[] containers = sb.indexSegment.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls)); final TreeMap> incc = indexSegment.termIndex().searchConjunction(theQuery.queryHashes, QueryParams.hashes2Handles(urls)); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theQuery.id(true), SearchEvent.Type.COLLECTION, "", incc.size(), System.currentTimeMillis() - timer), false); if (incc != null) { final Iterator>> ci = incc.entrySet().iterator(); Map.Entry> entry; byte[] wordhash; while (ci.hasNext()) { entry = ci.next(); wordhash = entry.getKey(); final ReferenceContainer container = entry.getValue(); indexabstractContainercount += container.size(); indexabstract.append("indexabstract."); indexabstract.append(ASCII.String(wordhash)); indexabstract.append("="); indexabstract.append(WordReferenceFactory.compressIndex(container, null, 1000).toString()); indexabstract.append(serverCore.CRLF_STRING); } } prop.put("indexcount", ""); prop.put("joincount", "0"); prop.put("references", ""); } else { // retrieve index containers from search request theQuery = new QueryParams( null, queryhashes, excludehashes, null, snippetPattern, null, modifier, maxdist, prefer, ContentDomain.contentdomParser(contentdom), language, new HashSet(), "", // no navigation CacheStrategy.CACHEONLY, count, 0, filter, QueryParams.Searchdom.LOCAL, -1, constraint, false, sitehash, null, authorhash, DigestURI.TLD_any_zone_filter, client, false, sb.indexSegments.segment(Segments.Process.PUBLIC), rankingProfile, header.get(RequestHeader.USER_AGENT, ""), false ); Network.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); EventChannel.channels(EventChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), "")); // make event theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, null, abstracts.length() > 0, sb.loader, count, maxtime, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); // set statistic details of search result and find best result index set joincount = theSearch.getRankingResult().getLocalIndexCount() - theSearch.getRankingResult().getMissCount() - theSearch.getRankingResult().getSortOutCount(); prop.put("joincount", Integer.toString(joincount)); if (joincount != 0) { accu = theSearch.result().completeResults(maxtime); } if (joincount <= 0 || abstracts.length() == 0) { prop.put("indexcount", ""); } else { // attach information about index abstracts final StringBuilder indexcount = new StringBuilder(6000); Map.Entry entry; final Iterator> i = theSearch.abstractsCount(); while (i.hasNext()) { entry = i.next(); indexcount.append("indexcount.").append(ASCII.String(entry.getKey())).append('=').append((entry.getValue()).toString()).append(serverCore.CRLF_STRING); } if (abstractSet != null) { // if a specific index-abstract is demanded, attach it here final Iterator j = abstractSet.iterator(); byte[] wordhash; while (j.hasNext()) { wordhash = j.next(); indexabstractContainercount += theSearch.abstractsCount(wordhash); indexabstract.append("indexabstract.").append(ASCII.String(wordhash)).append("=").append(theSearch.abstractsString(wordhash)).append(serverCore.CRLF_STRING); } } prop.put("indexcount", indexcount.toString()); // generate compressed index for maxcounthash // this is not needed if the search is restricted to specific // urls, because it is a re-search if ((theSearch.getAbstractsMaxCountHash() == null) || (urls.length() != 0) || (queryhashes.size() <= 1) || (abstracts.length() == 0)) { prop.put("indexabstract", ""); } else if (abstracts.equals("auto")) { // automatically attach the index abstract for the index that has the most references. This should be our target dht position indexabstractContainercount += theSearch.abstractsCount(theSearch.getAbstractsMaxCountHash()); indexabstract.append("indexabstract.").append(ASCII.String(theSearch.getAbstractsMaxCountHash())).append("=").append(theSearch.abstractsString(theSearch.getAbstractsMaxCountHash())).append(serverCore.CRLF_STRING); if ((theSearch.getAbstractsNearDHTHash() != null) && (!(ByteBuffer.equals(theSearch.getAbstractsNearDHTHash(), theSearch.getAbstractsMaxCountHash())))) { // in case that the neardhthash is different from the maxcounthash attach also the neardhthash-container indexabstractContainercount += theSearch.abstractsCount(theSearch.getAbstractsNearDHTHash()); indexabstract.append("indexabstract.").append(ASCII.String(theSearch.getAbstractsNearDHTHash())).append("=").append(theSearch.abstractsString(theSearch.getAbstractsNearDHTHash())).append(serverCore.CRLF_STRING); } //System.out.println("DEBUG-ABSTRACTGENERATION: maxcounthash = " + maxcounthash); //System.out.println("DEBUG-ABSTRACTGENERATION: neardhthash = "+ neardhthash); //yacyCore.log.logFine("DEBUG HASH SEARCH: " + indexabstract); } } if (partitions > 0) sb.searchQueriesGlobal += 1d / partitions; // increase query counter // prepare reference hints final long timer = System.currentTimeMillis(); final ScoreMap topicNavigator = theSearch.getTopicNavigator(5); final StringBuilder refstr = new StringBuilder(6000); final Iterator navigatorIterator = topicNavigator.keys(false); int i = 0; String name; while (i < 5 && navigatorIterator.hasNext()) { name = navigatorIterator.next(); refstr.append(",").append(name); i++; } prop.put("references", (refstr.length() > 0) ? refstr.substring(1) : refstr.toString()); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theQuery.id(true), SearchEvent.Type.REFERENCECOLLECTION, "", i, System.currentTimeMillis() - timer), false); } prop.put("indexabstract", indexabstract.toString()); // prepare result if (joincount == 0 || accu == null || accu.isEmpty()) { // no results prop.put("links", ""); prop.put("linkcount", "0"); prop.put("references", ""); } else { // result is a List of urlEntry elements final long timer = System.currentTimeMillis(); final StringBuilder links = new StringBuilder(6000); String resource = null; WeakPriorityBlockingQueue.Element entry; for (int i = 0; i < accu.size(); i++) { entry = accu.get(i); resource = entry.getElement().resource(); if (resource != null) { links.append("resource").append(i).append('=').append(resource).append(serverCore.CRLF_STRING); } } theQuery.transmitcount = accu.size() + 1; prop.put("links", links.toString()); prop.put("linkcount", accu.size()); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theQuery.id(true), SearchEvent.Type.RESULTLIST, "", accu.size(), System.currentTimeMillis() - timer), false); } // prepare search statistics theQuery.remotepeer = client == null ? null : sb.peers.lookupByIP(Domains.dnsResolve(client), -1, true, false, false); theQuery.resultcount = (theSearch == null) ? 0 : joincount; theQuery.searchtime = System.currentTimeMillis() - timestamp; theQuery.urlretrievaltime = (theSearch == null) ? 0 : theSearch.result().getURLRetrievalTime(); theQuery.snippetcomputationtime = (theSearch == null) ? 0 : theSearch.result().getSnippetComputationTime(); AccessTracker.add(AccessTracker.Location.remote, theQuery); // update the search tracker synchronized (trackerHandles) { trackerHandles.add(theQuery.time); // thats the time when the handle was created // we don't need too much entries in the list; remove superfluous while (trackerHandles.size() > 36) if (!trackerHandles.remove(trackerHandles.first())) break; } sb.remoteSearchTracker.put(client, trackerHandles); if (MemoryControl.shortStatus()) sb.remoteSearchTracker.clear(); // log Network.log.logInfo("EXIT HASH SEARCH: " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + joincount + " links found, " + prop.get("linkcount", "?") + " links selected, " + indexabstractContainercount + " index abstracts, " + (System.currentTimeMillis() - timestamp) + " milliseconds"); prop.put("searchtime", System.currentTimeMillis() - timestamp); final int links = prop.getInt("linkcount",0); sb.peers.mySeed().incSI(links); sb.peers.mySeed().incSU(links); return prop; } }