From cf9884e22b03c7431dcebf9aa22b84069ac78249 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 13 Sep 2006 17:13:28 +0000 Subject: [PATCH] first attempt to implement a secondary search this is a set of search processes that shall enrich search results with specialized requests to realize a combination of search results from different peers. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2571 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/search.java | 15 +- .../de/anomic/kelondro/kelondroMSetTools.java | 166 +++++++++--------- source/de/anomic/plasma/plasmaGrafics.java | 24 ++- .../de/anomic/plasma/plasmaSearchEvent.java | 158 ++++++++++++----- .../de/anomic/plasma/plasmaSearchQuery.java | 32 +++- .../plasma/plasmaSearchRankingProfile.java | 4 +- source/de/anomic/yacy/yacyClient.java | 9 +- source/de/anomic/yacy/yacySearch.java | 31 +++- 8 files changed, 280 insertions(+), 159 deletions(-) diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 1951b1609..82446c522 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -47,14 +47,12 @@ // javac -classpath .:../../Classes search.java // if the shell's current path is htroot/yacy -import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import de.anomic.http.httpHeader; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchEvent; @@ -108,10 +106,7 @@ public final class search { } // prepare search - final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength); - for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) { - keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); - } + final Set keyhashes = plasmaSearchQuery.hashes2Set(query); final long timestamp = System.currentTimeMillis(); plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, filter); @@ -129,11 +124,7 @@ public final class search { // retrieve index containers from search request plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache); - Set urlselection = null; - if ((urls.length() > 0) && (urls.length() % 12 == 0)) { - for (int i = 0; i < (urls.length() / 12); i++) urlselection.add(urls.substring(i * 12, (i + 1 * 12))); - } - Map containers = theSearch.localSearchContainers(urlselection); + Map containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls)); // set statistic details of search result and find best result index set String maxcounthash = null, neardhthash = null; @@ -168,7 +159,7 @@ public final class search { indexContainer localResults = theSearch.localSearchJoin(containers.values()); int joincount = localResults.size(); prop.put("joincount", Integer.toString(joincount)); - plasmaSearchResult acc = theSearch.order(localResults); + plasmaSearchResult acc = theSearch.orderFinal(localResults); // generate compressed index for maxcounthash // this is not needed if the search is restricted to specific urls, because it is a re-search diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java index 9d70e0f03..dc2bc4006 100644 --- a/source/de/anomic/kelondro/kelondroMSetTools.java +++ b/source/de/anomic/kelondro/kelondroMSetTools.java @@ -48,8 +48,8 @@ import java.io.FileInputStream; import java.util.Collection; import java.util.Comparator; import java.util.Iterator; -import java.util.Map; import java.util.Set; +import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; @@ -85,7 +85,7 @@ public class kelondroMSetTools { // - join by iterative tests (where we distinguish left-right and right-left tests) - public static TreeMap joinConstructive(Collection maps) { + public static TreeMap joinConstructive(Collection maps, boolean concatStrings) { // this joins all TreeMap(s) contained in maps // first order entities by their size @@ -116,7 +116,7 @@ public class kelondroMSetTools { k = (Long) orderMap.firstKey(); // the next smallest... mapA = joinResult; mapB = (TreeMap) orderMap.remove(k); - joinResult = joinConstructiveByTestSetInMap(mapB, mapA.keySet()); + joinResult = joinConstructiveByTest(mapA, mapB, concatStrings); // free resources mapA = null; mapB = null; @@ -127,72 +127,63 @@ public class kelondroMSetTools { return joinResult; } - public static TreeMap joinConstructive(TreeMap map, TreeSet set) { - // comparators must be equal - if ((map == null) || (set == null)) return null; - if (map.comparator() != set.comparator()) return null; - if ((map.size() == 0) || (set.size() == 0)) return new TreeMap(map.comparator()); - - // decide which method to use - int high = ((map.size() > set.size()) ? map.size() : set.size()); - int low = ((map.size() > set.size()) ? set.size() : map.size()); - int stepsEnum = 10 * (high + low - 1); - int stepsTest = 12 * log2a(high) * low; + public static TreeMap joinConstructive(TreeMap map1, TreeMap map2, boolean concatStrings) { + // comparators must be equal + if ((map1 == null) || (map2 == null)) return null; + if (map1.comparator() != map2.comparator()) return null; + if ((map1.size() == 0) || (map2.size() == 0)) return new TreeMap(map1.comparator()); - // start most efficient method - if (stepsEnum > stepsTest) { - if (map.size() > set.size()) return joinConstructiveByTestSetInMap(map, set); - return joinConstructiveByTestMapInSet(map, set); - } - return joinConstructiveByEnumeration(map, set); - } + // decide which method to use + int high = ((map1.size() > map2.size()) ? map1.size() : map2.size()); + int low = ((map1.size() > map2.size()) ? map2.size() : map1.size()); + int stepsEnum = 10 * (high + low - 1); + int stepsTest = 12 * log2a(high) * low; - private static TreeMap joinConstructiveByTestSetInMap(TreeMap map, Set set) { - Iterator si = set.iterator(); - TreeMap result = new TreeMap(map.comparator()); - Object o; - while (si.hasNext()) { - o = si.next(); - if (map.containsKey(o)) result.put(o, map.get(o)); - } - return result; + // start most efficient method + if (stepsEnum > stepsTest) { + if (map1.size() > map2.size()) return joinConstructiveByTest(map2, map1, concatStrings); + return joinConstructiveByTest(map1, map2, concatStrings); + } + return joinConstructiveByEnumeration(map1, map2, concatStrings); } - - private static TreeMap joinConstructiveByTestMapInSet(Map map, TreeSet set) { - Iterator mi = map.keySet().iterator(); - TreeMap result = new TreeMap(set.comparator()); - Object o; - while (mi.hasNext()) { - o = mi.next(); - if (set.contains(o)) result.put(o, map.get(o)); - } - return result; + + private static TreeMap joinConstructiveByTest(TreeMap small, TreeMap large, boolean concatStrings) { + Iterator mi = small.entrySet().iterator(); + TreeMap result = new TreeMap(large.comparator()); + Map.Entry mentry1; + Object mobj2; + while (mi.hasNext()) { + mentry1 = (Map.Entry) mi.next(); + mobj2 = large.get(mentry1.getKey()); + if (mobj2 != null) result.put(mentry1.getKey(), (concatStrings) ? ((String) mentry1.getValue() + (String) mobj2) : mentry1.getValue()); + } + return result; } - private static TreeMap joinConstructiveByEnumeration(TreeMap map, TreeSet set) { - // implement pairvise enumeration - Comparator comp = map.comparator(); - Iterator mi = map.keySet().iterator(); - Iterator si = set.iterator(); - TreeMap result = new TreeMap(map.comparator()); - int c; - if ((mi.hasNext()) && (si.hasNext())) { - Object mobj = mi.next(); - Object sobj = si.next(); - while (true) { - c = compare(mobj, sobj, comp); - if (c < 0) { - if (mi.hasNext()) mobj = mi.next(); else break; - } else if (c > 0) { - if (si.hasNext()) sobj = si.next(); else break; - } else { - result.put(mobj, map.get(mobj)); - if (mi.hasNext()) mobj = mi.next(); else break; - if (si.hasNext()) sobj = si.next(); else break; - } - } - } - return result; + private static TreeMap joinConstructiveByEnumeration(TreeMap map1, TreeMap map2, boolean concatStrings) { + // implement pairvise enumeration + Comparator comp = map1.comparator(); + Iterator mi1 = map1.entrySet().iterator(); + Iterator mi2 = map2.entrySet().iterator(); + TreeMap result = new TreeMap(map1.comparator()); + int c; + if ((mi1.hasNext()) && (mi2.hasNext())) { + Map.Entry mentry1 = (Map.Entry) mi1.next(); + Map.Entry mentry2 = (Map.Entry) mi2.next(); + while (true) { + c = compare(mentry1.getKey(), mentry2.getKey(), comp); + if (c < 0) { + if (mi1.hasNext()) mentry1 = (Map.Entry) mi1.next(); else break; + } else if (c > 0) { + if (mi2.hasNext()) mentry2 = (Map.Entry) mi2.next(); else break; + } else { + result.put(mentry1.getKey(), (concatStrings) ? ((String) mentry1.getValue() + (String) mentry2.getValue()) : mentry1.getValue()); + if (mi1.hasNext()) mentry1 = (Map.Entry) mi1.next(); else break; + if (mi2.hasNext()) mentry2 = (Map.Entry) mi2.next(); else break; + } + } + } + return result; } // now the same for set-set @@ -268,7 +259,7 @@ public class kelondroMSetTools { // return excludeConstructiveByEnumeration(map, set); } - private static TreeMap excludeConstructiveByTestMapInSet(TreeMap map, TreeSet set) { + private static TreeMap excludeConstructiveByTestMapInSet(TreeMap map, Set set) { Iterator mi = map.keySet().iterator(); TreeMap result = new TreeMap(map.comparator()); Object o; @@ -279,7 +270,8 @@ public class kelondroMSetTools { return result; } - private static TreeMap excludeConstructiveByEnumeration(TreeMap map, TreeSet set) { + /* + private static TreeMap excludeConstructiveByEnumeration(TreeMap map, TreeSet set) { // returns map without the elements in set // enumerates objects Comparator comp = map.comparator(); @@ -317,7 +309,7 @@ public class kelondroMSetTools { } return result; } - + */ public static void excludeDestructive(TreeMap map, TreeSet set) { // comparators must be equal if (map == null) return; @@ -411,7 +403,7 @@ public class kelondroMSetTools { public static void main(String[] args) { TreeMap m = new TreeMap(); - TreeSet s = new TreeSet(); + TreeMap s = new TreeMap(); m.put("a", "a"); m.put("x", "x"); m.put("f", "f"); @@ -422,26 +414,26 @@ public class kelondroMSetTools { m.put("k", "k"); m.put("y", "y"); m.put("z", "z"); - s.add("a"); - s.add("b"); - s.add("c"); - s.add("k"); - s.add("l"); - s.add("m"); - s.add("n"); - s.add("o"); - s.add("p"); - s.add("q"); - s.add("r"); - s.add("s"); - s.add("t"); - s.add("x"); + s.put("a", "a"); + s.put("b", "b"); + s.put("c", "c"); + s.put("k", "k"); + s.put("l", "l"); + s.put("m", "m"); + s.put("n", "n"); + s.put("o", "o"); + s.put("p", "p"); + s.put("q", "q"); + s.put("r", "r"); + s.put("s", "s"); + s.put("t", "t"); + s.put("x", "x"); System.out.println("Compare " + m.toString() + " with " + s.toString()); - System.out.println("Join=" + joinConstructiveByEnumeration(m, s)); - System.out.println("Join=" + joinConstructiveByTestMapInSet(m, s)); - System.out.println("Join=" + joinConstructiveByTestSetInMap(m, s)); - System.out.println("Join=" + joinConstructive(m, s)); - System.out.println("Exclude=" + excludeConstructiveByEnumeration(m, s)); + System.out.println("Join=" + joinConstructiveByEnumeration(m, s, true)); + System.out.println("Join=" + joinConstructiveByTest(m, s, true)); + System.out.println("Join=" + joinConstructiveByTest(m, s, true)); + System.out.println("Join=" + joinConstructive(m, s, true)); + System.out.println("Exclude=" + excludeConstructiveByTestMapInSet(m, s.keySet())); /* for (int low = 0; low < 10; low++) diff --git a/source/de/anomic/plasma/plasmaGrafics.java b/source/de/anomic/plasma/plasmaGrafics.java index 3340b7a5b..5c27d1f14 100644 --- a/source/de/anomic/plasma/plasmaGrafics.java +++ b/source/de/anomic/plasma/plasmaGrafics.java @@ -67,8 +67,9 @@ public class plasmaGrafics { public static ymagePainter getSearchEventPicture() { if (plasmaSearchEvent.lastEvent == null) return null; - yacySearch[] searches = plasmaSearchEvent.lastEvent.getSearchThreads(); - if (searches == null) return null; // this was a local search and there are no threads + yacySearch[] primarySearches = plasmaSearchEvent.lastEvent.getPrimarySearchThreads(); + yacySearch[] secondarySearches = plasmaSearchEvent.lastEvent.getSecondarySearchThreads(); + if (primarySearches == null) return null; // this was a local search and there are no threads // get a copy of a recent network picture ymagePainter eventPicture = getNetworkPicture(120000); @@ -82,14 +83,25 @@ public class plasmaGrafics { String hash; int angle; - // draw in the search peers - for (int j = 0; j < searches.length; j++) { - eventPicture.setColor((searches[j].isAlive()) ? ymageMatrix.ADDITIVE_RED : ymageMatrix.ADDITIVE_GREEN); - hash = searches[j].target().hash; + // draw in the primary search peers + for (int j = 0; j < primarySearches.length; j++) { + eventPicture.setColor((primarySearches[j].isAlive()) ? ymageMatrix.ADDITIVE_RED : ymageMatrix.ADDITIVE_GREEN); + hash = primarySearches[j].target().hash; angle = (int) ((long) 360 * (yacySeed.dhtPosition(hash) / (yacySeed.maxDHTDistance / (long) 10000)) / (long) 10000); eventPicture.arcLine(cx, cy, cr - 20, cr, angle); } + // draw in the secondary search peers + if (secondarySearches != null) { + for (int j = 0; j < secondarySearches.length; j++) { + eventPicture.setColor((secondarySearches[j].isAlive()) ? ymageMatrix.ADDITIVE_RED : ymageMatrix.ADDITIVE_GREEN); + hash = secondarySearches[j].target().hash; + angle = (int) ((long) 360 * (yacySeed.dhtPosition(hash) / (yacySeed.maxDHTDistance / (long) 10000)) / (long) 10000); + eventPicture.arcLine(cx, cy, cr - 10, cr, angle - 1); + eventPicture.arcLine(cx, cy, cr - 10, cr, angle + 1); + } + } + // draw in the search target plasmaSearchQuery query = plasmaSearchEvent.lastEvent.getQuery(); Iterator i = query.queryHashes.iterator(); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 8ce165d7d..ec125b2de 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -73,7 +73,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation private plasmaSearchTimingProfile profileLocal, profileGlobal; private boolean postsort; - private yacySearch[] searchThreads; + private yacySearch[] primarySearchThreads, secondarySearchThreads; public plasmaSearchEvent(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, @@ -96,7 +96,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.profileLocal = localTiming; this.profileGlobal = remoteTiming; this.postsort = postsort; - this.searchThreads = null; + this.primarySearchThreads = null; + this.secondarySearchThreads = null; } public plasmaSearchQuery getQuery() { @@ -107,8 +108,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable { return profileLocal; } - public yacySearch[] getSearchThreads() { - return searchThreads; + public yacySearch[] getPrimarySearchThreads() { + return primarySearchThreads; + } + public yacySearch[] getSecondarySearchThreads() { + return secondarySearchThreads; } public plasmaSearchResult search() { @@ -134,7 +138,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { long secondaryTimeout = System.currentTimeMillis() + profileGlobal.duetime() / 2; long primaryTimeout = System.currentTimeMillis() + profileGlobal.duetime(); - searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcContainers, rcAbstracts, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); + primarySearchThreads = yacySearch.primaryRemoteSearches(plasmaSearchQuery.hashSet2hashString(query.queryHashes), "", + query.prefer, query.urlMask, query.maxDistance, urlStore, rcContainers, rcAbstracts, + fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); // meanwhile do a local search Map searchContainerMap = localSearchContainers(null); @@ -144,35 +150,16 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // evaluate index abstracts and start a secondary search // this is temporary debugging code to learn that the index abstracts are fetched correctly while (System.currentTimeMillis() < secondaryTimeout + 10000) { - if (yacySearch.remainingWaiting(searchThreads) == 0) break; // all threads have finished + if (yacySearch.remainingWaiting(primarySearchThreads) == 0) break; // all threads have finished try {Thread.sleep(100);} catch (InterruptedException e) {} } - System.out.println("DEBUG-INDEXABSTRACT: " + rcAbstracts.size() + " word references catched, " + query.size() + " needed"); - /* - Iterator i = rcAbstracts.entrySet().iterator(); - Map.Entry entry; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - System.out.println("DEBUG-INDEXABSTRACT: hash " + (String) entry.getKey() + ": " + ((query.queryHashes.contains((String) entry.getKey())) ? "NEEDED" : "NOT NEEDED") + "; " + ((TreeMap) entry.getValue()).size() + " entries"); - } - */ - TreeMap abstractJoin = (rcAbstracts.size() == query.size()) ? kelondroMSetTools.joinConstructive(rcAbstracts.values()) : new TreeMap(); - if (abstractJoin.size() == 0) { - System.out.println("DEBUG-INDEXABSTRACT: no success using index abstracts from remote peers"); - } else { - System.out.println("DEBUG-INDEXABSTRACT: index abstracts delivered " + abstractJoin.size() + " additional results for secondary search"); - Iterator i = abstractJoin.entrySet().iterator(); - Map.Entry entry; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - System.out.println("DEBUG-INDEXABSTRACT: url " + (String) entry.getKey() + ": from peers " + (String) entry.getValue()); - } - } + prepareSecondarySearch(); // catch up global results: // wait until primary timeout passed while (System.currentTimeMillis() < primaryTimeout) { - if (yacySearch.remainingWaiting(searchThreads) == 0) break; // all threads have finished + if ((yacySearch.remainingWaiting(primarySearchThreads) == 0) && + ((secondarySearchThreads == null) || (yacySearch.remainingWaiting(secondarySearchThreads) == 0))) break; // all threads have finished try {Thread.sleep(100);} catch (InterruptedException e) {} } int globalContributions = rcContainers.size(); @@ -181,7 +168,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); // combine the result and order - plasmaSearchResult result = ((globalContributions == 0) && (localResult.sizeOrdered() != 0)) ? localResult : order(rcLocal); + plasmaSearchResult result = ((globalContributions == 0) && (localResult.sizeOrdered() != 0)) ? localResult : orderFinal(rcLocal); result.globalContributions = globalContributions; result.localContributions = rcLocal.size(); @@ -195,7 +182,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { } else { Map searchContainerMap = localSearchContainers(null); indexContainer rcLocal = localSearchJoin((searchContainerMap == null) ? null : searchContainerMap.values()); - plasmaSearchResult result = order(rcLocal); + plasmaSearchResult result = orderFinal(rcLocal); result.localContributions = rcLocal.size(); // return search result @@ -206,6 +193,91 @@ public final class plasmaSearchEvent extends Thread implements Runnable { } } + private void prepareSecondarySearch() { + // catch up index abstracts and join them; then call peers again to submit their urls + System.out.println("DEBUG-INDEXABSTRACT: " + rcAbstracts.size() + " word references catched, " + query.size() + " needed"); + + if (rcAbstracts.size() != query.size()) return; // secondary search not possible + + Iterator i = rcAbstracts.entrySet().iterator(); + Map.Entry entry; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + System.out.println("DEBUG-INDEXABSTRACT: hash " + (String) entry.getKey() + ": " + ((query.queryHashes.contains((String) entry.getKey())) ? "NEEDED" : "NOT NEEDED") + "; " + ((TreeMap) entry.getValue()).size() + " entries"); + } + + TreeMap abstractJoin = (rcAbstracts.size() == query.size()) ? kelondroMSetTools.joinConstructive(rcAbstracts.values(), true) : new TreeMap(); + if (abstractJoin.size() == 0) { + System.out.println("DEBUG-INDEXABSTRACT: no success using index abstracts from remote peers"); + } else { + System.out.println("DEBUG-INDEXABSTRACT: index abstracts delivered " + abstractJoin.size() + " additional results for secondary search"); + // generate query for secondary search + TreeMap secondarySearchURLs = new TreeMap(); // a (peerhash:urlhash-liststring) mapping + Iterator i1 = abstractJoin.entrySet().iterator(); + Map.Entry entry1; + String url, urls, peer, peers; + while (i1.hasNext()) { + entry1 = (Map.Entry) i1.next(); + url = (String) entry1.getKey(); + peers = (String) entry1.getValue(); + System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peers); + for (int j = 0; j < peers.length(); j = j + 12) { + peer = peers.substring(j, j + 12); + if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin + urls = (String) secondarySearchURLs.get(peer); + urls = (urls == null) ? url : urls + url; + secondarySearchURLs.put(peer, urls); + } + } + + // compute words for secondary search and start the secondary searches + i1 = secondarySearchURLs.entrySet().iterator(); + String words; + secondarySearchThreads = new yacySearch[secondarySearchURLs.size()]; + int c = 0; + while (i1.hasNext()) { + entry1 = (Map.Entry) i1.next(); + peer = (String) entry1.getKey(); + urls = (String) entry1.getValue(); + words = wordsFromPeer(peer, urls); + System.out.println("DEBUG-INDEXABSTRACT: peer " + peer + " has urls: " + urls); + System.out.println("DEBUG-INDEXABSTRACT: peer " + peer + " from words: " + words); + secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch( + words, urls, urlStore, rcContainers, peer, plasmaSwitchboard.urlBlacklist, snippetCache, + profileGlobal, ranking); + + } + } + } + + private String wordsFromPeer(String peerhash, String urls) { + Map.Entry entry; + String word, peerlist, url, wordlist = ""; + TreeMap urlPeerlist; + int p; + boolean hasURL; + synchronized (rcAbstracts) { + Iterator i = rcAbstracts.entrySet().iterator(); + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + word = (String) entry.getKey(); + urlPeerlist = (TreeMap) entry.getValue(); + hasURL = true; + for (int j = 0; j < urls.length(); j = j + 12) { + url = urls.substring(j, j + 12); + peerlist = (String) urlPeerlist.get(url); + p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash); + if ((p < 0) || (p % 12 != 0)) { + hasURL = false; + break; + } + } + if (hasURL) wordlist += word; + } + } + return wordlist; + } + public Map localSearchContainers(Set urlselection) { // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result @@ -243,7 +315,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { return rcLocal; } - public plasmaSearchResult order(indexContainer rcLocal) { + public plasmaSearchResult orderFinal(indexContainer rcLocal) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime @@ -263,6 +335,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // start url-fetch long postorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_POSTSORT); + System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime); long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime); profileLocal.startTimer(); plasmaSearchResult acc = new plasmaSearchResult(query, ranking); @@ -307,20 +380,17 @@ public final class plasmaSearchEvent extends Thread implements Runnable { return acc; } - private plasmaSearchResult orderLocal(indexContainer rcLocal, long maxtime) { + private plasmaSearchResult orderLocal(indexContainer rcLocal, long timeout) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime profileLocal.startTimer(); - if (maxtime < 0) maxtime = 200; - plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, maxtime); + plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis()); preorder.remove(true, true); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size()); // start url-fetch - maxtime = Math.max(200, maxtime - profileLocal.getYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT)); - long postorderLimitTime = System.currentTimeMillis() + maxtime; profileLocal.startTimer(); plasmaSearchResult acc = new plasmaSearchResult(query, ranking); @@ -330,7 +400,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { Object[] preorderEntry; try { while (preorder.hasNext()) { - if (System.currentTimeMillis() >= postorderLimitTime) break; + if (System.currentTimeMillis() >= timeout) break; preorderEntry = preorder.next(); entry = (indexEntry) preorderEntry[0]; preranking = (Long) preorderEntry[1]; @@ -368,15 +438,21 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // it is wise to call this within a separate thread because // this method waits until all threads are finished - int remaining; + int remaining = 0; + if (primarySearchThreads == null) return; long starttime = System.currentTimeMillis(); - while ((searchThreads != null) && ((remaining = yacySearch.remainingWaiting(searchThreads)) > 0)) { + while (true) { + remaining = yacySearch.remainingWaiting(primarySearchThreads); + if (secondarySearchThreads != null) remaining += yacySearch.remainingWaiting(secondarySearchThreads); + if (remaining == 0) break; + flushGlobalResults(); // wait a little bit before trying again - try {Thread.sleep(3000);} catch (InterruptedException e) {} + try {Thread.sleep(1000);} catch (InterruptedException e) {} if (System.currentTimeMillis() - starttime > 90000) { - yacySearch.interruptAlive(searchThreads); + yacySearch.interruptAlive(primarySearchThreads); + if (secondarySearchThreads != null) yacySearch.interruptAlive(secondarySearchThreads); log.logFine("SEARCH FLUSH: " + remaining + " PEERS STILL BUSY; ABANDONED; SEARCH WAS " + query.queryWords); break; } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 9bf829242..1e65779e9 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -42,6 +42,7 @@ package de.anomic.plasma; +import java.util.HashSet; import java.util.Set; import java.util.TreeSet; import java.util.Iterator; @@ -59,8 +60,7 @@ public final class plasmaSearchQuery { public static final int SEARCHDOM_GLOBALDHT = 3; public static final int SEARCHDOM_GLOBALALL = 4; - public Set queryWords; - public Set queryHashes; + public Set queryWords, queryHashes; public int wantedResults; public String prefer; public long maximumTime; @@ -99,12 +99,18 @@ public final class plasmaSearchQuery { this.domMaxTargets = -1; } - public static Set words2hashes(String[] words) { + public static Set words2hashSet(String[] words) { TreeSet hashes = new TreeSet(); for (int i = 0; i < words.length; i++) hashes.add(indexEntryAttribute.word2hash(words[i])); return hashes; } + public static String words2hashString(String[] words) { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < words.length; i++) sb.append(indexEntryAttribute.word2hash(words[i])); + return new String(sb); + } + public static Set words2hashes(Set words) { Iterator i = words.iterator(); TreeSet hashes = new TreeSet(); @@ -112,6 +118,22 @@ public final class plasmaSearchQuery { return hashes; } + public static Set hashes2Set(String query) { + if (query == null) return new HashSet(); + final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength); + for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) { + keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); + } + return keyhashes; + } + + public static String hashSet2hashString(Set words) { + Iterator i = words.iterator(); + StringBuffer sb = new StringBuffer(words.size() * indexEntryAttribute.wordHashLength); + while (i.hasNext()) sb.append((String) i.next()); + return new String(sb); + } + public static TreeSet cleanQuery(String words) { // convert Umlaute words = htmlFilterAbstractScraper.convertUmlaute(new serverByteBuffer(words.getBytes())).toString(); @@ -148,6 +170,7 @@ public final class plasmaSearchQuery { return result.toString(); } + /* public String hashes(String separator) { StringBuffer result = new StringBuffer(8 * queryHashes.size()); Iterator i = queryHashes.iterator(); @@ -158,7 +181,8 @@ public final class plasmaSearchQuery { } return result.toString(); } - + */ + public void filterOut(Set blueList) { // filter out words that appear in this set Iterator it = queryWords.iterator(); diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 02b6faa95..0f04ab5ec 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -209,8 +209,8 @@ public class plasmaSearchRankingProfile { } // apply query-in-result matching - Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps); - Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps); + Set urlcomph = plasmaSearchQuery.words2hashSet(urlcomps); + Set descrcomph = plasmaSearchQuery.words2hashSet(descrcomps); Iterator shi = query.queryHashes.iterator(); String queryhash; while (shi.hasNext()) { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index b33f6ee17..0c020f0ed 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -366,6 +366,7 @@ public final class yacyClient { public static int search( String wordhashes, + String urlhashes, String prefer, String filter, int maxDistance, @@ -422,6 +423,7 @@ public final class yacyClient { obj.put("count", timingProfile.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT)); obj.put("resource", ((global) ? "global" : "local")); obj.put("query", wordhashes); + obj.put("urls", urlhashes); obj.put("prefer", prefer); obj.put("filter", filter); obj.put("ttl", "0"); @@ -448,6 +450,11 @@ public final class yacyClient { ) ); + if (result.size() == 0) { + yacyCore.log.logFine("SEARCH failed FROM " + targetPeer.hash + ":" + targetPeer.getName() + ", score=" + targetPeer.selectscore + ", DHTdist=" + yacyDHTAction.dhtDistance(targetPeer.hash, wordhashes)); + return 0; + } + // compute all computation times final long totalrequesttime = System.currentTimeMillis() - timestamp; String returnProfile = (String) result.get("profile"); @@ -470,7 +477,7 @@ public final class yacyClient { // references : references (search hints) that was calculated during search // now create a plasmaIndex out of this result - //System.out.println("yacyClient: search result = " + result.toString()); // debug + System.out.println("yacyClient: " + ((urlhashes.length() == 0) ? "primary" : "secondary")+ " search result = " + result.toString()); // debug final int results = Integer.parseInt((String) result.get("count")); //System.out.println("***result count " + results); diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 32b0adfc2..b404f3a3e 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -48,10 +48,12 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import de.anomic.index.indexContainer; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSnippetCache; @@ -60,7 +62,7 @@ import de.anomic.server.logging.serverLog; public class yacySearch extends Thread { - final private Set wordhashes; + final private String wordhashes, urlhashes; final private boolean global; final private plasmaCrawlLURL urlManager; final private indexContainer containerCache; @@ -74,13 +76,14 @@ public class yacySearch extends Thread { final private plasmaSearchRankingProfile rankingProfile; final private String prefer, filter; - public yacySearch(Set wordhashes, String prefer, String filter, int maxDistance, + public yacySearch(String wordhashes, String urlhashes, String prefer, String filter, int maxDistance, boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, indexContainer containerCache, Map abstractCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { super("yacySearch_" + targetPeer.getName()); this.wordhashes = wordhashes; + this.urlhashes = urlhashes; this.prefer = prefer; this.filter = filter; this.global = global; @@ -97,7 +100,7 @@ public class yacySearch extends Thread { } public void run() { - this.links = yacyClient.search(set2string(wordhashes), prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); + this.links = yacyClient.search(wordhashes, urlhashes, prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); if (links != 0) { //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes); yacyCore.seedDB.mySeed.incRI(links); @@ -186,7 +189,7 @@ public class yacySearch extends Thread { return result; } - public static yacySearch[] searchHashes(Set wordhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, + public static yacySearch[] primaryRemoteSearches(String wordhashes, String urlhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, indexContainer containerCache, Map abstractCache, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { @@ -195,13 +198,13 @@ public class yacySearch extends Thread { // prepare seed targets and threads //Set wordhashes = plasmaSearch.words2hashes(querywords); - final yacySeed[] targetPeers = selectPeers(wordhashes, targets); + final yacySeed[] targetPeers = selectPeers(plasmaSearchQuery.hashes2Set(wordhashes), targets); if (targetPeers == null) return null; targets = targetPeers.length; if (targets == 0) return null; yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { - searchThreads[i]= new yacySearch(wordhashes, prefer, filter, maxDist, true, targetPeers[i], + searchThreads[i]= new yacySearch(wordhashes, urlhashes, prefer, filter, maxDist, true, targetPeers[i], urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); searchThreads[i].start(); //try {Thread.sleep(20);} catch (InterruptedException e) {} @@ -209,6 +212,22 @@ public class yacySearch extends Thread { return searchThreads; } + public static yacySearch secondaryRemoteSearch(String wordhashes, String urlhashes, plasmaCrawlLURL urlManager, indexContainer containerCache, + String targethash, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, + plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { + // check own peer status + if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; } + + // prepare seed targets and threads + //Set wordhashes = plasmaSearch.words2hashes(querywords); + final yacySeed targetPeer = yacyCore.seedDB.getConnected(targethash); + if (targetPeer == null) return null; + yacySearch searchThread = new yacySearch(wordhashes, urlhashes, "", "", 9999, true, targetPeer, + urlManager, containerCache, new TreeMap(), blacklist, snippetCache, timingProfile, rankingProfile); + searchThread.start(); + return searchThread; + } + public static int remainingWaiting(yacySearch[] searchThreads) { if (searchThreads == null) return 0; int alive = 0;