first attempt to implement a secondary search

this is a set of search processes that enrich search results
with specialized follow-up requests, combining search results
from different peers.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2571 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 2a06ce5538
commit cf9884e22b

@ -47,14 +47,12 @@
// javac -classpath .:../../Classes search.java // javac -classpath .:../../Classes search.java
// if the shell's current path is htroot/yacy // if the shell's current path is htroot/yacy
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchEvent;
@ -108,10 +106,7 @@ public final class search {
} }
// prepare search // prepare search
final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength); final Set keyhashes = plasmaSearchQuery.hashes2Set(query);
for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) {
keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength));
}
final long timestamp = System.currentTimeMillis(); final long timestamp = System.currentTimeMillis();
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, filter); plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, filter);
@ -129,11 +124,7 @@ public final class search {
// retrieve index containers from search request // retrieve index containers from search request
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache); plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
Set urlselection = null; Map containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls));
if ((urls.length() > 0) && (urls.length() % 12 == 0)) {
for (int i = 0; i < (urls.length() / 12); i++) urlselection.add(urls.substring(i * 12, (i + 1 * 12)));
}
Map containers = theSearch.localSearchContainers(urlselection);
// set statistic details of search result and find best result index set // set statistic details of search result and find best result index set
String maxcounthash = null, neardhthash = null; String maxcounthash = null, neardhthash = null;
@ -168,7 +159,7 @@ public final class search {
indexContainer localResults = theSearch.localSearchJoin(containers.values()); indexContainer localResults = theSearch.localSearchJoin(containers.values());
int joincount = localResults.size(); int joincount = localResults.size();
prop.put("joincount", Integer.toString(joincount)); prop.put("joincount", Integer.toString(joincount));
plasmaSearchResult acc = theSearch.order(localResults); plasmaSearchResult acc = theSearch.orderFinal(localResults);
// generate compressed index for maxcounthash // generate compressed index for maxcounthash
// this is not needed if the search is restricted to specific urls, because it is a re-search // this is not needed if the search is restricted to specific urls, because it is a re-search

@ -48,8 +48,8 @@ import java.io.FileInputStream;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
@ -85,7 +85,7 @@ public class kelondroMSetTools {
// - join by iterative tests (where we distinguish left-right and right-left tests) // - join by iterative tests (where we distinguish left-right and right-left tests)
public static TreeMap joinConstructive(Collection maps) { public static TreeMap joinConstructive(Collection maps, boolean concatStrings) {
// this joins all TreeMap(s) contained in maps // this joins all TreeMap(s) contained in maps
// first order entities by their size // first order entities by their size
@ -116,7 +116,7 @@ public class kelondroMSetTools {
k = (Long) orderMap.firstKey(); // the next smallest... k = (Long) orderMap.firstKey(); // the next smallest...
mapA = joinResult; mapA = joinResult;
mapB = (TreeMap) orderMap.remove(k); mapB = (TreeMap) orderMap.remove(k);
joinResult = joinConstructiveByTestSetInMap(mapB, mapA.keySet()); joinResult = joinConstructiveByTest(mapA, mapB, concatStrings);
// free resources // free resources
mapA = null; mapA = null;
mapB = null; mapB = null;
@ -127,68 +127,59 @@ public class kelondroMSetTools {
return joinResult; return joinResult;
} }
public static TreeMap joinConstructive(TreeMap map, TreeSet set) { public static TreeMap joinConstructive(TreeMap map1, TreeMap map2, boolean concatStrings) {
// comparators must be equal // comparators must be equal
if ((map == null) || (set == null)) return null; if ((map1 == null) || (map2 == null)) return null;
if (map.comparator() != set.comparator()) return null; if (map1.comparator() != map2.comparator()) return null;
if ((map.size() == 0) || (set.size() == 0)) return new TreeMap(map.comparator()); if ((map1.size() == 0) || (map2.size() == 0)) return new TreeMap(map1.comparator());
// decide which method to use // decide which method to use
int high = ((map.size() > set.size()) ? map.size() : set.size()); int high = ((map1.size() > map2.size()) ? map1.size() : map2.size());
int low = ((map.size() > set.size()) ? set.size() : map.size()); int low = ((map1.size() > map2.size()) ? map2.size() : map1.size());
int stepsEnum = 10 * (high + low - 1); int stepsEnum = 10 * (high + low - 1);
int stepsTest = 12 * log2a(high) * low; int stepsTest = 12 * log2a(high) * low;
// start most efficient method // start most efficient method
if (stepsEnum > stepsTest) { if (stepsEnum > stepsTest) {
if (map.size() > set.size()) return joinConstructiveByTestSetInMap(map, set); if (map1.size() > map2.size()) return joinConstructiveByTest(map2, map1, concatStrings);
return joinConstructiveByTestMapInSet(map, set); return joinConstructiveByTest(map1, map2, concatStrings);
}
return joinConstructiveByEnumeration(map, set);
}
private static TreeMap joinConstructiveByTestSetInMap(TreeMap map, Set set) {
Iterator si = set.iterator();
TreeMap result = new TreeMap(map.comparator());
Object o;
while (si.hasNext()) {
o = si.next();
if (map.containsKey(o)) result.put(o, map.get(o));
} }
return result; return joinConstructiveByEnumeration(map1, map2, concatStrings);
} }
private static TreeMap joinConstructiveByTestMapInSet(Map map, TreeSet set) { private static TreeMap joinConstructiveByTest(TreeMap small, TreeMap large, boolean concatStrings) {
Iterator mi = map.keySet().iterator(); Iterator mi = small.entrySet().iterator();
TreeMap result = new TreeMap(set.comparator()); TreeMap result = new TreeMap(large.comparator());
Object o; Map.Entry mentry1;
Object mobj2;
while (mi.hasNext()) { while (mi.hasNext()) {
o = mi.next(); mentry1 = (Map.Entry) mi.next();
if (set.contains(o)) result.put(o, map.get(o)); mobj2 = large.get(mentry1.getKey());
if (mobj2 != null) result.put(mentry1.getKey(), (concatStrings) ? ((String) mentry1.getValue() + (String) mobj2) : mentry1.getValue());
} }
return result; return result;
} }
private static TreeMap joinConstructiveByEnumeration(TreeMap map, TreeSet set) { private static TreeMap joinConstructiveByEnumeration(TreeMap map1, TreeMap map2, boolean concatStrings) {
// implement pairwise enumeration // implement pairwise enumeration
Comparator comp = map.comparator(); Comparator comp = map1.comparator();
Iterator mi = map.keySet().iterator(); Iterator mi1 = map1.entrySet().iterator();
Iterator si = set.iterator(); Iterator mi2 = map2.entrySet().iterator();
TreeMap result = new TreeMap(map.comparator()); TreeMap result = new TreeMap(map1.comparator());
int c; int c;
if ((mi.hasNext()) && (si.hasNext())) { if ((mi1.hasNext()) && (mi2.hasNext())) {
Object mobj = mi.next(); Map.Entry mentry1 = (Map.Entry) mi1.next();
Object sobj = si.next(); Map.Entry mentry2 = (Map.Entry) mi2.next();
while (true) { while (true) {
c = compare(mobj, sobj, comp); c = compare(mentry1.getKey(), mentry2.getKey(), comp);
if (c < 0) { if (c < 0) {
if (mi.hasNext()) mobj = mi.next(); else break; if (mi1.hasNext()) mentry1 = (Map.Entry) mi1.next(); else break;
} else if (c > 0) { } else if (c > 0) {
if (si.hasNext()) sobj = si.next(); else break; if (mi2.hasNext()) mentry2 = (Map.Entry) mi2.next(); else break;
} else { } else {
result.put(mobj, map.get(mobj)); result.put(mentry1.getKey(), (concatStrings) ? ((String) mentry1.getValue() + (String) mentry2.getValue()) : mentry1.getValue());
if (mi.hasNext()) mobj = mi.next(); else break; if (mi1.hasNext()) mentry1 = (Map.Entry) mi1.next(); else break;
if (si.hasNext()) sobj = si.next(); else break; if (mi2.hasNext()) mentry2 = (Map.Entry) mi2.next(); else break;
} }
} }
} }
@ -268,7 +259,7 @@ public class kelondroMSetTools {
// return excludeConstructiveByEnumeration(map, set); // return excludeConstructiveByEnumeration(map, set);
} }
private static TreeMap excludeConstructiveByTestMapInSet(TreeMap map, TreeSet set) { private static TreeMap excludeConstructiveByTestMapInSet(TreeMap map, Set set) {
Iterator mi = map.keySet().iterator(); Iterator mi = map.keySet().iterator();
TreeMap result = new TreeMap(map.comparator()); TreeMap result = new TreeMap(map.comparator());
Object o; Object o;
@ -279,6 +270,7 @@ public class kelondroMSetTools {
return result; return result;
} }
/*
private static TreeMap excludeConstructiveByEnumeration(TreeMap map, TreeSet set) { private static TreeMap excludeConstructiveByEnumeration(TreeMap map, TreeSet set) {
// returns map without the elements in set // returns map without the elements in set
// enumerates objects // enumerates objects
@ -317,7 +309,7 @@ public class kelondroMSetTools {
} }
return result; return result;
} }
*/
public static void excludeDestructive(TreeMap map, TreeSet set) { public static void excludeDestructive(TreeMap map, TreeSet set) {
// comparators must be equal // comparators must be equal
if (map == null) return; if (map == null) return;
@ -411,7 +403,7 @@ public class kelondroMSetTools {
public static void main(String[] args) { public static void main(String[] args) {
TreeMap m = new TreeMap(); TreeMap m = new TreeMap();
TreeSet s = new TreeSet(); TreeMap s = new TreeMap();
m.put("a", "a"); m.put("a", "a");
m.put("x", "x"); m.put("x", "x");
m.put("f", "f"); m.put("f", "f");
@ -422,26 +414,26 @@ public class kelondroMSetTools {
m.put("k", "k"); m.put("k", "k");
m.put("y", "y"); m.put("y", "y");
m.put("z", "z"); m.put("z", "z");
s.add("a"); s.put("a", "a");
s.add("b"); s.put("b", "b");
s.add("c"); s.put("c", "c");
s.add("k"); s.put("k", "k");
s.add("l"); s.put("l", "l");
s.add("m"); s.put("m", "m");
s.add("n"); s.put("n", "n");
s.add("o"); s.put("o", "o");
s.add("p"); s.put("p", "p");
s.add("q"); s.put("q", "q");
s.add("r"); s.put("r", "r");
s.add("s"); s.put("s", "s");
s.add("t"); s.put("t", "t");
s.add("x"); s.put("x", "x");
System.out.println("Compare " + m.toString() + " with " + s.toString()); System.out.println("Compare " + m.toString() + " with " + s.toString());
System.out.println("Join=" + joinConstructiveByEnumeration(m, s)); System.out.println("Join=" + joinConstructiveByEnumeration(m, s, true));
System.out.println("Join=" + joinConstructiveByTestMapInSet(m, s)); System.out.println("Join=" + joinConstructiveByTest(m, s, true));
System.out.println("Join=" + joinConstructiveByTestSetInMap(m, s)); System.out.println("Join=" + joinConstructiveByTest(m, s, true));
System.out.println("Join=" + joinConstructive(m, s)); System.out.println("Join=" + joinConstructive(m, s, true));
System.out.println("Exclude=" + excludeConstructiveByEnumeration(m, s)); System.out.println("Exclude=" + excludeConstructiveByTestMapInSet(m, s.keySet()));
/* /*
for (int low = 0; low < 10; low++) for (int low = 0; low < 10; low++)

@ -67,8 +67,9 @@ public class plasmaGrafics {
public static ymagePainter getSearchEventPicture() { public static ymagePainter getSearchEventPicture() {
if (plasmaSearchEvent.lastEvent == null) return null; if (plasmaSearchEvent.lastEvent == null) return null;
yacySearch[] searches = plasmaSearchEvent.lastEvent.getSearchThreads(); yacySearch[] primarySearches = plasmaSearchEvent.lastEvent.getPrimarySearchThreads();
if (searches == null) return null; // this was a local search and there are no threads yacySearch[] secondarySearches = plasmaSearchEvent.lastEvent.getSecondarySearchThreads();
if (primarySearches == null) return null; // this was a local search and there are no threads
// get a copy of a recent network picture // get a copy of a recent network picture
ymagePainter eventPicture = getNetworkPicture(120000); ymagePainter eventPicture = getNetworkPicture(120000);
@ -82,14 +83,25 @@ public class plasmaGrafics {
String hash; String hash;
int angle; int angle;
// draw in the search peers // draw in the primary search peers
for (int j = 0; j < searches.length; j++) { for (int j = 0; j < primarySearches.length; j++) {
eventPicture.setColor((searches[j].isAlive()) ? ymageMatrix.ADDITIVE_RED : ymageMatrix.ADDITIVE_GREEN); eventPicture.setColor((primarySearches[j].isAlive()) ? ymageMatrix.ADDITIVE_RED : ymageMatrix.ADDITIVE_GREEN);
hash = searches[j].target().hash; hash = primarySearches[j].target().hash;
angle = (int) ((long) 360 * (yacySeed.dhtPosition(hash) / (yacySeed.maxDHTDistance / (long) 10000)) / (long) 10000); angle = (int) ((long) 360 * (yacySeed.dhtPosition(hash) / (yacySeed.maxDHTDistance / (long) 10000)) / (long) 10000);
eventPicture.arcLine(cx, cy, cr - 20, cr, angle); eventPicture.arcLine(cx, cy, cr - 20, cr, angle);
} }
// draw in the secondary search peers
if (secondarySearches != null) {
for (int j = 0; j < secondarySearches.length; j++) {
eventPicture.setColor((secondarySearches[j].isAlive()) ? ymageMatrix.ADDITIVE_RED : ymageMatrix.ADDITIVE_GREEN);
hash = secondarySearches[j].target().hash;
angle = (int) ((long) 360 * (yacySeed.dhtPosition(hash) / (yacySeed.maxDHTDistance / (long) 10000)) / (long) 10000);
eventPicture.arcLine(cx, cy, cr - 10, cr, angle - 1);
eventPicture.arcLine(cx, cy, cr - 10, cr, angle + 1);
}
}
// draw in the search target // draw in the search target
plasmaSearchQuery query = plasmaSearchEvent.lastEvent.getQuery(); plasmaSearchQuery query = plasmaSearchEvent.lastEvent.getQuery();
Iterator i = query.queryHashes.iterator(); Iterator i = query.queryHashes.iterator();

@ -73,7 +73,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
private plasmaSearchTimingProfile profileLocal, profileGlobal; private plasmaSearchTimingProfile profileLocal, profileGlobal;
private boolean postsort; private boolean postsort;
private yacySearch[] searchThreads; private yacySearch[] primarySearchThreads, secondarySearchThreads;
public plasmaSearchEvent(plasmaSearchQuery query, public plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking, plasmaSearchRankingProfile ranking,
@ -96,7 +96,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.profileLocal = localTiming; this.profileLocal = localTiming;
this.profileGlobal = remoteTiming; this.profileGlobal = remoteTiming;
this.postsort = postsort; this.postsort = postsort;
this.searchThreads = null; this.primarySearchThreads = null;
this.secondarySearchThreads = null;
} }
public plasmaSearchQuery getQuery() { public plasmaSearchQuery getQuery() {
@ -107,8 +108,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
return profileLocal; return profileLocal;
} }
public yacySearch[] getSearchThreads() { public yacySearch[] getPrimarySearchThreads() {
return searchThreads; return primarySearchThreads;
}
public yacySearch[] getSecondarySearchThreads() {
return secondarySearchThreads;
} }
public plasmaSearchResult search() { public plasmaSearchResult search() {
@ -134,7 +138,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
long secondaryTimeout = System.currentTimeMillis() + profileGlobal.duetime() / 2; long secondaryTimeout = System.currentTimeMillis() + profileGlobal.duetime() / 2;
long primaryTimeout = System.currentTimeMillis() + profileGlobal.duetime(); long primaryTimeout = System.currentTimeMillis() + profileGlobal.duetime();
searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcContainers, rcAbstracts, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); primarySearchThreads = yacySearch.primaryRemoteSearches(plasmaSearchQuery.hashSet2hashString(query.queryHashes), "",
query.prefer, query.urlMask, query.maxDistance, urlStore, rcContainers, rcAbstracts,
fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);
// meanwhile do a local search // meanwhile do a local search
Map searchContainerMap = localSearchContainers(null); Map searchContainerMap = localSearchContainers(null);
@ -144,35 +150,16 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// evaluate index abstracts and start a secondary search // evaluate index abstracts and start a secondary search
// this is temporary debugging code to learn that the index abstracts are fetched correctly // this is temporary debugging code to learn that the index abstracts are fetched correctly
while (System.currentTimeMillis() < secondaryTimeout + 10000) { while (System.currentTimeMillis() < secondaryTimeout + 10000) {
if (yacySearch.remainingWaiting(searchThreads) == 0) break; // all threads have finished if (yacySearch.remainingWaiting(primarySearchThreads) == 0) break; // all threads have finished
try {Thread.sleep(100);} catch (InterruptedException e) {} try {Thread.sleep(100);} catch (InterruptedException e) {}
} }
System.out.println("DEBUG-INDEXABSTRACT: " + rcAbstracts.size() + " word references catched, " + query.size() + " needed"); prepareSecondarySearch();
/*
Iterator i = rcAbstracts.entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
System.out.println("DEBUG-INDEXABSTRACT: hash " + (String) entry.getKey() + ": " + ((query.queryHashes.contains((String) entry.getKey())) ? "NEEDED" : "NOT NEEDED") + "; " + ((TreeMap) entry.getValue()).size() + " entries");
}
*/
TreeMap abstractJoin = (rcAbstracts.size() == query.size()) ? kelondroMSetTools.joinConstructive(rcAbstracts.values()) : new TreeMap();
if (abstractJoin.size() == 0) {
System.out.println("DEBUG-INDEXABSTRACT: no success using index abstracts from remote peers");
} else {
System.out.println("DEBUG-INDEXABSTRACT: index abstracts delivered " + abstractJoin.size() + " additional results for secondary search");
Iterator i = abstractJoin.entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
System.out.println("DEBUG-INDEXABSTRACT: url " + (String) entry.getKey() + ": from peers " + (String) entry.getValue());
}
}
// catch up global results: // catch up global results:
// wait until primary timeout passed // wait until primary timeout passed
while (System.currentTimeMillis() < primaryTimeout) { while (System.currentTimeMillis() < primaryTimeout) {
if (yacySearch.remainingWaiting(searchThreads) == 0) break; // all threads have finished if ((yacySearch.remainingWaiting(primarySearchThreads) == 0) &&
((secondarySearchThreads == null) || (yacySearch.remainingWaiting(secondarySearchThreads) == 0))) break; // all threads have finished
try {Thread.sleep(100);} catch (InterruptedException e) {} try {Thread.sleep(100);} catch (InterruptedException e) {}
} }
int globalContributions = rcContainers.size(); int globalContributions = rcContainers.size();
@ -181,7 +168,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
// combine the result and order // combine the result and order
plasmaSearchResult result = ((globalContributions == 0) && (localResult.sizeOrdered() != 0)) ? localResult : order(rcLocal); plasmaSearchResult result = ((globalContributions == 0) && (localResult.sizeOrdered() != 0)) ? localResult : orderFinal(rcLocal);
result.globalContributions = globalContributions; result.globalContributions = globalContributions;
result.localContributions = rcLocal.size(); result.localContributions = rcLocal.size();
@ -195,7 +182,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
} else { } else {
Map searchContainerMap = localSearchContainers(null); Map searchContainerMap = localSearchContainers(null);
indexContainer rcLocal = localSearchJoin((searchContainerMap == null) ? null : searchContainerMap.values()); indexContainer rcLocal = localSearchJoin((searchContainerMap == null) ? null : searchContainerMap.values());
plasmaSearchResult result = order(rcLocal); plasmaSearchResult result = orderFinal(rcLocal);
result.localContributions = rcLocal.size(); result.localContributions = rcLocal.size();
// return search result // return search result
@ -206,6 +193,91 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
} }
} }
// Evaluate the index abstracts delivered by the primary remote searches and,
// when every query word hash produced an abstract, start secondary searches
// that ask specific peers directly for the urls the abstracts say they hold.
// Side effect: fills this.secondarySearchThreads; no-op when abstracts are incomplete.
private void prepareSecondarySearch() {
    // catch up index abstracts and join them; then call peers again to submit their urls
    System.out.println("DEBUG-INDEXABSTRACT: " + rcAbstracts.size() + " word references catched, " + query.size() + " needed");
    // a secondary search is only possible with one abstract per query word
    if (rcAbstracts.size() != query.size()) return; // secondary search not possible
    // debug output: show which word hashes delivered an abstract
    Iterator i = rcAbstracts.entrySet().iterator();
    Map.Entry entry;
    while (i.hasNext()) {
        entry = (Map.Entry) i.next();
        System.out.println("DEBUG-INDEXABSTRACT: hash " + (String) entry.getKey() + ": " + ((query.queryHashes.contains((String) entry.getKey())) ? "NEEDED" : "NOT NEEDED") + "; " + ((TreeMap) entry.getValue()).size() + " entries");
    }
    // join the abstracts: result maps urlhash -> concatenated peer hash string.
    // NOTE: the guard above already guarantees rcAbstracts.size() == query.size(),
    // so the former conditional around this call was dead code and was removed.
    TreeMap abstractJoin = kelondroMSetTools.joinConstructive(rcAbstracts.values(), true);
    if (abstractJoin.size() == 0) {
        System.out.println("DEBUG-INDEXABSTRACT: no success using index abstracts from remote peers");
    } else {
        System.out.println("DEBUG-INDEXABSTRACT: index abstracts delivered " + abstractJoin.size() + " additional results for secondary search");
        // generate query for secondary search:
        // invert the url->peers mapping into peer -> concatenated url hashes
        TreeMap secondarySearchURLs = new TreeMap(); // a (peerhash:urlhash-liststring) mapping
        Iterator i1 = abstractJoin.entrySet().iterator();
        Map.Entry entry1;
        String url, urls, peer, peers;
        while (i1.hasNext()) {
            entry1 = (Map.Entry) i1.next();
            url = (String) entry1.getKey();
            peers = (String) entry1.getValue();
            System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peers);
            // peer hashes are fixed-width (12 characters) segments of the peer list string
            for (int j = 0; j < peers.length(); j = j + 12) {
                peer = peers.substring(j, j + 12);
                if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin
                urls = (String) secondarySearchURLs.get(peer);
                urls = (urls == null) ? url : urls + url;
                secondarySearchURLs.put(peer, urls);
            }
        }
        // compute words for secondary search and start the secondary searches
        i1 = secondarySearchURLs.entrySet().iterator();
        String words;
        secondarySearchThreads = new yacySearch[secondarySearchURLs.size()];
        int c = 0;
        while (i1.hasNext()) {
            entry1 = (Map.Entry) i1.next();
            peer = (String) entry1.getKey();
            urls = (String) entry1.getValue();
            // only the words that this peer is known to have all urls for
            words = wordsFromPeer(peer, urls);
            System.out.println("DEBUG-INDEXABSTRACT: peer " + peer + " has urls: " + urls);
            System.out.println("DEBUG-INDEXABSTRACT: peer " + peer + " from words: " + words);
            secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
                    words, urls, urlStore, rcContainers, peer, plasmaSwitchboard.urlBlacklist, snippetCache,
                    profileGlobal, ranking);
        }
    }
}
// Collect all query words for which the index abstracts confirm that the
// given peer holds every url in 'urls' (a concatenation of 12-character
// url hashes). Returns the matching word hashes concatenated into one string.
private String wordsFromPeer(String peerhash, String urls) {
    StringBuffer wordlist = new StringBuffer();
    synchronized (rcAbstracts) {
        Iterator wi = rcAbstracts.entrySet().iterator();
        while (wi.hasNext()) {
            Map.Entry wordEntry = (Map.Entry) wi.next();
            String word = (String) wordEntry.getKey();
            TreeMap urlPeerlist = (TreeMap) wordEntry.getValue();
            boolean peerHasAllURLs = true;
            int j = 0;
            while (j < urls.length()) {
                String url = urls.substring(j, j + 12);
                String peerlist = (String) urlPeerlist.get(url);
                // the peer hash must occur at a 12-character-aligned position
                int p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash);
                if ((p < 0) || (p % 12 != 0)) {
                    peerHasAllURLs = false;
                    break;
                }
                j += 12;
            }
            if (peerHasAllURLs) wordlist.append(word);
        }
    }
    return wordlist.toString();
}
public Map localSearchContainers(Set urlselection) { public Map localSearchContainers(Set urlselection) {
// search for the set of hashes and return a map of wordhash:indexContainer containing the search result // search for the set of hashes and return a map of wordhash:indexContainer containing the search result
@ -243,7 +315,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
return rcLocal; return rcLocal;
} }
public plasmaSearchResult order(indexContainer rcLocal) { public plasmaSearchResult orderFinal(indexContainer rcLocal) {
// we collect the urlhashes and construct a list with urlEntry objects // we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime // attention: if minEntries is too high, this method will not terminate within the maxTime
@ -263,6 +335,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// start url-fetch // start url-fetch
long postorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_POSTSORT); long postorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_POSTSORT);
System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime);
long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime); long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime);
profileLocal.startTimer(); profileLocal.startTimer();
plasmaSearchResult acc = new plasmaSearchResult(query, ranking); plasmaSearchResult acc = new plasmaSearchResult(query, ranking);
@ -307,20 +380,17 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
return acc; return acc;
} }
private plasmaSearchResult orderLocal(indexContainer rcLocal, long maxtime) { private plasmaSearchResult orderLocal(indexContainer rcLocal, long timeout) {
// we collect the urlhashes and construct a list with urlEntry objects // we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime // attention: if minEntries is too high, this method will not terminate within the maxTime
profileLocal.startTimer(); profileLocal.startTimer();
if (maxtime < 0) maxtime = 200; plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis());
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, maxtime);
preorder.remove(true, true); preorder.remove(true, true);
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size()); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size());
// start url-fetch // start url-fetch
maxtime = Math.max(200, maxtime - profileLocal.getYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT));
long postorderLimitTime = System.currentTimeMillis() + maxtime;
profileLocal.startTimer(); profileLocal.startTimer();
plasmaSearchResult acc = new plasmaSearchResult(query, ranking); plasmaSearchResult acc = new plasmaSearchResult(query, ranking);
@ -330,7 +400,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
Object[] preorderEntry; Object[] preorderEntry;
try { try {
while (preorder.hasNext()) { while (preorder.hasNext()) {
if (System.currentTimeMillis() >= postorderLimitTime) break; if (System.currentTimeMillis() >= timeout) break;
preorderEntry = preorder.next(); preorderEntry = preorder.next();
entry = (indexEntry) preorderEntry[0]; entry = (indexEntry) preorderEntry[0];
preranking = (Long) preorderEntry[1]; preranking = (Long) preorderEntry[1];
@ -368,15 +438,21 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// it is wise to call this within a separate thread because // it is wise to call this within a separate thread because
// this method waits until all threads are finished // this method waits until all threads are finished
int remaining; int remaining = 0;
if (primarySearchThreads == null) return;
long starttime = System.currentTimeMillis(); long starttime = System.currentTimeMillis();
while ((searchThreads != null) && ((remaining = yacySearch.remainingWaiting(searchThreads)) > 0)) { while (true) {
remaining = yacySearch.remainingWaiting(primarySearchThreads);
if (secondarySearchThreads != null) remaining += yacySearch.remainingWaiting(secondarySearchThreads);
if (remaining == 0) break;
flushGlobalResults(); flushGlobalResults();
// wait a little bit before trying again // wait a little bit before trying again
try {Thread.sleep(3000);} catch (InterruptedException e) {} try {Thread.sleep(1000);} catch (InterruptedException e) {}
if (System.currentTimeMillis() - starttime > 90000) { if (System.currentTimeMillis() - starttime > 90000) {
yacySearch.interruptAlive(searchThreads); yacySearch.interruptAlive(primarySearchThreads);
if (secondarySearchThreads != null) yacySearch.interruptAlive(secondarySearchThreads);
log.logFine("SEARCH FLUSH: " + remaining + " PEERS STILL BUSY; ABANDONED; SEARCH WAS " + query.queryWords); log.logFine("SEARCH FLUSH: " + remaining + " PEERS STILL BUSY; ABANDONED; SEARCH WAS " + query.queryWords);
break; break;
} }

@ -42,6 +42,7 @@
package de.anomic.plasma; package de.anomic.plasma;
import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.Iterator; import java.util.Iterator;
@ -59,8 +60,7 @@ public final class plasmaSearchQuery {
public static final int SEARCHDOM_GLOBALDHT = 3; public static final int SEARCHDOM_GLOBALDHT = 3;
public static final int SEARCHDOM_GLOBALALL = 4; public static final int SEARCHDOM_GLOBALALL = 4;
public Set queryWords; public Set queryWords, queryHashes;
public Set queryHashes;
public int wantedResults; public int wantedResults;
public String prefer; public String prefer;
public long maximumTime; public long maximumTime;
@ -99,12 +99,18 @@ public final class plasmaSearchQuery {
this.domMaxTargets = -1; this.domMaxTargets = -1;
} }
public static Set words2hashes(String[] words) { public static Set words2hashSet(String[] words) {
TreeSet hashes = new TreeSet(); TreeSet hashes = new TreeSet();
for (int i = 0; i < words.length; i++) hashes.add(indexEntryAttribute.word2hash(words[i])); for (int i = 0; i < words.length; i++) hashes.add(indexEntryAttribute.word2hash(words[i]));
return hashes; return hashes;
} }
public static String words2hashString(String[] words) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < words.length; i++) sb.append(indexEntryAttribute.word2hash(words[i]));
return new String(sb);
}
public static Set words2hashes(Set words) { public static Set words2hashes(Set words) {
Iterator i = words.iterator(); Iterator i = words.iterator();
TreeSet hashes = new TreeSet(); TreeSet hashes = new TreeSet();
@ -112,6 +118,22 @@ public final class plasmaSearchQuery {
return hashes; return hashes;
} }
public static Set hashes2Set(String query) {
if (query == null) return new HashSet();
final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength);
for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) {
keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength));
}
return keyhashes;
}
public static String hashSet2hashString(Set words) {
Iterator i = words.iterator();
StringBuffer sb = new StringBuffer(words.size() * indexEntryAttribute.wordHashLength);
while (i.hasNext()) sb.append((String) i.next());
return new String(sb);
}
public static TreeSet cleanQuery(String words) { public static TreeSet cleanQuery(String words) {
// convert Umlaute // convert Umlaute
words = htmlFilterAbstractScraper.convertUmlaute(new serverByteBuffer(words.getBytes())).toString(); words = htmlFilterAbstractScraper.convertUmlaute(new serverByteBuffer(words.getBytes())).toString();
@ -148,6 +170,7 @@ public final class plasmaSearchQuery {
return result.toString(); return result.toString();
} }
/*
public String hashes(String separator) { public String hashes(String separator) {
StringBuffer result = new StringBuffer(8 * queryHashes.size()); StringBuffer result = new StringBuffer(8 * queryHashes.size());
Iterator i = queryHashes.iterator(); Iterator i = queryHashes.iterator();
@ -158,6 +181,7 @@ public final class plasmaSearchQuery {
} }
return result.toString(); return result.toString();
} }
*/
public void filterOut(Set blueList) { public void filterOut(Set blueList) {
// filter out words that appear in this set // filter out words that appear in this set

@ -209,8 +209,8 @@ public class plasmaSearchRankingProfile {
} }
// apply query-in-result matching // apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps); Set urlcomph = plasmaSearchQuery.words2hashSet(urlcomps);
Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps); Set descrcomph = plasmaSearchQuery.words2hashSet(descrcomps);
Iterator shi = query.queryHashes.iterator(); Iterator shi = query.queryHashes.iterator();
String queryhash; String queryhash;
while (shi.hasNext()) { while (shi.hasNext()) {

@ -366,6 +366,7 @@ public final class yacyClient {
public static int search( public static int search(
String wordhashes, String wordhashes,
String urlhashes,
String prefer, String prefer,
String filter, String filter,
int maxDistance, int maxDistance,
@ -422,6 +423,7 @@ public final class yacyClient {
obj.put("count", timingProfile.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT)); obj.put("count", timingProfile.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT));
obj.put("resource", ((global) ? "global" : "local")); obj.put("resource", ((global) ? "global" : "local"));
obj.put("query", wordhashes); obj.put("query", wordhashes);
obj.put("urls", urlhashes);
obj.put("prefer", prefer); obj.put("prefer", prefer);
obj.put("filter", filter); obj.put("filter", filter);
obj.put("ttl", "0"); obj.put("ttl", "0");
@ -448,6 +450,11 @@ public final class yacyClient {
) )
); );
if (result.size() == 0) {
yacyCore.log.logFine("SEARCH failed FROM " + targetPeer.hash + ":" + targetPeer.getName() + ", score=" + targetPeer.selectscore + ", DHTdist=" + yacyDHTAction.dhtDistance(targetPeer.hash, wordhashes));
return 0;
}
// compute all computation times // compute all computation times
final long totalrequesttime = System.currentTimeMillis() - timestamp; final long totalrequesttime = System.currentTimeMillis() - timestamp;
String returnProfile = (String) result.get("profile"); String returnProfile = (String) result.get("profile");
@ -470,7 +477,7 @@ public final class yacyClient {
// references : references (search hints) that was calculated during search // references : references (search hints) that was calculated during search
// now create a plasmaIndex out of this result // now create a plasmaIndex out of this result
//System.out.println("yacyClient: search result = " + result.toString()); // debug System.out.println("yacyClient: " + ((urlhashes.length() == 0) ? "primary" : "secondary")+ " search result = " + result.toString()); // debug
final int results = Integer.parseInt((String) result.get("count")); final int results = Integer.parseInt((String) result.get("count"));
//System.out.println("***result count " + results); //System.out.println("***result count " + results);

@ -48,10 +48,12 @@ import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.TreeMap;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSnippetCache;
@ -60,7 +62,7 @@ import de.anomic.server.logging.serverLog;
public class yacySearch extends Thread { public class yacySearch extends Thread {
final private Set wordhashes; final private String wordhashes, urlhashes;
final private boolean global; final private boolean global;
final private plasmaCrawlLURL urlManager; final private plasmaCrawlLURL urlManager;
final private indexContainer containerCache; final private indexContainer containerCache;
@ -74,13 +76,14 @@ public class yacySearch extends Thread {
final private plasmaSearchRankingProfile rankingProfile; final private plasmaSearchRankingProfile rankingProfile;
final private String prefer, filter; final private String prefer, filter;
public yacySearch(Set wordhashes, String prefer, String filter, int maxDistance, public yacySearch(String wordhashes, String urlhashes, String prefer, String filter, int maxDistance,
boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager,
indexContainer containerCache, Map abstractCache, indexContainer containerCache, Map abstractCache,
plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
super("yacySearch_" + targetPeer.getName()); super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes; this.wordhashes = wordhashes;
this.urlhashes = urlhashes;
this.prefer = prefer; this.prefer = prefer;
this.filter = filter; this.filter = filter;
this.global = global; this.global = global;
@ -97,7 +100,7 @@ public class yacySearch extends Thread {
} }
public void run() { public void run() {
this.links = yacyClient.search(set2string(wordhashes), prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); this.links = yacyClient.search(wordhashes, urlhashes, prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile);
if (links != 0) { if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes); //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links); yacyCore.seedDB.mySeed.incRI(links);
@ -186,7 +189,7 @@ public class yacySearch extends Thread {
return result; return result;
} }
public static yacySearch[] searchHashes(Set wordhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, public static yacySearch[] primaryRemoteSearches(String wordhashes, String urlhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager,
indexContainer containerCache, Map abstractCache, indexContainer containerCache, Map abstractCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
@ -195,13 +198,13 @@ public class yacySearch extends Thread {
// prepare seed targets and threads // prepare seed targets and threads
//Set wordhashes = plasmaSearch.words2hashes(querywords); //Set wordhashes = plasmaSearch.words2hashes(querywords);
final yacySeed[] targetPeers = selectPeers(wordhashes, targets); final yacySeed[] targetPeers = selectPeers(plasmaSearchQuery.hashes2Set(wordhashes), targets);
if (targetPeers == null) return null; if (targetPeers == null) return null;
targets = targetPeers.length; targets = targetPeers.length;
if (targets == 0) return null; if (targets == 0) return null;
yacySearch[] searchThreads = new yacySearch[targets]; yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) { for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, prefer, filter, maxDist, true, targetPeers[i], searchThreads[i]= new yacySearch(wordhashes, urlhashes, prefer, filter, maxDist, true, targetPeers[i],
urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile);
searchThreads[i].start(); searchThreads[i].start();
//try {Thread.sleep(20);} catch (InterruptedException e) {} //try {Thread.sleep(20);} catch (InterruptedException e) {}
@ -209,6 +212,22 @@ public class yacySearch extends Thread {
return searchThreads; return searchThreads;
} }
public static yacySearch secondaryRemoteSearch(String wordhashes, String urlhashes, plasmaCrawlLURL urlManager, indexContainer containerCache,
String targethash, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
// check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
// prepare seed targets and threads
//Set wordhashes = plasmaSearch.words2hashes(querywords);
final yacySeed targetPeer = yacyCore.seedDB.getConnected(targethash);
if (targetPeer == null) return null;
yacySearch searchThread = new yacySearch(wordhashes, urlhashes, "", "", 9999, true, targetPeer,
urlManager, containerCache, new TreeMap(), blacklist, snippetCache, timingProfile, rankingProfile);
searchThread.start();
return searchThread;
}
public static int remainingWaiting(yacySearch[] searchThreads) { public static int remainingWaiting(yacySearch[] searchThreads) {
if (searchThreads == null) return 0; if (searchThreads == null) return 0;
int alive = 0; int alive = 0;

Loading…
Cancel
Save