diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java
index c59f1449e..9b1a9c5ff 100644
--- a/source/de/anomic/search/SearchEvent.java
+++ b/source/de/anomic/search/SearchEvent.java
@@ -70,8 +70,7 @@ public final class SearchEvent {
     private RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
     private ResultFetcher results;
     
-    // class variables for search abstracts
-    private final IndexAbstracts rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
+    private final SecondarySearchSuperviser secondarySearchSuperviser;
     
     // class variables for remote searches
     private yacySearch[] primarySearchThreads, secondarySearchThreads;
@@ -93,7 +92,8 @@ public final class SearchEvent {
         this.peers = peers;
         this.crawlResults = crawlResults;
         this.query = query;
-        this.rcAbstracts = (query.queryHashes.size() > 1) ? new IndexAbstracts() : null; // generate abstracts only for combined searches
+        this.secondarySearchSuperviser = (query.queryHashes.size() > 1) ? new SecondarySearchSuperviser() : null; // generate abstracts only for combined searches
+        //if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
         this.primarySearchThreads = null;
         this.secondarySearchThreads = null;
         this.preselectedPeerHashes = preselectedPeerHashes;
@@ -134,7 +134,7 @@ public final class SearchEvent {
                         peers,
                         crawlResults,
                         rankedCache,
-                        rcAbstracts,
+                        secondarySearchSuperviser,
                         fetchpeers,
                         Switchboard.urlBlacklist,
                         query.ranking,
@@ -331,25 +331,91 @@ public final class SearchEvent {
 
     boolean secondarySearchStartet = false;
     
-    void prepareSecondarySearch() {
-        if (secondarySearchStartet) return; // don't do this twice
+    public class SecondarySearchSuperviser extends Thread {
         
-        if ((rcAbstracts == null) || (rcAbstracts.size() != query.queryHashes.size())) return; // secondary search not possible (yet)
-        this.secondarySearchStartet = true;
+        // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
+        // this relation contains the information where specific urls can be found in specific peers
+        TreeMap<String, TreeMap<String, String>> abstractsCache;
         
-        /*
-        // catch up index abstracts and join them; then call peers again to submit their urls
-        System.out.println("DEBUG-INDEXABSTRACT: " + rcAbstracts.size() + " word references caught, " + query.queryHashes.size() + " needed");
-        
-        Iterator i = rcAbstracts.entrySet().iterator();
-        Map.Entry entry;
-        while (i.hasNext()) {
-            entry = (Map.Entry) i.next();
-            System.out.println("DEBUG-INDEXABSTRACT: hash " + (String) entry.getKey() + ": " + ((query.queryHashes.contains((String) entry.getKey())) ? "NEEDED" : "NOT NEEDED") + "; " + ((TreeMap) entry.getValue()).size() + " entries");
+        public SecondarySearchSuperviser() {
+            this.abstractsCache = new TreeMap<String, TreeMap<String, String>>();
         }
+        
+        /**
+         * add a single abstract to the existing set of abstracts
+         * @param wordhash
+         * @param singleAbstract // a mapping from url-hashes to a string of peer-hashes
         */
-        final TreeMap<String, String> abstractJoin = (rcAbstracts.size() == query.queryHashes.size()) ? SetTools.joinConstructive(rcAbstracts.values(), true) : new TreeMap<String, String>();
-        if (!abstractJoin.isEmpty()) {
+        public void addAbstract(String wordhash, TreeMap<String, String> singleAbstract) {
+            synchronized (abstractsCache) {
+                TreeMap<String, String> oldAbstract = abstractsCache.get(wordhash);
+                if (oldAbstract == null) {
+                    // new abstracts in the cache
+                    abstractsCache.put(wordhash, singleAbstract);
+                } else {
+                    // extend the abstracts in the cache: join the single abstracts
+                    for (Map.Entry<String, String> oneref: singleAbstract.entrySet()) {
+                        String urlhash = oneref.getKey();
+                        String peerlistNew = oneref.getValue();
+                        String peerlistOld = oldAbstract.get(urlhash);
+                        if (peerlistOld == null) {
+                            oldAbstract.put(urlhash, peerlistNew);
+                        } else {
+                            oldAbstract.put(urlhash, peerlistOld + peerlistNew);
+                        }
+                    }
+                    // abstractsCache.put(wordhash, oldAbstract);
+                }
+            }
+        }
+        
+        private String wordsFromPeer(final String peerhash, final String urls) {
+            Map.Entry<String, TreeMap<String, String>> entry;
+            String word, peerlist, url, wordlist = "";
+            TreeMap<String, String> urlPeerlist;
+            int p;
+            boolean hasURL;
+            synchronized (this) {
+                final Iterator<Map.Entry<String, TreeMap<String, String>>> i = this.abstractsCache.entrySet().iterator();
+                while (i.hasNext()) {
+                    entry = i.next();
+                    word = entry.getKey();
+                    urlPeerlist = entry.getValue();
+                    hasURL = true;
+                    for (int j = 0; j < urls.length(); j = j + 12) {
+                        url = urls.substring(j, j + 12);
+                        peerlist = urlPeerlist.get(url);
+                        p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash);
+                        if ((p < 0) || (p % 12 != 0)) {
+                            hasURL = false;
+                            break;
+                        }
+                    }
+                    if (hasURL) wordlist += word;
+                }
+            }
+            return wordlist;
+        }
+        
+        public void run() {
+            try {Thread.sleep(5000);} catch (InterruptedException e) {}
+            prepareSecondarySearch();
+        }
+        
+        private void prepareSecondarySearch() {
+            
+            if (abstractsCache == null || abstractsCache.size() != query.queryHashes.size()) return; // secondary search not possible (yet)
+            
+            // catch up index abstracts and join them; then call peers again to submit their urls
+            System.out.println("DEBUG-INDEXABSTRACT: " + abstractsCache.size() + " word references caught, " + query.queryHashes.size() + " needed");
+            for (Map.Entry<String, TreeMap<String, String>> entry: abstractsCache.entrySet()) {
+                System.out.println("DEBUG-INDEXABSTRACT: hash " + entry.getKey() + ": " + ((query.queryHashes.has(entry.getKey().getBytes())) ? "NEEDED" : "NOT NEEDED") + "; " + entry.getValue().size() + " entries");
+            }
+            
+            final TreeMap<String, String> abstractJoin = (abstractsCache.size() == query.queryHashes.size()) ? SetTools.joinConstructive(abstractsCache.values(), true) : new TreeMap<String, String>();
+            if (abstractJoin.isEmpty()) return;
+            
             //System.out.println("DEBUG-INDEXABSTRACT: index abstracts delivered " + abstractJoin.size() + " additional results for secondary search");
             // generate query for secondary search
             final TreeMap<String, String> secondarySearchURLs = new TreeMap<String, String>(); // a (peerhash:urlhash-liststring) mapping
@@ -363,7 +429,7 @@ public final class SearchEvent {
             entry1 = i1.next();
             url = entry1.getKey();
             ps = entry1.getValue();
-            //System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peers);
+            System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peers);
             mypeercount = 0;
             for (int j = 0; j < ps.length(); j = j + 12) {
                 peer = ps.substring(j, j + 12);
@@ -386,20 +452,20 @@ public final class SearchEvent {
             peer = entry1.getKey();
             if (peer.equals(mypeerhash)) continue; // we dont need to ask ourself
             urls = entry1.getValue();
-            words = rcAbstracts.wordsFromPeer(peer, urls);
+            words = wordsFromPeer(peer, urls);
             assert words.length() >= 12 : "words = " + words;
-            //System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls);
-            //System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
+            System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls);
+            System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
             secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
-                    words, "", urls, this.query.getSegment(), peers, crawlResults, this.rankedCache, peer, Switchboard.urlBlacklist,
+                    words, "", urls, query.getSegment(), peers, crawlResults, rankedCache, peer, Switchboard.urlBlacklist,
                     query.ranking, query.constraint, preselectedPeerHashes);
-        }
-        //} else {
-        //System.out.println("DEBUG-INDEXABSTRACT: no success using index abstracts from remote peers");
+            }
+        }
     }
     
     public void remove(final WordReferenceVars reference) {
         this.rankedCache.remove(reference);
     }
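Note on the SearchEvent hunks above (reviewer commentary, not part of the patch): the superviser's cache maps a word hash to a (url hash -> concatenated peer hashes) table, and wordsFromPeer() relies on every hash being a fixed-width 12-character string, which is why it scans url and peer lists in steps of 12 and accepts a peer only at an offset that is a multiple of 12. A minimal standalone sketch of that invariant; all hash values below are hypothetical placeholders:

    import java.util.TreeMap;

    public class AbstractsCacheSketch {
        public static void main(String[] args) {
            // word hash -> (url hash -> concatenated peer hashes), as in abstractsCache
            TreeMap<String, TreeMap<String, String>> abstractsCache = new TreeMap<String, TreeMap<String, String>>();

            TreeMap<String, String> urlToPeers = new TreeMap<String, String>();
            urlToPeers.put("url0hash####", "peerAhash###" + "peerBhash###"); // two 12-char peer hashes
            abstractsCache.put("word0hash###", urlToPeers);

            // the alignment test used by wordsFromPeer(): a peer counts as a source
            // for a url only if its hash starts at a multiple of 12 in the peer list
            String peerlist = abstractsCache.get("word0hash###").get("url0hash####");
            int p = peerlist.indexOf("peerBhash###");
            System.out.println((p >= 0) && (p % 12 == 0)); // true
        }
    }

The modulo test keeps a hash that merely appears as a substring straddling two adjacent peer hashes from counting as a match.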
diff --git a/source/de/anomic/server/serverCore.java b/source/de/anomic/server/serverCore.java
index 4e00b3ebf..c3fbe608c 100644
--- a/source/de/anomic/server/serverCore.java
+++ b/source/de/anomic/server/serverCore.java
@@ -701,7 +701,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
                     else reqProtocol = null;
                     
                     if (this.request == null) break;
-                    if (reqProtocol.equals("HTTP")) {
+                    if (reqProtocol != null && reqProtocol.equals("HTTP")) {
                         this.commandObj = handlerPrototype.clone();
                     }
                 }
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 70b46b367..b24d9bfe6 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -88,6 +88,7 @@ import de.anomic.http.server.HeaderFramework;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.RankingProfile;
 import de.anomic.search.RankingProcess;
+import de.anomic.search.SearchEvent;
 import de.anomic.search.Segment;
 import de.anomic.search.Switchboard;
 import de.anomic.search.SwitchboardConstants;
@@ -386,7 +387,7 @@ public final class yacyClient {
             final Segment indexSegment,
             final ResultURLs crawlResults,
             final RankingProcess containerCache,
-            final Map<String, TreeMap<String, String>> abstractCache,
+            final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser,
             final Blacklist blacklist,
             final RankingProfile rankingProfile,
             final Bitfield constraint
@@ -426,7 +427,7 @@ public final class yacyClient {
         post.add(new DefaultCharsetStringPart("maxdist", Integer.toString(maxDistance)));
         post.add(new DefaultCharsetStringPart("profile", crypt.simpleEncode(rankingProfile.toExternalString())));
         post.add(new DefaultCharsetStringPart("constraint", (constraint == null) ? "" : constraint.exportB64()));
-        if (abstractCache != null) post.add(new DefaultCharsetStringPart("abstracts", "auto"));
+        if (secondarySearchSuperviser != null) post.add(new DefaultCharsetStringPart("abstracts", "auto"));
         final long timestamp = System.currentTimeMillis();
 
         // send request
@@ -579,32 +580,31 @@ public final class yacyClient {
         }
 
         // read index abstract
-        if (abstractCache != null) {
+        if (secondarySearchSuperviser != null) {
             final Iterator<Map.Entry<String, String>> i = result.entrySet().iterator();
             Map.Entry<String, String> entry;
-            TreeMap<String, String> singleAbstract;
             String wordhash;
+            String whacc = "";
             ByteBuffer ci;
-            while (i.hasNext()) {
+            int ac = 0;
+            abstractparser: while (i.hasNext()) {
                 entry = i.next();
                 if (entry.getKey().startsWith("indexabstract.")) {
                     wordhash = entry.getKey().substring(14);
-                    synchronized (abstractCache) {
-                        singleAbstract = abstractCache.get(wordhash); // a mapping from url-hashes to a string of peer-hashes
-                        if (singleAbstract == null) singleAbstract = new TreeMap<String, String>();
-                        try {
-                            ci = new ByteBuffer(entry.getValue().getBytes("UTF-8"));
-                        } catch (UnsupportedEncodingException e) {
-                            Log.logException(e);
-                            return -1;
-                        }
-                        //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
-                        ReferenceContainer.decompressIndex(singleAbstract, ci, target.hash);
-                        abstractCache.put(wordhash, singleAbstract);
+                    if (wordhash.charAt(0) == '[') break abstractparser;
+                    whacc += wordhash;
+                    try {
+                        ci = new ByteBuffer(entry.getValue().getBytes("UTF-8"));
+                    } catch (UnsupportedEncodingException e) {
+                        Log.logException(e);
+                        return -1;
                     }
+                    //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
+                    secondarySearchSuperviser.addAbstract(wordhash, ReferenceContainer.decompressIndex(ci, target.hash));
+                    ac++;
                 }
             }
-            if (abstractCache.size() > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + abstractCache.size() + " index abstracts");
+            if (ac > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + ac + " index abstracts for words " + whacc);
         }
 
         // generate statistics
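Note on the yacyClient hunks above (reviewer commentary, not part of the patch): every response entry whose key starts with "indexabstract." carries one word's abstract; substring(14) strips that 14-character prefix, the decompressed url-to-peers map is handed to addAbstract(), and ac/whacc merely collect a count and the word hashes for the log line. A stripped-down sketch of the key handling; the payload shown is hypothetical, since the real value is a compressed index:

    import java.util.Map;
    import java.util.TreeMap;

    public class AbstractHarvestSketch {
        public static void main(String[] args) {
            Map<String, String> result = new TreeMap<String, String>();
            result.put("indexabstract.word0hash###", "{url0hash####}"); // hypothetical payload
            result.put("count", "10"); // unrelated entries are skipped

            int ac = 0;
            String whacc = "";
            for (Map.Entry<String, String> entry : result.entrySet()) {
                if (!entry.getKey().startsWith("indexabstract.")) continue;
                String wordhash = entry.getKey().substring("indexabstract.".length()); // length is 14
                whacc += wordhash; // accumulate word hashes for the log message
                ac++;              // count received abstracts
            }
            System.out.println(ac + " index abstracts for words " + whacc);
        }
    }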
post.add(new DefaultCharsetStringPart("profile", crypt.simpleEncode(rankingProfile.toExternalString()))); post.add(new DefaultCharsetStringPart("constraint", (constraint == null) ? "" : constraint.exportB64())); - if (abstractCache != null) post.add(new DefaultCharsetStringPart("abstracts", "auto")); + if (secondarySearchSuperviser != null) post.add(new DefaultCharsetStringPart("abstracts", "auto")); final long timestamp = System.currentTimeMillis(); // send request @@ -579,32 +580,31 @@ public final class yacyClient { } // read index abstract - if (abstractCache != null) { + if (secondarySearchSuperviser != null) { final Iterator> i = result.entrySet().iterator(); Map.Entry entry; - TreeMap singleAbstract; String wordhash; + String whacc = ""; ByteBuffer ci; - while (i.hasNext()) { + int ac = 0; + abstractparser: while (i.hasNext()) { entry = i.next(); if (entry.getKey().startsWith("indexabstract.")) { wordhash = entry.getKey().substring(14); - synchronized (abstractCache) { - singleAbstract = abstractCache.get(wordhash); // a mapping from url-hashes to a string of peer-hashes - if (singleAbstract == null) singleAbstract = new TreeMap(); - try { - ci = new ByteBuffer(entry.getValue().getBytes("UTF-8")); - } catch (UnsupportedEncodingException e) { - Log.logException(e); - return -1; - } - //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString()); - ReferenceContainer.decompressIndex(singleAbstract, ci, target.hash); - abstractCache.put(wordhash, singleAbstract); + if (wordhash.charAt(0) == '[') break abstractparser; + whacc += wordhash; + try { + ci = new ByteBuffer(entry.getValue().getBytes("UTF-8")); + } catch (UnsupportedEncodingException e) { + Log.logException(e); + return -1; } + //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString()); + secondarySearchSuperviser.addAbstract(wordhash, ReferenceContainer.decompressIndex(ci, target.hash)); + ac++; } } - if (abstractCache.size() > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + abstractCache.size() + " index abstracts"); + if (ac > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + ac + " index abstracts for words "+ whacc); } // generate statistics diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 19d10ca96..dade258c3 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -41,6 +41,7 @@ import de.anomic.crawler.ResultURLs; import de.anomic.search.QueryParams; import de.anomic.search.RankingProfile; import de.anomic.search.RankingProcess; +import de.anomic.search.SearchEvent; import de.anomic.search.Segment; import de.anomic.yacy.dht.PeerSelection; @@ -53,7 +54,7 @@ public class yacySearch extends Thread { final private int partitions; final private Segment indexSegment; final private RankingProcess containerCache; - final private Map> abstractCache; + final private SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser; final private Blacklist blacklist; final private yacySeed targetPeer; private int urls; @@ -79,7 +80,7 @@ public class yacySearch extends Thread { final yacySeedDB peers, final ResultURLs crawlResults, final RankingProcess containerCache, - final Map> abstractCache, + final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser, final Blacklist blacklist, final RankingProfile rankingProfile, final Bitfield constraint) { @@ -100,7 +101,7 @@ public class yacySearch 
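Note on the last yacySearch hunk above (reviewer commentary, not part of the patch): secondaryRemoteSearch() now passes null instead of an empty TreeMap, and yacyClient.search() only adds the "abstracts" POST parameter when the superviser reference is non-null, so secondary searches no longer ask peers for index abstracts; only primary searches do. A sketch of that switch, with illustrative names:

    public class AbstractsFlagSketch {
        // simplified stand-in for the null test in yacyClient.search(), which
        // adds an "abstracts=auto" form part only when a superviser is present
        static boolean requestAbstracts(Object secondarySearchSuperviser) {
            return secondarySearchSuperviser != null;
        }

        public static void main(String[] args) {
            System.out.println(requestAbstracts(new Object())); // primary search: true
            System.out.println(requestAbstracts(null));         // secondary search: false
        }
    }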
diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainer.java b/source/net/yacy/kelondro/rwi/ReferenceContainer.java
index 5d5c81479..7f3a0442b 100644
--- a/source/net/yacy/kelondro/rwi/ReferenceContainer.java
+++ b/source/net/yacy/kelondro/rwi/ReferenceContainer.java
@@ -554,7 +554,8 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
         return bb;
     }
     
-    public static final void decompressIndex(final TreeMap<String, String> target, ByteBuffer ci, final String peerhash) {
+    public static final TreeMap<String, String> decompressIndex(ByteBuffer ci, final String peerhash) {
+        TreeMap<String, String> target = new TreeMap<String, String>();
         // target is a mapping from url-hashes to a string of peer-hashes
         if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
             //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
@@ -579,5 +580,7 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
                 if (ci.byteAt(0) == ',') ci.trim(1);
             }
         }
+        return target;
     }
+    
 }
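Note on the ReferenceContainer hunk above (reviewer commentary, not part of the patch): decompressIndex() now builds and returns a fresh url-to-peers map instead of filling a caller-supplied one, which is what lets yacyClient feed the result straight into addAbstract() and drop the old synchronized(abstractCache) block, since merging is now centralized in the superviser. The sketch below assumes a simplified wire format, a braced comma-separated list of url hashes attributed to the sending peer, inferred only from the brace and comma handling visible in the hunk; the real method handles more than this:

    import java.util.TreeMap;

    public class DecompressSketch {
        // hypothetical, simplified re-implementation for illustration
        static TreeMap<String, String> decompress(String ci, String peerhash) {
            TreeMap<String, String> target = new TreeMap<String, String>();
            if (ci.startsWith("{") && ci.endsWith("}")) {
                for (String urlhash : ci.substring(1, ci.length() - 1).split(",")) {
                    if (urlhash.length() > 0) target.put(urlhash, peerhash); // attribute url to sender
                }
            }
            return target;
        }

        public static void main(String[] args) {
            System.out.println(decompress("{url0hash####,url1hash####}", "peerAhash###"));
            // prints {url0hash####=peerAhash###, url1hash####=peerAhash###}
        }
    }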