From 82a6054275ca9a37ff7de5ce92f73140b04f08c5 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 11 Sep 2006 10:39:25 +0000 Subject: [PATCH] - fixed bug with new indexAbstract generation - added partial evaluation of indexAbstracts during remote searches git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2544 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/search.java | 5 +- source/de/anomic/index/indexContainer.java | 2 - .../de/anomic/index/indexRowSetContainer.java | 39 ----------- source/de/anomic/index/indexURL.java | 66 ++++++++++++++++++- .../de/anomic/plasma/plasmaSearchEvent.java | 37 ++++++----- .../de/anomic/plasma/plasmaSearchQuery.java | 1 + source/de/anomic/server/serverByteBuffer.java | 23 ++++--- source/de/anomic/yacy/yacyClient.java | 23 ++++++- source/de/anomic/yacy/yacySearch.java | 16 +++-- 9 files changed, 135 insertions(+), 77 deletions(-) diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 01bca65f4..fec0346c4 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -55,6 +55,7 @@ import java.util.Set; import de.anomic.http.httpHeader; import de.anomic.index.indexContainer; import de.anomic.index.indexEntryAttribute; +import de.anomic.index.indexURL; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchRankingProfile; @@ -158,8 +159,8 @@ public final class search { if ((maxcounthash == null) || (urls.length() != 0)) { prop.put("indexabstract",""); } else { - String indexabstract = "indexabstract." + maxcounthash + "=" + ((indexContainer) containers.get(maxcounthash)).compressedIndex(1000); - yacyCore.log.logFine("DEBUG HASH SEARCH: " + indexabstract); + String indexabstract = "indexabstract."
+ maxcounthash + "=" + indexURL.compressIndex(((indexContainer) containers.get(maxcounthash)), 1000).toString(); + //yacyCore.log.logFine("DEBUG HASH SEARCH: " + indexabstract); prop.put("indexabstract", indexabstract); } diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index 4e47a9efe..3665c9c99 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -32,7 +32,6 @@ import java.util.Iterator; import java.util.Set; import de.anomic.kelondro.kelondroOrder; -import de.anomic.server.serverByteBuffer; public interface indexContainer { @@ -44,7 +43,6 @@ public interface indexContainer { public void setWordHash(String newWordHash); public String getWordHash(); - public serverByteBuffer compressedIndex(long maxtime); public void select(Set urlselection); public void setOrdering(kelondroOrder newOrder, int newColumn); diff --git a/source/de/anomic/index/indexRowSetContainer.java b/source/de/anomic/index/indexRowSetContainer.java index 194beb5c4..d74bb6262 100644 --- a/source/de/anomic/index/indexRowSetContainer.java +++ b/source/de/anomic/index/indexRowSetContainer.java @@ -31,7 +31,6 @@ import java.util.Collection; import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.Set; -import java.util.Map; import java.util.TreeMap; import de.anomic.kelondro.kelondroBase64Order; @@ -39,7 +38,6 @@ import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroOrder; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowSet; -import de.anomic.server.serverByteBuffer; public class indexRowSetContainer extends kelondroRowSet implements indexContainer { @@ -67,43 +65,6 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain return newContainer; } - public serverByteBuffer compressedIndex(long maxtime) { - // collect references according to domains - long timeout = (maxtime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxtime; - TreeMap doms = new TreeMap(); - synchronized(this) { - Iterator i = entries(); - indexEntry iEntry; - String dom, paths; - while (i.hasNext()) { - iEntry = (indexEntry) i.next(); - dom = iEntry.urlHash().substring(6); - if ((paths = (String) doms.get(dom)) == null) { - doms.put(dom, iEntry.urlHash().substring(0, 6)); - } else { - doms.put(dom, paths + iEntry.urlHash().substring(0, 6)); - } - if (System.currentTimeMillis() > timeout) break; - } - } - // construct a result string - serverByteBuffer bb = new serverByteBuffer(this.size() * indexURLEntry.urlEntryRow.width(0) / 2); - bb.append('{'); - Iterator i = doms.entrySet().iterator(); - Map.Entry entry; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - bb.append((String) entry.getKey()); - bb.append(':'); - bb.append((String) entry.getValue()); - if (System.currentTimeMillis() > timeout) break; - if (i.hasNext()) bb.append(','); - } - bb.append('}'); - bb.trim(); - return bb; - } - public void setWordHash(String newWordHash) { this.wordHash = newWordHash; } diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index 375cfec5f..947d527a0 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -33,12 +33,15 @@ import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroRAMIndex; import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroRow; +import de.anomic.server.serverByteBuffer; import de.anomic.server.serverCodings; import de.anomic.yacy.yacySeedDB; @@ -586,7 +589,7 @@ public class indexURL { private static String[] testTLDs = new String[] {"com", "net", "org", "uk", "fr", "de", "es", "it"}; public static final URL 
probablyWordURL(String urlHash, String word) { - if (word == null) return null; + if ((word == null) || (word.length() == 0)) return null; String pattern = urlHash.substring(6, 11); for (int i = 0; i < testTLDs.length; i++) { if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80))) @@ -635,4 +638,65 @@ public class indexURL { return hash; } + + public static final serverByteBuffer compressIndex(indexContainer container, long maxtime) { + // collect references according to domains + long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; + TreeMap doms = new TreeMap(); + synchronized(container) { + Iterator i = container.entries(); + indexEntry iEntry; + String dom, paths; + while (i.hasNext()) { + iEntry = (indexEntry) i.next(); + dom = iEntry.urlHash().substring(6); + if ((paths = (String) doms.get(dom)) == null) { + doms.put(dom, iEntry.urlHash().substring(0, 6)); + } else { + doms.put(dom, paths + iEntry.urlHash().substring(0, 6)); + } + if (System.currentTimeMillis() > timeout) break; + } + } + // construct a result string + serverByteBuffer bb = new serverByteBuffer(container.size() * 6); + bb.append('{'); + Iterator i = doms.entrySet().iterator(); + Map.Entry entry; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + bb.append((String) entry.getKey()); + bb.append(':'); + bb.append((String) entry.getValue()); + if (System.currentTimeMillis() > timeout) break; + if (i.hasNext()) bb.append(','); + } + bb.append('}'); + bb.trim(); + return bb; + } + + public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) { + // target is a mapping from url-hashes to a string of peer-hashes + if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) { + ci = ci.trim(1, ci.length() - 1); + String dom, url, peers; + while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) { + dom = ci.toString(0, 6); + ci.trim(7); + while ((ci.length() == 6) || ((ci.length() > 
6) && (ci.byteAt(6) != ','))) { + url = ci.toString(0, 6) + dom; + ci.trim(6); + peers = (String) target.get(url); + if (peers == null) { + target.put(url, peerhash); + } else { + target.put(url, peers + peerhash); + } + } + if (ci.byteAt(0) == ',') ci.trim(1); + } + } + } + } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 384c54e32..3670a1eaf 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -47,6 +47,7 @@ import java.util.Iterator; import java.util.Map; import java.util.HashSet; import java.util.Set; +import java.util.TreeMap; import de.anomic.kelondro.kelondroException; import de.anomic.server.logging.serverLog; @@ -67,8 +68,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { private plasmaWordIndex wordIndex; private plasmaCrawlLURL urlStore; private plasmaSnippetCache snippetCache; - private indexContainer rcGlobal; // cache for results - private int rcGlobalCount; + private indexContainer rcContainers; // cache for results + private int rcContainerCount; + private Map rcAbstracts; // cache for index abstracts private plasmaSearchTimingProfile profileLocal, profileGlobal; private boolean postsort; private yacySearch[] searchThreads; @@ -88,8 +90,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.ranking = ranking; this.urlStore = urlStore; this.snippetCache = snippetCache; - this.rcGlobal = new indexRowSetContainer(null); - this.rcGlobalCount = 0; + this.rcContainers = new indexRowSetContainer(null); + this.rcContainerCount = 0; + this.rcAbstracts = new TreeMap(); this.profileLocal = localTiming; this.profileGlobal = remoteTiming; this.postsort = postsort; @@ -130,7 +133,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + 
profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS"); long timeout = System.currentTimeMillis() + profileGlobal.duetime(); - searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); + searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcContainers, rcAbstracts, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); // meanwhile do a local search indexContainer rcLocal = localSearchJoin(localSearchContainers(null).values()); @@ -145,7 +148,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // wait a little time .. try {Thread.sleep(100);} catch (InterruptedException e) {} } - int globalContributions = rcGlobal.size(); + int globalContributions = rcContainers.size(); // finished searching log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); @@ -222,7 +225,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { profileLocal.startTimer(); long pst = System.currentTimeMillis(); searchResult.add(rcLocal, preorderTime); - searchResult.add(rcGlobal, preorderTime); + searchResult.add(rcContainers, preorderTime); preorderTime = preorderTime - (System.currentTimeMillis() - pst); if (preorderTime < 0) preorderTime = 200; plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime); @@ -352,10 +355,10 @@ public final class plasmaSearchEvent extends Thread implements Runnable { //log.logFine("FINISHED FLUSH RESULTS PROCESS for query " + query.hashes(",")); } - serverLog.logFine("PLASMA", "FINISHED FLUSHING " + rcGlobalCount + " GLOBAL SEARCH RESULTS FOR SEARCH " + 
query.queryWords); + serverLog.logFine("PLASMA", "FINISHED FLUSHING " + rcContainerCount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords); // finally delete the temporary index - rcGlobal = null; + rcContainers = null; flushThreads.remove(this); } @@ -364,24 +367,24 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // flush the rcGlobal as much as is there so far // this must be called sometime after search results had been computed int count = 0; - if ((rcGlobal != null) && (rcGlobal.size() > 0)) { - synchronized (rcGlobal) { + if ((rcContainers != null) && (rcContainers.size() > 0)) { + synchronized (rcContainers) { String wordHash; Iterator hashi = query.queryHashes.iterator(); boolean dhtCache = false; while (hashi.hasNext()) { wordHash = (String) hashi.next(); - rcGlobal.setWordHash(wordHash); + rcContainers.setWordHash(wordHash); dhtCache = dhtCache | wordIndex.busyCacheFlush; - wordIndex.addEntries(rcGlobal, System.currentTimeMillis(), dhtCache); - log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries to " + ((dhtCache) ? "DHT cache" : "word cache")); + wordIndex.addEntries(rcContainers, System.currentTimeMillis(), dhtCache); + log.logFine("FLUSHED " + wordHash + ": " + rcContainers.size() + " url entries to " + ((dhtCache) ? 
"DHT cache" : "word cache")); } // the rcGlobal was flushed, empty it - count += rcGlobal.size(); - rcGlobal.clear(); + count += rcContainers.size(); + rcContainers.clear(); } } - rcGlobalCount += count; + rcContainerCount += count; } } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index d6077c2c2..9bf829242 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -137,6 +137,7 @@ public final class plasmaSearchQuery { } public String words(String separator) { + if (queryWords == null) return ""; StringBuffer result = new StringBuffer(8 * queryWords.size()); Iterator i = queryWords.iterator(); if (i.hasNext()) result.append((String) i.next()); diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java index 6db3f2c4f..64a17898b 100644 --- a/source/de/anomic/server/serverByteBuffer.java +++ b/source/de/anomic/server/serverByteBuffer.java @@ -274,16 +274,15 @@ public final class serverByteBuffer extends OutputStream { return tmp; } - /* - private serverByteBuffer trim(int start) { - if (start > length) throw new IndexOutOfBoundsException("trim: start > length"); - offset = offset + start; - length = length - start; - return this; - } - */ - - private serverByteBuffer trim(int start, int end) { + public serverByteBuffer trim(int start) { + // the end value is outside (+1) of the wanted target array + if (start > length) throw new IndexOutOfBoundsException("trim: start > length"); + offset = offset + start; + length = length - start; + return this; + } + + public serverByteBuffer trim(int start, int end) { // the end value is outside (+1) of the wanted target array if (start > length) throw new IndexOutOfBoundsException("trim: start > length"); if (end > length) throw new IndexOutOfBoundsException("trim: end > length"); @@ -347,6 +346,10 @@ public final class serverByteBuffer extends OutputStream { 
return new String(buffer, offset, length); } + public String toString(int left, int rightbound) { + return new String(buffer, offset + left, rightbound - left); + } + public Properties propParser() { // extract a=b or a="b" - relations from the buffer int pos = offset; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index d301612ad..1fa7ab752 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -48,6 +48,8 @@ import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; +import java.util.TreeMap; +import java.util.Map; import java.util.Iterator; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -56,6 +58,7 @@ import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexRowSetContainer; +import de.anomic.index.indexURL; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.net.URL; @@ -65,6 +68,7 @@ import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.urlPattern.plasmaURLPattern; +import de.anomic.server.serverByteBuffer; import de.anomic.server.serverCodings; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; @@ -370,6 +374,7 @@ public final class yacyClient { yacySeed targetPeer, plasmaCrawlLURL urlManager, indexContainer containerCache, + Map abstractCache, plasmaURLPattern blacklist, plasmaSnippetCache snippets, plasmaSearchTimingProfile timingProfile, @@ -524,9 +529,25 @@ public final class yacyClient { } } - // finally insert the containers to the index + // insert the containers to the index for (int m = 0; m < words; m++) { containerCache.add(container[m], -1); } + // read index abstract + Iterator i = result.entrySet().iterator(); + Map.Entry 
entry; + TreeMap singleAbstract; + String wordhash; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + if (((String) entry.getKey()).startsWith("indexabstract.")) { + wordhash = ((String) entry.getKey()).substring(14); + singleAbstract = (TreeMap) abstractCache.get(wordhash); + if (singleAbstract == null) singleAbstract = new TreeMap(); + indexURL.decompressIndex(singleAbstract, new serverByteBuffer(((String) entry.getValue()).getBytes()), targetPeer.hash); + abstractCache.put(wordhash, singleAbstract); + } + } + // generate statistics long searchtime; try { diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 08fbd905b..32b0adfc2 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -46,6 +46,7 @@ package de.anomic.yacy; import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; +import java.util.Map; import java.util.Set; import de.anomic.index.indexContainer; @@ -63,6 +64,7 @@ public class yacySearch extends Thread { final private boolean global; final private plasmaCrawlLURL urlManager; final private indexContainer containerCache; + final private Map abstractCache; final private plasmaURLPattern blacklist; final private plasmaSnippetCache snippetCache; final private yacySeed targetPeer; @@ -72,8 +74,10 @@ public class yacySearch extends Thread { final private plasmaSearchRankingProfile rankingProfile; final private String prefer, filter; - public yacySearch(Set wordhashes, String prefer, String filter, int maxDistance, boolean global, yacySeed targetPeer, - plasmaCrawlLURL urlManager, indexContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, + public yacySearch(Set wordhashes, String prefer, String filter, int maxDistance, + boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, + indexContainer containerCache, Map abstractCache, + plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, 
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { super("yacySearch_" + targetPeer.getName()); this.wordhashes = wordhashes; @@ -82,6 +86,7 @@ public class yacySearch extends Thread { this.global = global; this.urlManager = urlManager; this.containerCache = containerCache; + this.abstractCache = abstractCache; this.blacklist = blacklist; this.snippetCache = snippetCache; this.targetPeer = targetPeer; @@ -92,7 +97,7 @@ public class yacySearch extends Thread { } public void run() { - this.links = yacyClient.search(set2string(wordhashes), prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile); + this.links = yacyClient.search(set2string(wordhashes), prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); if (links != 0) { //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes); yacyCore.seedDB.mySeed.incRI(links); @@ -181,7 +186,8 @@ public class yacySearch extends Thread { return result; } - public static yacySearch[] searchHashes(Set wordhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, indexContainer containerCache, + public static yacySearch[] searchHashes(Set wordhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, + indexContainer containerCache, Map abstractCache, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { // check own peer status @@ -196,7 +202,7 @@ public class yacySearch extends Thread { yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { searchThreads[i]= new yacySearch(wordhashes, prefer, filter, maxDist, true, targetPeers[i], - 
urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile); + urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); searchThreads[i].start(); //try {Thread.sleep(20);} catch (InterruptedException e) {} }