diff --git a/htroot/IndexControlRWIs_p.html b/htroot/IndexControlRWIs_p.html index 565b72c87..25149cefc 100644 --- a/htroot/IndexControlRWIs_p.html +++ b/htroot/IndexControlRWIs_p.html @@ -30,16 +30,16 @@

No entry for word '#[word]#'

::

No entry for word hash #[wordhash]#

::

Search result: -

+ - + - + @@ -53,7 +53,7 @@ - + @@ -69,18 +69,18 @@ - - - - - - - - - - - - + + + + + + + + + + + +
   total URLs appearance in in link type document type
   reference description authorindex of
   #[allurl]# #[reference]# #[description]#
Selection

diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index cd7320736..11accd815 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -35,12 +35,16 @@ #(genUrlProfile)# ::No entry found for URL-hash #[urlhash]# :: - + - + #(referrer)# + + :: + + #(/referrer)# diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index d92076d38..5fdd7013f 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -171,13 +171,7 @@ public class IndexControlURLs_p { return prop; } indexURLEntry.Components comp = entry.comp(); - String referrer = null; indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0); - if (le == null) { - referrer = ""; - } else { - referrer = le.comp().url().toNormalform(false, true); - } if (comp.url() == null) { prop.put("genUrlProfile", "1"); prop.put("genUrlProfile_urlhash", urlhash); @@ -189,7 +183,9 @@ public class IndexControlURLs_p { prop.put("genUrlProfile_urlDescr", comp.title()); prop.put("genUrlProfile_moddate", entry.moddate()); prop.put("genUrlProfile_loaddate", entry.loaddate()); - prop.putHTML("genUrlProfile_referrer", referrer); + prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1); + prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "" : le.comp().url().toNormalform(false, true)); + prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : le.hash()); prop.put("genUrlProfile_doctype", ""+entry.doctype()); prop.put("genUrlProfile_language", entry.language()); prop.put("genUrlProfile_size", entry.size()); diff --git a/htroot/PerformanceSearch_p.java b/htroot/PerformanceSearch_p.java index 349dc1888..d747746ca 100644 --- a/htroot/PerformanceSearch_p.java +++ b/htroot/PerformanceSearch_p.java @@ -24,13 +24,12 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - import java.util.Iterator; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSearchEvent; -import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.server.serverObjects; +import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; public class PerformanceSearch_p { @@ -47,9 +46,9 @@ public class PerformanceSearch_p { Iterator events = se.getProcess().events(); int c = 0; - plasmaSearchProcessing.Entry event; + serverProfiling.Entry event; while (events.hasNext()) { - event = (plasmaSearchProcessing.Entry) events.next(); + event = (serverProfiling.Entry) events.next(); prop.put("table_" + c + "_event", event.process); prop.putNum("table_" + c + "_count", event.count); prop.putNum("table_" + c + "_time", event.time); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 0ddf35061..e3820db35 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -43,10 +43,10 @@ import de.anomic.net.natLib; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; -import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; +import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNetwork; @@ -128,7 +128,7 @@ public final class search { int indexabstractContainercount = 0; int joincount = 0; plasmaSearchQuery theQuery = null; - plasmaSearchProcessing localProcess = null; + serverProfiling localProcess = null; ArrayList accu = null; long urlRetrievalAllTime = 0, snippetComputationAllTime = 0; if ((query.length() == 0) && (abstractSet != null)) { @@ -138,10 +138,12 @@ public final class search { yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); // prepare a search profile - localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults()); + localProcess = new serverProfiling(theQuery.maximumTime, theQuery.displayResults()); //theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, sb.wordIndex, null); - Map[] containers = localProcess.localSearchContainers(theQuery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls)); + localProcess.startTimer(); + Map[] containers = sb.wordIndex.localSearchContainers(theQuery, plasmaSearchQuery.hashes2Set(urls)); + localProcess.yield(plasmaSearchEvent.COLLECTION, containers[0].size()); if (containers != null) { Iterator ci = containers[0].entrySet().iterator(); Map.Entry entry; @@ -151,7 +153,7 @@ public final class search { wordhash = (String) entry.getKey(); indexContainer container = (indexContainer) entry.getValue(); indexabstractContainercount += container.size(); - indexabstract.append("indexabstract." + wordhash + "=").append(plasmaSearchProcessing.compressIndex(container, null, 1000).toString()).append(serverCore.crlfString); + indexabstract.append("indexabstract." + wordhash + "=").append(indexContainer.compressIndex(container, null, 1000).toString()).append(serverCore.crlfString); } } @@ -168,7 +170,7 @@ public final class search { // prepare a search profile plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile); - localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults()); + localProcess = new serverProfiling(theQuery.maximumTime, theQuery.displayResults()); plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, sb.wordIndex, null, true, abstractSet); urlRetrievalAllTime = theSearch.getURLRetrievalTime(); snippetComputationAllTime = theSearch.getSnippetComputationTime(); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index a78d1cdd0..71c111d96 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -59,12 +59,12 @@ import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; -import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; import de.anomic.server.serverDate; import de.anomic.server.serverObjects; +import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; import de.anomic.server.logging.serverLog; import de.anomic.tools.yFormatter; @@ -268,7 +268,7 @@ public class yacysearch { 20, constraint, false); - plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * theQuery.maximumTime / 10, theQuery.displayResults()); + serverProfiling localTiming = new serverProfiling(4 * theQuery.maximumTime / 10, theQuery.displayResults()); String client = (String) header.get("CLIENTIP"); // the search client who initiated the search diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index df8f30263..43e906124 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -30,12 +30,15 @@ import java.lang.reflect.Method; import java.util.Collection; import java.util.ConcurrentModificationException; import java.util.Iterator; +import java.util.Map; import java.util.Set; import java.util.TreeMap; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowSet; +import de.anomic.plasma.plasmaWordIndex; +import de.anomic.server.serverByteBuffer; public class indexContainer extends kelondroRowSet { @@ -206,6 +209,23 @@ public class indexContainer extends kelondroRowSet { } } + public static indexContainer joinExcludeContainers( + Collection includeContainers, + Collection excludeContainers, + int maxDistance) { + // join a search result and return the joincount (number of pages after join) + + // since this is a conjunction we return an empty entity if any word is not known + if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0); + + // join the result + indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance); + if (rcLocal == null) return plasmaWordIndex.emptyContainer(null, 0); + excludeContainers(rcLocal, excludeContainers); + + return rcLocal; + } + public static indexContainer joinContainers(Collection containers, int maxDistance) { // order entities by their size @@ -433,4 +453,71 @@ public class indexContainer extends kelondroRowSet { return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); } + + public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) { + // collect references according to domains + long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; + TreeMap doms = new TreeMap(); + synchronized (inputContainer) { + Iterator i = inputContainer.entries(); + indexRWIEntry iEntry; + String dom, paths; + while (i.hasNext()) { + iEntry = (indexRWIEntry) i.next(); + if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer + dom = iEntry.urlHash().substring(6); + if ((paths = (String) doms.get(dom)) == null) { + doms.put(dom, iEntry.urlHash().substring(0, 6)); + } else { + doms.put(dom, paths + iEntry.urlHash().substring(0, 6)); + } + if (System.currentTimeMillis() > timeout) + break; + } + } + // construct a result string + serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6); + bb.append('{'); + Iterator i = doms.entrySet().iterator(); + Map.Entry entry; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + bb.append((String) entry.getKey()); + bb.append(':'); + bb.append((String) entry.getValue()); + if (System.currentTimeMillis() > timeout) + break; + if (i.hasNext()) + bb.append(','); + } + bb.append('}'); + return bb; + } + + public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) { + // target is a mapping from url-hashes to a string of peer-hashes + if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) { + //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString()); + ci = ci.trim(1, ci.length() - 2); + String dom, url, peers; + while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) { + assert ci.length() >= 6 : "ci.length() = " + ci.length(); + dom = ci.toString(0, 6); + ci.trim(7); + while ((ci.length() > 0) && (ci.byteAt(0) != ',')) { + assert ci.length() >= 6 : "ci.length() = " + ci.length(); + url = ci.toString(0, 6) + dom; + ci.trim(6); + peers = (String) target.get(url); + if (peers == null) { + target.put(url, peerhash); + } else { + target.put(url, peers + peerhash); + } + //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url)); + } + if (ci.byteAt(0) == ',') ci.trim(1); + } + } + } } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index a05ed6383..7de4a6a55 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -41,6 +41,7 @@ import de.anomic.index.indexRWIEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroMSetTools; +import de.anomic.server.serverProfiling; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyDHTAction; @@ -50,7 +51,13 @@ import de.anomic.yacy.yacyURL; public final class plasmaSearchEvent { - public static int workerThreadCount = 10; + public static final String COLLECTION = "collection"; + public static final String JOIN = "join"; + public static final String PRESORT = "presort"; + public static final String URLFETCH = "urlfetch"; + public static final String NORMALIZING = "normalizing"; + + public static int workerThreadCount = 3; public static String lastEventID = ""; private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes @@ -62,7 +69,7 @@ public final class plasmaSearchEvent { private plasmaWordIndex wordIndex; private plasmaSearchRankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation - private plasmaSearchProcessing process; + private serverProfiling process; private yacySearch[] primarySearchThreads, secondarySearchThreads; private Thread localSearchThread; private TreeMap preselectedPeerHashes; @@ -80,7 +87,7 @@ public final class plasmaSearchEvent { private plasmaSearchEvent(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, - plasmaSearchProcessing localTiming, + serverProfiling localTiming, plasmaWordIndex wordIndex, TreeMap preselectedPeerHashes, boolean generateAbstracts, @@ -117,13 +124,13 @@ public final class plasmaSearchEvent { long start = System.currentTimeMillis(); if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) || (query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) { + // do a global search this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation); int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds if (fetchpeers > 50) fetchpeers = 50; if (fetchpeers < 30) fetchpeers = 30; - // do a global search // the result of the fetch is then in the rcGlobal process.startTimer(); serverLog.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); @@ -152,7 +159,10 @@ public final class plasmaSearchEvent { // finished searching serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); } else { - Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null); + // do a local search + process.startTimer(); + Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null); + process.yield(COLLECTION, searchContainerMaps[0].size()); if (generateAbstracts) { // compute index abstracts @@ -178,18 +188,21 @@ public final class plasmaSearchEvent { IAneardhthash = wordhash; } IACount.put(wordhash, new Integer(container.size())); - IAResults.put(wordhash, plasmaSearchProcessing.compressIndex(container, null, 1000).toString()); + IAResults.put(wordhash, indexContainer.compressIndex(container, null, 1000).toString()); } process.yield("abstract generation", searchContainerMaps[0].size()); } + process.startTimer(); indexContainer rcLocal = (searchContainerMaps == null) ? plasmaWordIndex.emptyContainer(null, 0) : - process.localSearchJoinExclude( + indexContainer.joinExcludeContainers( searchContainerMaps[0].values(), searchContainerMaps[1].values(), query.maxDistance); + process.yield(JOIN, rcLocal.size()); + this.localcount = rcLocal.size(); this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation); this.rankedCache.insert(rcLocal, true); @@ -247,7 +260,9 @@ public final class plasmaSearchEvent { public void run() { // do a local search - Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null); + process.startTimer(); + Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null); + process.yield(COLLECTION, searchContainerMaps[0].size()); // use the search containers to fill up rcAbstracts locally /* @@ -275,13 +290,15 @@ public final class plasmaSearchEvent { */ // join and exlcude the local result + process.startTimer(); indexContainer rcLocal = (searchContainerMaps == null) ? plasmaWordIndex.emptyContainer(null, 0) : - process.localSearchJoinExclude( + indexContainer.joinExcludeContainers( searchContainerMaps[0].values(), searchContainerMaps[1].values(), query.maxDistance); + process.yield(JOIN, rcLocal.size()); localcount = rcLocal.size(); // sort the local containers and truncate it to a limited count, @@ -454,7 +471,7 @@ public final class plasmaSearchEvent { return ranking; } - public plasmaSearchProcessing getProcess() { + public serverProfiling getProcess() { return process; } @@ -490,7 +507,7 @@ public final class plasmaSearchEvent { public static plasmaSearchEvent getEvent(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, - plasmaSearchProcessing localTiming, + serverProfiling localTiming, plasmaWordIndex wordIndex, TreeMap preselectedPeerHashes, boolean generateAbstracts, diff --git a/source/de/anomic/plasma/plasmaSearchProcessing.java b/source/de/anomic/plasma/plasmaSearchProcessing.java deleted file mode 100644 index 8f417bc38..000000000 --- a/source/de/anomic/plasma/plasmaSearchProcessing.java +++ /dev/null @@ -1,253 +0,0 @@ -// plasmaSearchProcessing.java -// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 17.10.2005 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.plasma; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; - -import de.anomic.index.indexContainer; -import de.anomic.index.indexRWIEntry; -import de.anomic.server.serverByteBuffer; - -/** - * - * This class provides search processes and keeps a timing record of the processes - * It shall be used to initiate a search and also to evaluate - * the real obtained timings after a search is performed - */ - -public class plasmaSearchProcessing implements Cloneable { - - // collection: - // time = time to get a RWI out of RAM cache, assortments and WORDS files - // count = maximum number of RWI-entries that shall be collected - - // join - // time = time to perform the join between all collected RWIs - // count = maximum number of entries that shall be joined - - // presort: - // time = time to do a sort of the joined URL-records - // count = maximum number of entries that shall be pre-sorted - - // urlfetch: - // time = time to fetch the real URLs from the LURL database - // count = maximum number of urls that shall be fetched - - // postsort: - // time = time for final sort of URLs - // count = maximum number oof URLs that shall be retrieved during sort - - // snippetfetch: - // time = time to fetch snippets for selected URLs - // count = maximum number of snipptes to be fetched - - public static final String COLLECTION = "collection"; - public static final String JOIN = "join"; - public static final String PRESORT = "presort"; - public static final String URLFETCH = "urlfetch"; - - private static final long minimumTargetTime = 100; - - private long targetTime; - private int targetCount; - private ArrayList yield; - private long timer; - - private plasmaSearchProcessing() { - targetTime = minimumTargetTime; - targetCount = 10; - yield = new ArrayList(); - timer = 0; - } - - public plasmaSearchProcessing(long time, int count) { - this(); - this.targetTime = time; - this.targetCount = count; - } - - public static class Entry { - public String process; - public int count; - public long time; - public Entry(String process, int count, long time) { - this.process = process; - this.count = count; - this.time = time; - } - } - - public int getTargetCount() { - return this.targetCount; - } - - public long getTargetTime() { - return this.targetTime; - } - - public void startTimer() { - this.timer = System.currentTimeMillis(); - } - - public void yield(String s, int count) { - long t = System.currentTimeMillis() - this.timer; - Entry e = new Entry(s, count, t); - yield.add(e); - } - - public Iterator events() { - // iteratese Entry-type Objects - return yield.iterator(); - } - - public int size() { - // returns number of events / Entry-Objects in yield array - return yield.size(); - } - - public Map[] localSearchContainers( - plasmaSearchQuery query, - plasmaWordIndex wordIndex, - Set urlselection) { - // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result - - // retrieve entities that belong to the hashes - startTimer(); - Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers( - query.queryHashes, - urlselection, - true, - true); - if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned - Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0)) ? new HashMap() : wordIndex.getContainers( - query.excludeHashes, - urlselection, - true, - true); - yield(plasmaSearchProcessing.COLLECTION, inclusionContainers.size()); - - return new Map[]{inclusionContainers, exclusionContainers}; - } - - public indexContainer localSearchJoinExclude( - Collection includeContainers, - Collection excludeContainers, - int maxDistance) { - // join a search result and return the joincount (number of pages after join) - - // since this is a conjunction we return an empty entity if any word is not known - if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0); - - // join the result - startTimer(); - indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance); - if (rcLocal != null) { - indexContainer.excludeContainers(rcLocal, excludeContainers); - } - if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null, 0); - yield(plasmaSearchProcessing.JOIN, rcLocal.size()); - - return rcLocal; - } - - - - public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) { - // collect references according to domains - long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; - TreeMap doms = new TreeMap(); - synchronized (inputContainer) { - Iterator i = inputContainer.entries(); - indexRWIEntry iEntry; - String dom, paths; - while (i.hasNext()) { - iEntry = (indexRWIEntry) i.next(); - if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer - dom = iEntry.urlHash().substring(6); - if ((paths = (String) doms.get(dom)) == null) { - doms.put(dom, iEntry.urlHash().substring(0, 6)); - } else { - doms.put(dom, paths + iEntry.urlHash().substring(0, 6)); - } - if (System.currentTimeMillis() > timeout) - break; - } - } - // construct a result string - serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6); - bb.append('{'); - Iterator i = doms.entrySet().iterator(); - Map.Entry entry; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - bb.append((String) entry.getKey()); - bb.append(':'); - bb.append((String) entry.getValue()); - if (System.currentTimeMillis() > timeout) - break; - if (i.hasNext()) - bb.append(','); - } - bb.append('}'); - return bb; - } - - public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) { - // target is a mapping from url-hashes to a string of peer-hashes - if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) { - //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString()); - ci = ci.trim(1, ci.length() - 2); - String dom, url, peers; - while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) { - assert ci.length() >= 6 : "ci.length() = " + ci.length(); - dom = ci.toString(0, 6); - ci.trim(7); - while ((ci.length() > 0) && (ci.byteAt(0) != ',')) { - assert ci.length() >= 6 : "ci.length() = " + ci.length(); - url = ci.toString(0, 6) + dom; - ci.trim(6); - peers = (String) target.get(url); - if (peers == null) { - target.put(url, peerhash); - } else { - target.put(url, peers + peerhash); - } - //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url)); - } - if (ci.byteAt(0) == ',') ci.trim(1); - } - } - } - - -} diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 80df289a1..400c6b882 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -44,6 +44,7 @@ import de.anomic.kelondro.kelondroBinSearch; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.serverCodings; import de.anomic.server.serverFileUtils; +import de.anomic.server.serverProfiling; import de.anomic.yacy.yacyURL; public final class plasmaSearchRankingProcess { @@ -56,14 +57,14 @@ public final class plasmaSearchRankingProcess { private plasmaSearchRankingProfile ranking; private int filteredCount; private indexRWIEntryOrder order; - private plasmaSearchProcessing process; + private serverProfiling process; private int maxentries; private int globalcount; private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic private int[] c; // flag counter - public plasmaSearchRankingProcess(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, int maxentries) { + public plasmaSearchRankingProcess(plasmaSearchQuery query, serverProfiling process, plasmaSearchRankingProfile ranking, int maxentries) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime this.pageAcc = new TreeMap(); @@ -91,7 +92,7 @@ public final class plasmaSearchRankingProcess { this.order = new indexRWIEntryOrder(ranking); } this.order.extend(container); - if (process != null) process.yield("normalizing", container.size()); + if (process != null) process.yield(plasmaSearchEvent.NORMALIZING, container.size()); /* container.setOrdering(o, 0); @@ -115,7 +116,7 @@ public final class plasmaSearchRankingProcess { if (iEntry.flags().get(j)) {c[j]++;} } - // kick out entries that are too bad acording to current findings + // kick out entries that are too bad according to current findings r = new Long(order.cardinal(iEntry)); if ((maxentries >= 0) && (pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue; @@ -154,7 +155,7 @@ public final class plasmaSearchRankingProcess { if (container.size() > query.neededResults()) remove(true, true); - if (process != null) process.yield(plasmaSearchProcessing.PRESORT, container.size()); + if (process != null) process.yield(plasmaSearchEvent.PRESORT, container.size()); } public class rIterator implements Iterator { diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index f772caae7..264bb479d 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -389,6 +389,24 @@ public final class plasmaWordIndex implements indexRI { return containers; } + public Map[] localSearchContainers(plasmaSearchQuery query, Set urlselection) { + // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result + + // retrieve entities that belong to the hashes + Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : getContainers( + query.queryHashes, + urlselection, + true, + true); + if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned + Map exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap() : getContainers( + query.excludeHashes, + urlselection, + true, + true); + return new Map[]{inclusionContainers, exclusionContainers}; + } + public Finding retrieveURLs(plasmaSearchQuery query, boolean loadurl, int sortorder, plasmaSearchRankingProfile ranking) { // search for a word hash and generate a list of url links // sortorder: 0 = hash, 1 = url, 2 = ranking diff --git a/source/de/anomic/server/serverProfiling.java b/source/de/anomic/server/serverProfiling.java new file mode 100644 index 000000000..89872caeb --- /dev/null +++ b/source/de/anomic/server/serverProfiling.java @@ -0,0 +1,93 @@ +// serverProfiling.java +// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 17.11.2007 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.server; + +import java.util.ArrayList; +import java.util.Iterator; + +public class serverProfiling implements Cloneable { + + private static final long minimumTargetTime = 100; + private long targetTime; + private int targetCount; + private ArrayList yield; + private long timer; + + private serverProfiling() { + targetTime = minimumTargetTime; + targetCount = 10; + yield = new ArrayList(); + timer = 0; + } + + public serverProfiling(long time, int count) { + this(); + this.targetTime = time; + this.targetCount = count; + } + + public static class Entry { + public String process; + public int count; + public long time; + + public Entry(String process, int count, long time) { + this.process = process; + this.count = count; + this.time = time; + } + } + + public int getTargetCount() { + return this.targetCount; + } + + public long getTargetTime() { + return this.targetTime; + } + + public void startTimer() { + this.timer = System.currentTimeMillis(); + } + + public void yield(String s, int count) { + long t = System.currentTimeMillis() - this.timer; + Entry e = new Entry(s, count, t); + yield.add(e); + } + + public Iterator events() { + // iteratese Entry-type Objects + return yield.iterator(); + } + + public int size() { + // returns number of events / Entry-Objects in yield array + return yield.size(); + } + +} diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index babbea9b0..6a40273a1 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -60,7 +60,6 @@ import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaSearchRankingProcess; -import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; @@ -549,7 +548,7 @@ public final class yacyClient { if (singleAbstract == null) singleAbstract = new TreeMap(); ci = new serverByteBuffer(((String) entry.getValue()).getBytes()); //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString()); - plasmaSearchProcessing.decompressIndex(singleAbstract, ci, target.hash); + indexContainer.decompressIndex(singleAbstract, ci, target.hash); abstractCache.put(wordhash, singleAbstract); } } diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java index 60168114f..6bc14649d 100644 --- a/source/de/anomic/yacy/yacySeed.java +++ b/source/de/anomic/yacy/yacySeed.java @@ -670,7 +670,7 @@ public class yacySeed { return type.equals(yacySeed.PEERTYPE_SENIOR) || type.equals(yacySeed.PEERTYPE_PRINCIPAL); } - public static final long minDHTNumber = kelondroBase64Order.enhancedCoder.cardinal("AAAAAAAAAAAA".getBytes()); + public static final long minDHTNumber = kelondroBase64Order.enhancedCoder.cardinal(kelondroBase64Order.zero(12)); public static final long maxDHTDistance = Long.MAX_VALUE; public double dhtPosition() {
URL String#[urlNormalform]#
URL String#[urlNormalform]#
Hash#[urlhash]#
Description#[urlDescr]#
Modified-Date#[moddate]#
Loaded-Date#[loaddate]#
Referrer#[referrer]#
Referrerunknown
Referrer#[url]#
Doctype#[doctype]#
Language#[language]#
Size#[size]#