From 6f1308da2f6cfff80dbf378b575c966f094261cf Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sat, 17 Nov 2007 01:53:02 +0000
Subject: [PATCH] - some enhancements to IndexControlURLs (shows more links,
connects referrer to another query) - some refactoring to search process
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4222 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/IndexControlRWIs_p.html | 32 +--
htroot/IndexControlURLs_p.html | 8 +-
htroot/IndexControlURLs_p.java | 10 +-
htroot/PerformanceSearch_p.java | 7 +-
htroot/yacy/search.java | 14 +-
htroot/yacysearch.java | 4 +-
source/de/anomic/index/indexContainer.java | 87 ++++++
.../de/anomic/plasma/plasmaSearchEvent.java | 39 ++-
.../anomic/plasma/plasmaSearchProcessing.java | 253 ------------------
.../plasma/plasmaSearchRankingProcess.java | 11 +-
source/de/anomic/plasma/plasmaWordIndex.java | 18 ++
source/de/anomic/server/serverProfiling.java | 93 +++++++
source/de/anomic/yacy/yacyClient.java | 3 +-
source/de/anomic/yacy/yacySeed.java | 2 +-
14 files changed, 272 insertions(+), 309 deletions(-)
delete mode 100644 source/de/anomic/plasma/plasmaSearchProcessing.java
create mode 100644 source/de/anomic/server/serverProfiling.java
diff --git a/htroot/IndexControlRWIs_p.html b/htroot/IndexControlRWIs_p.html
index 565b72c87..25149cefc 100644
--- a/htroot/IndexControlRWIs_p.html
+++ b/htroot/IndexControlRWIs_p.html
@@ -30,16 +30,16 @@
No entry for word '#[word]#'
::
No entry for word hash #[wordhash]#
::
Search result:
-
diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html
index cd7320736..11accd815 100644
--- a/htroot/IndexControlURLs_p.html
+++ b/htroot/IndexControlURLs_p.html
@@ -35,12 +35,16 @@
#(genUrlProfile)#
::No entry found for URL-hash #[urlhash]#
::
- URL String | #[urlNormalform]# |
+ URL String | #[urlNormalform]# |
Hash | #[urlhash]# |
Description | #[urlDescr]# |
Modified-Date | #[moddate]# |
Loaded-Date | #[loaddate]# |
- Referrer | #[referrer]# |
+ #(referrer)#
+ Referrer | unknown |
+ ::
+ Referrer | #[url]# |
+ #(/referrer)#
Doctype | #[doctype]# |
Language | #[language]# |
Size | #[size]# |
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java
index d92076d38..5fdd7013f 100644
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -171,13 +171,7 @@ public class IndexControlURLs_p {
return prop;
}
indexURLEntry.Components comp = entry.comp();
- String referrer = null;
indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0);
- if (le == null) {
- referrer = "";
- } else {
- referrer = le.comp().url().toNormalform(false, true);
- }
if (comp.url() == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);
@@ -189,7 +183,9 @@ public class IndexControlURLs_p {
prop.put("genUrlProfile_urlDescr", comp.title());
prop.put("genUrlProfile_moddate", entry.moddate());
prop.put("genUrlProfile_loaddate", entry.loaddate());
- prop.putHTML("genUrlProfile_referrer", referrer);
+ prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1);
+ prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "" : le.comp().url().toNormalform(false, true));
+ prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : le.hash());
prop.put("genUrlProfile_doctype", ""+entry.doctype());
prop.put("genUrlProfile_language", entry.language());
prop.put("genUrlProfile_size", entry.size());
diff --git a/htroot/PerformanceSearch_p.java b/htroot/PerformanceSearch_p.java
index 349dc1888..d747746ca 100644
--- a/htroot/PerformanceSearch_p.java
+++ b/htroot/PerformanceSearch_p.java
@@ -24,13 +24,12 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
import java.util.Iterator;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSearchEvent;
-import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.server.serverObjects;
+import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch;
public class PerformanceSearch_p {
@@ -47,9 +46,9 @@ public class PerformanceSearch_p {
Iterator events = se.getProcess().events();
int c = 0;
- plasmaSearchProcessing.Entry event;
+ serverProfiling.Entry event;
while (events.hasNext()) {
- event = (plasmaSearchProcessing.Entry) events.next();
+ event = (serverProfiling.Entry) events.next();
prop.put("table_" + c + "_event", event.process);
prop.putNum("table_" + c + "_count", event.count);
prop.putNum("table_" + c + "_time", event.time);
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 0ddf35061..e3820db35 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -43,10 +43,10 @@ import de.anomic.net.natLib;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
-import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
+import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
@@ -128,7 +128,7 @@ public final class search {
int indexabstractContainercount = 0;
int joincount = 0;
plasmaSearchQuery theQuery = null;
- plasmaSearchProcessing localProcess = null;
+ serverProfiling localProcess = null;
ArrayList accu = null;
long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) {
@@ -138,10 +138,12 @@ public final class search {
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
// prepare a search profile
- localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults());
+ localProcess = new serverProfiling(theQuery.maximumTime, theQuery.displayResults());
//theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, sb.wordIndex, null);
- Map[] containers = localProcess.localSearchContainers(theQuery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls));
+ localProcess.startTimer();
+ Map[] containers = sb.wordIndex.localSearchContainers(theQuery, plasmaSearchQuery.hashes2Set(urls));
+ localProcess.yield(plasmaSearchEvent.COLLECTION, containers[0].size());
if (containers != null) {
Iterator ci = containers[0].entrySet().iterator();
Map.Entry entry;
@@ -151,7 +153,7 @@ public final class search {
wordhash = (String) entry.getKey();
indexContainer container = (indexContainer) entry.getValue();
indexabstractContainercount += container.size();
- indexabstract.append("indexabstract." + wordhash + "=").append(plasmaSearchProcessing.compressIndex(container, null, 1000).toString()).append(serverCore.crlfString);
+ indexabstract.append("indexabstract." + wordhash + "=").append(indexContainer.compressIndex(container, null, 1000).toString()).append(serverCore.crlfString);
}
}
@@ -168,7 +170,7 @@ public final class search {
// prepare a search profile
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile);
- localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults());
+ localProcess = new serverProfiling(theQuery.maximumTime, theQuery.displayResults());
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, sb.wordIndex, null, true, abstractSet);
urlRetrievalAllTime = theSearch.getURLRetrievalTime();
snippetComputationAllTime = theSearch.getSnippetComputationTime();
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index a78d1cdd0..71c111d96 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -59,12 +59,12 @@ import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
-import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
+import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.yFormatter;
@@ -268,7 +268,7 @@ public class yacysearch {
20,
constraint,
false);
- plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * theQuery.maximumTime / 10, theQuery.displayResults());
+ serverProfiling localTiming = new serverProfiling(4 * theQuery.maximumTime / 10, theQuery.displayResults());
String client = (String) header.get("CLIENTIP"); // the search client who initiated the search
diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java
index df8f30263..43e906124 100644
--- a/source/de/anomic/index/indexContainer.java
+++ b/source/de/anomic/index/indexContainer.java
@@ -30,12 +30,15 @@ import java.lang.reflect.Method;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
+import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
+import de.anomic.plasma.plasmaWordIndex;
+import de.anomic.server.serverByteBuffer;
public class indexContainer extends kelondroRowSet {
@@ -206,6 +209,23 @@ public class indexContainer extends kelondroRowSet {
}
}
+ public static indexContainer joinExcludeContainers(
+ Collection includeContainers,
+ Collection excludeContainers,
+ int maxDistance) {
+ // join the include containers, remove all entries found in the exclude containers and return the resulting container
+
+ // since this is a conjunction we return an empty entity if any word is not known
+ if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
+
+ // join the result
+ indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance);
+ if (rcLocal == null) return plasmaWordIndex.emptyContainer(null, 0);
+ excludeContainers(rcLocal, excludeContainers);
+
+ return rcLocal;
+ }
+
public static indexContainer joinContainers(Collection containers, int maxDistance) {
// order entities by their size
@@ -433,4 +453,71 @@ public class indexContainer extends kelondroRowSet {
return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
}
+
+ public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
+ // collect references according to domains
+ long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
+ TreeMap doms = new TreeMap();
+ synchronized (inputContainer) {
+ Iterator i = inputContainer.entries();
+ indexRWIEntry iEntry;
+ String dom, paths;
+ while (i.hasNext()) {
+ iEntry = (indexRWIEntry) i.next();
+ if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
+ dom = iEntry.urlHash().substring(6);
+ if ((paths = (String) doms.get(dom)) == null) {
+ doms.put(dom, iEntry.urlHash().substring(0, 6));
+ } else {
+ doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
+ }
+ if (System.currentTimeMillis() > timeout)
+ break;
+ }
+ }
+ // construct a result string
+ serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6);
+ bb.append('{');
+ Iterator i = doms.entrySet().iterator();
+ Map.Entry entry;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ bb.append((String) entry.getKey());
+ bb.append(':');
+ bb.append((String) entry.getValue());
+ if (System.currentTimeMillis() > timeout)
+ break;
+ if (i.hasNext())
+ bb.append(',');
+ }
+ bb.append('}');
+ return bb;
+ }
+
+ public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
+ // target is a mapping from url-hashes to a string of peer-hashes
+ if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
+ //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
+ ci = ci.trim(1, ci.length() - 2);
+ String dom, url, peers;
+ while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
+ assert ci.length() >= 6 : "ci.length() = " + ci.length();
+ dom = ci.toString(0, 6);
+ ci.trim(7);
+ while ((ci.length() > 0) && (ci.byteAt(0) != ',')) {
+ assert ci.length() >= 6 : "ci.length() = " + ci.length();
+ url = ci.toString(0, 6) + dom;
+ ci.trim(6);
+ peers = (String) target.get(url);
+ if (peers == null) {
+ target.put(url, peerhash);
+ } else {
+ target.put(url, peers + peerhash);
+ }
+ //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
+ }
+ if (ci.byteAt(0) == ',') ci.trim(1);
+ }
+ }
+ }
}
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index a05ed6383..7de4a6a55 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -41,6 +41,7 @@ import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
+import de.anomic.server.serverProfiling;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyDHTAction;
@@ -50,7 +51,13 @@ import de.anomic.yacy.yacyURL;
public final class plasmaSearchEvent {
- public static int workerThreadCount = 10;
+ public static final String COLLECTION = "collection";
+ public static final String JOIN = "join";
+ public static final String PRESORT = "presort";
+ public static final String URLFETCH = "urlfetch";
+ public static final String NORMALIZING = "normalizing";
+
+ public static int workerThreadCount = 3;
public static String lastEventID = "";
private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests
public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes
@@ -62,7 +69,7 @@ public final class plasmaSearchEvent {
private plasmaWordIndex wordIndex;
private plasmaSearchRankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
- private plasmaSearchProcessing process;
+ private serverProfiling process;
private yacySearch[] primarySearchThreads, secondarySearchThreads;
private Thread localSearchThread;
private TreeMap preselectedPeerHashes;
@@ -80,7 +87,7 @@ public final class plasmaSearchEvent {
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
- plasmaSearchProcessing localTiming,
+ serverProfiling localTiming,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes,
boolean generateAbstracts,
@@ -117,13 +124,13 @@ public final class plasmaSearchEvent {
long start = System.currentTimeMillis();
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
+ // do a global search
this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation);
int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds
if (fetchpeers > 50) fetchpeers = 50;
if (fetchpeers < 30) fetchpeers = 30;
- // do a global search
// the result of the fetch is then in the rcGlobal
process.startTimer();
serverLog.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs");
@@ -152,7 +159,10 @@ public final class plasmaSearchEvent {
// finished searching
serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
} else {
- Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null);
+ // do a local search
+ process.startTimer();
+ Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null);
+ process.yield(COLLECTION, searchContainerMaps[0].size());
if (generateAbstracts) {
// compute index abstracts
@@ -178,18 +188,21 @@ public final class plasmaSearchEvent {
IAneardhthash = wordhash;
}
IACount.put(wordhash, new Integer(container.size()));
- IAResults.put(wordhash, plasmaSearchProcessing.compressIndex(container, null, 1000).toString());
+ IAResults.put(wordhash, indexContainer.compressIndex(container, null, 1000).toString());
}
process.yield("abstract generation", searchContainerMaps[0].size());
}
+ process.startTimer();
indexContainer rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
- process.localSearchJoinExclude(
+ indexContainer.joinExcludeContainers(
searchContainerMaps[0].values(),
searchContainerMaps[1].values(),
query.maxDistance);
+ process.yield(JOIN, rcLocal.size());
+
this.localcount = rcLocal.size();
this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation);
this.rankedCache.insert(rcLocal, true);
@@ -247,7 +260,9 @@ public final class plasmaSearchEvent {
public void run() {
// do a local search
- Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null);
+ process.startTimer();
+ Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null);
+ process.yield(COLLECTION, searchContainerMaps[0].size());
// use the search containers to fill up rcAbstracts locally
/*
@@ -275,13 +290,15 @@ public final class plasmaSearchEvent {
*/
// join and exlcude the local result
+ process.startTimer();
indexContainer rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
- process.localSearchJoinExclude(
+ indexContainer.joinExcludeContainers(
searchContainerMaps[0].values(),
searchContainerMaps[1].values(),
query.maxDistance);
+ process.yield(JOIN, rcLocal.size());
localcount = rcLocal.size();
// sort the local containers and truncate it to a limited count,
@@ -454,7 +471,7 @@ public final class plasmaSearchEvent {
return ranking;
}
- public plasmaSearchProcessing getProcess() {
+ public serverProfiling getProcess() {
return process;
}
@@ -490,7 +507,7 @@ public final class plasmaSearchEvent {
public static plasmaSearchEvent getEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
- plasmaSearchProcessing localTiming,
+ serverProfiling localTiming,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes,
boolean generateAbstracts,
diff --git a/source/de/anomic/plasma/plasmaSearchProcessing.java b/source/de/anomic/plasma/plasmaSearchProcessing.java
deleted file mode 100644
index 8f417bc38..000000000
--- a/source/de/anomic/plasma/plasmaSearchProcessing.java
+++ /dev/null
@@ -1,253 +0,0 @@
-// plasmaSearchProcessing.java
-// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 17.10.2005 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
-// $LastChangedRevision: 1986 $
-// $LastChangedBy: orbiter $
-//
-// LICENSE
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-package de.anomic.plasma;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-
-import de.anomic.index.indexContainer;
-import de.anomic.index.indexRWIEntry;
-import de.anomic.server.serverByteBuffer;
-
-/**
- *
- * This class provides search processes and keeps a timing record of the processes
- * It shall be used to initiate a search and also to evaluate
- * the real obtained timings after a search is performed
- */
-
-public class plasmaSearchProcessing implements Cloneable {
-
- // collection:
- // time = time to get a RWI out of RAM cache, assortments and WORDS files
- // count = maximum number of RWI-entries that shall be collected
-
- // join
- // time = time to perform the join between all collected RWIs
- // count = maximum number of entries that shall be joined
-
- // presort:
- // time = time to do a sort of the joined URL-records
- // count = maximum number of entries that shall be pre-sorted
-
- // urlfetch:
- // time = time to fetch the real URLs from the LURL database
- // count = maximum number of urls that shall be fetched
-
- // postsort:
- // time = time for final sort of URLs
- // count = maximum number oof URLs that shall be retrieved during sort
-
- // snippetfetch:
- // time = time to fetch snippets for selected URLs
- // count = maximum number of snipptes to be fetched
-
- public static final String COLLECTION = "collection";
- public static final String JOIN = "join";
- public static final String PRESORT = "presort";
- public static final String URLFETCH = "urlfetch";
-
- private static final long minimumTargetTime = 100;
-
- private long targetTime;
- private int targetCount;
- private ArrayList yield;
- private long timer;
-
- private plasmaSearchProcessing() {
- targetTime = minimumTargetTime;
- targetCount = 10;
- yield = new ArrayList();
- timer = 0;
- }
-
- public plasmaSearchProcessing(long time, int count) {
- this();
- this.targetTime = time;
- this.targetCount = count;
- }
-
- public static class Entry {
- public String process;
- public int count;
- public long time;
- public Entry(String process, int count, long time) {
- this.process = process;
- this.count = count;
- this.time = time;
- }
- }
-
- public int getTargetCount() {
- return this.targetCount;
- }
-
- public long getTargetTime() {
- return this.targetTime;
- }
-
- public void startTimer() {
- this.timer = System.currentTimeMillis();
- }
-
- public void yield(String s, int count) {
- long t = System.currentTimeMillis() - this.timer;
- Entry e = new Entry(s, count, t);
- yield.add(e);
- }
-
- public Iterator events() {
- // iteratese Entry-type Objects
- return yield.iterator();
- }
-
- public int size() {
- // returns number of events / Entry-Objects in yield array
- return yield.size();
- }
-
- public Map[] localSearchContainers(
- plasmaSearchQuery query,
- plasmaWordIndex wordIndex,
- Set urlselection) {
- // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result
-
- // retrieve entities that belong to the hashes
- startTimer();
- Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers(
- query.queryHashes,
- urlselection,
- true,
- true);
- if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned
- Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0)) ? new HashMap() : wordIndex.getContainers(
- query.excludeHashes,
- urlselection,
- true,
- true);
- yield(plasmaSearchProcessing.COLLECTION, inclusionContainers.size());
-
- return new Map[]{inclusionContainers, exclusionContainers};
- }
-
- public indexContainer localSearchJoinExclude(
- Collection includeContainers,
- Collection excludeContainers,
- int maxDistance) {
- // join a search result and return the joincount (number of pages after join)
-
- // since this is a conjunction we return an empty entity if any word is not known
- if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
-
- // join the result
- startTimer();
- indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance);
- if (rcLocal != null) {
- indexContainer.excludeContainers(rcLocal, excludeContainers);
- }
- if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null, 0);
- yield(plasmaSearchProcessing.JOIN, rcLocal.size());
-
- return rcLocal;
- }
-
-
-
- public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
- // collect references according to domains
- long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
- TreeMap doms = new TreeMap();
- synchronized (inputContainer) {
- Iterator i = inputContainer.entries();
- indexRWIEntry iEntry;
- String dom, paths;
- while (i.hasNext()) {
- iEntry = (indexRWIEntry) i.next();
- if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
- dom = iEntry.urlHash().substring(6);
- if ((paths = (String) doms.get(dom)) == null) {
- doms.put(dom, iEntry.urlHash().substring(0, 6));
- } else {
- doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
- }
- if (System.currentTimeMillis() > timeout)
- break;
- }
- }
- // construct a result string
- serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6);
- bb.append('{');
- Iterator i = doms.entrySet().iterator();
- Map.Entry entry;
- while (i.hasNext()) {
- entry = (Map.Entry) i.next();
- bb.append((String) entry.getKey());
- bb.append(':');
- bb.append((String) entry.getValue());
- if (System.currentTimeMillis() > timeout)
- break;
- if (i.hasNext())
- bb.append(',');
- }
- bb.append('}');
- return bb;
- }
-
- public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
- // target is a mapping from url-hashes to a string of peer-hashes
- if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
- //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
- ci = ci.trim(1, ci.length() - 2);
- String dom, url, peers;
- while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
- assert ci.length() >= 6 : "ci.length() = " + ci.length();
- dom = ci.toString(0, 6);
- ci.trim(7);
- while ((ci.length() > 0) && (ci.byteAt(0) != ',')) {
- assert ci.length() >= 6 : "ci.length() = " + ci.length();
- url = ci.toString(0, 6) + dom;
- ci.trim(6);
- peers = (String) target.get(url);
- if (peers == null) {
- target.put(url, peerhash);
- } else {
- target.put(url, peers + peerhash);
- }
- //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
- }
- if (ci.byteAt(0) == ',') ci.trim(1);
- }
- }
- }
-
-
-}
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
index 80df289a1..400c6b882 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
@@ -44,6 +44,7 @@ import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
+import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacyURL;
public final class plasmaSearchRankingProcess {
@@ -56,14 +57,14 @@ public final class plasmaSearchRankingProcess {
private plasmaSearchRankingProfile ranking;
private int filteredCount;
private indexRWIEntryOrder order;
- private plasmaSearchProcessing process;
+ private serverProfiling process;
private int maxentries;
private int globalcount;
private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private int[] c; // flag counter
- public plasmaSearchRankingProcess(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, int maxentries) {
+ public plasmaSearchRankingProcess(plasmaSearchQuery query, serverProfiling process, plasmaSearchRankingProfile ranking, int maxentries) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
this.pageAcc = new TreeMap();
@@ -91,7 +92,7 @@ public final class plasmaSearchRankingProcess {
this.order = new indexRWIEntryOrder(ranking);
}
this.order.extend(container);
- if (process != null) process.yield("normalizing", container.size());
+ if (process != null) process.yield(plasmaSearchEvent.NORMALIZING, container.size());
/*
container.setOrdering(o, 0);
@@ -115,7 +116,7 @@ public final class plasmaSearchRankingProcess {
if (iEntry.flags().get(j)) {c[j]++;}
}
- // kick out entries that are too bad acording to current findings
+ // kick out entries that are too bad according to current findings
r = new Long(order.cardinal(iEntry));
if ((maxentries >= 0) && (pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
@@ -154,7 +155,7 @@ public final class plasmaSearchRankingProcess {
if (container.size() > query.neededResults()) remove(true, true);
- if (process != null) process.yield(plasmaSearchProcessing.PRESORT, container.size());
+ if (process != null) process.yield(plasmaSearchEvent.PRESORT, container.size());
}
public class rIterator implements Iterator {
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index f772caae7..264bb479d 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -389,6 +389,24 @@ public final class plasmaWordIndex implements indexRI {
return containers;
}
+ public Map[] localSearchContainers(plasmaSearchQuery query, Set urlselection) {
+ // search for the set of hashes and return two maps of wordhash:indexContainer containing the search result: [0] = inclusion containers, [1] = exclusion containers
+
+ // retrieve entities that belong to the hashes
+ Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : getContainers(
+ query.queryHashes,
+ urlselection,
+ true,
+ true);
+ if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned
+ Map exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap() : getContainers(
+ query.excludeHashes,
+ urlselection,
+ true,
+ true);
+ return new Map[]{inclusionContainers, exclusionContainers};
+ }
+
public Finding retrieveURLs(plasmaSearchQuery query, boolean loadurl, int sortorder, plasmaSearchRankingProfile ranking) {
// search for a word hash and generate a list of url links
// sortorder: 0 = hash, 1 = url, 2 = ranking
diff --git a/source/de/anomic/server/serverProfiling.java b/source/de/anomic/server/serverProfiling.java
new file mode 100644
index 000000000..89872caeb
--- /dev/null
+++ b/source/de/anomic/server/serverProfiling.java
@@ -0,0 +1,93 @@
+// serverProfiling.java
+// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 17.11.2007 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.server;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+
+public class serverProfiling implements Cloneable {
+
+ private static final long minimumTargetTime = 100;
+ private long targetTime;
+ private int targetCount;
+ private ArrayList yield;
+ private long timer;
+
+ private serverProfiling() {
+ targetTime = minimumTargetTime;
+ targetCount = 10;
+ yield = new ArrayList();
+ timer = 0;
+ }
+
+ public serverProfiling(long time, int count) {
+ this();
+ this.targetTime = time;
+ this.targetCount = count;
+ }
+
+ public static class Entry {
+ public String process;
+ public int count;
+ public long time;
+
+ public Entry(String process, int count, long time) {
+ this.process = process;
+ this.count = count;
+ this.time = time;
+ }
+ }
+
+ public int getTargetCount() {
+ return this.targetCount;
+ }
+
+ public long getTargetTime() {
+ return this.targetTime;
+ }
+
+ public void startTimer() {
+ this.timer = System.currentTimeMillis();
+ }
+
+ public void yield(String s, int count) {
+ long t = System.currentTimeMillis() - this.timer;
+ Entry e = new Entry(s, count, t);
+ yield.add(e);
+ }
+
+ public Iterator events() {
+ // iterates over Entry-type objects
+ return yield.iterator();
+ }
+
+ public int size() {
+ // returns number of events / Entry-Objects in yield array
+ return yield.size();
+ }
+
+}
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index babbea9b0..6a40273a1 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -60,7 +60,6 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchRankingProcess;
-import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
@@ -549,7 +548,7 @@ public final class yacyClient {
if (singleAbstract == null) singleAbstract = new TreeMap();
ci = new serverByteBuffer(((String) entry.getValue()).getBytes());
//System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
- plasmaSearchProcessing.decompressIndex(singleAbstract, ci, target.hash);
+ indexContainer.decompressIndex(singleAbstract, ci, target.hash);
abstractCache.put(wordhash, singleAbstract);
}
}
diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java
index 60168114f..6bc14649d 100644
--- a/source/de/anomic/yacy/yacySeed.java
+++ b/source/de/anomic/yacy/yacySeed.java
@@ -670,7 +670,7 @@ public class yacySeed {
return type.equals(yacySeed.PEERTYPE_SENIOR) || type.equals(yacySeed.PEERTYPE_PRINCIPAL);
}
- public static final long minDHTNumber = kelondroBase64Order.enhancedCoder.cardinal("AAAAAAAAAAAA".getBytes());
+ public static final long minDHTNumber = kelondroBase64Order.enhancedCoder.cardinal(kelondroBase64Order.zero(12));
public static final long maxDHTDistance = Long.MAX_VALUE;
public double dhtPosition() {