- some enhancements to IndexControlURLs (shows more links, connects referrer to another query)

- some refactoring to search process

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4222 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent bf9a9e4e5e
commit 6f1308da2f

@ -30,16 +30,16 @@
<p>No entry for word '#[word]#'</p>::
<p>No entry for word hash #[wordhash]#</p>::
<p>Search result:
<form action="IndexControlRWIs_p.html" method="post" enctype="multipart/form-data">
<form name="selection" action="IndexControlRWIs_p.html" method="post" enctype="multipart/form-data">
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader">
<td>&nbsp;</td>
<td style="background-color:#FFFFFF">&nbsp;</td>
<td rowspan="2">total URLs</td>
<td colspan="6">appearance in</td>
<td colspan="4">in link type</td>
<td colspan="1">document type</td>
<tr class="TableCellDark">
<td>&nbsp;</td>
<td style="background-color:#FFFFFF">&nbsp;</td>
<td>reference</td>
<td>description</td>
<td>author</td>
@ -53,7 +53,7 @@
<td>index of</td>
</tr>
<tr class="TableCellDark">
<td>&nbsp;</td>
<td style="background-color:#FFFFFF">&nbsp;</td>
<td>#[allurl]#</td>
<td>#[reference]#</td>
<td>#[description]#</td>
@ -69,18 +69,18 @@
</tr>
<tr class="TableCellLight">
<td class="TableCellDark">Selection</td>
<td><input type="checkbox" name="allurl" checked="checked" /></td>
<td><input type="checkbox" name="reference" /></td>
<td><input type="checkbox" name="description" /></td>
<td><input type="checkbox" name="author" /></td>
<td><input type="checkbox" name="tag" /></td>
<td><input type="checkbox" name="url" /></td>
<td><input type="checkbox" name="emphasized" /></td>
<td><input type="checkbox" name="image" /></td>
<td><input type="checkbox" name="audio" /></td>
<td><input type="checkbox" name="video" /></td>
<td><input type="checkbox" name="app" /></td>
<td><input type="checkbox" name="indexof" /></td>
<td><input type="checkbox" name="allurl" id="allurl" checked="checked" /></td>
<td><input type="checkbox" name="reference" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="description" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="author" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="tag" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="url" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="emphasized" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="image" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="audio" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="video" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="app" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="indexof" onclick="document.selection.allurl.checked=false" /></td>
</tr>
</table>
</p>

@ -35,12 +35,16 @@
#(genUrlProfile)#
::No entry found for URL-hash #[urlhash]#
::<table>
<tr><td class="small">URL String</td><td class="tt">#[urlNormalform]#</td></tr>
<tr><td class="small">URL String</td><td class="tt"><a href="#[urlNormalform]#">#[urlNormalform]#</a></td></tr>
<tr><td class="small">Hash</td><td class="tt">#[urlhash]#</td></tr>
<tr><td class="small">Description</td><td class="tt">#[urlDescr]#</td></tr>
<tr><td class="small">Modified-Date</td><td class="tt">#[moddate]#</td></tr>
<tr><td class="small">Loaded-Date</td><td class="tt">#[loaddate]#</td></tr>
<tr><td class="small">Referrer</td><td class="tt">#[referrer]#</td></tr>
#(referrer)#
<tr><td class="small">Referrer</td><td class="tt">unknown</td></tr>
::
<tr><td class="small">Referrer</td><td class="tt"><a href="IndexControlURLs_p.html?urlhashsearch=&urlhash=#[hash]#">#[url]#</a></td></tr>
#(/referrer)#
<tr><td class="small">Doctype</td><td class="tt">#[doctype]#</td></tr>
<tr><td class="small">Language</td><td class="tt">#[language]#</td></tr>
<tr><td class="small">Size</td><td class="tt">#[size]#</td></tr>

@ -171,13 +171,7 @@ public class IndexControlURLs_p {
return prop;
}
indexURLEntry.Components comp = entry.comp();
String referrer = null;
indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0);
if (le == null) {
referrer = "<unknown>";
} else {
referrer = le.comp().url().toNormalform(false, true);
}
if (comp.url() == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);
@ -189,7 +183,9 @@ public class IndexControlURLs_p {
prop.put("genUrlProfile_urlDescr", comp.title());
prop.put("genUrlProfile_moddate", entry.moddate());
prop.put("genUrlProfile_loaddate", entry.loaddate());
prop.putHTML("genUrlProfile_referrer", referrer);
prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1);
prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "<unknown>" : le.comp().url().toNormalform(false, true));
prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : le.hash());
prop.put("genUrlProfile_doctype", ""+entry.doctype());
prop.put("genUrlProfile_language", entry.language());
prop.put("genUrlProfile_size", entry.size());

@ -24,13 +24,12 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Iterator;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch;
public class PerformanceSearch_p {
@ -47,9 +46,9 @@ public class PerformanceSearch_p {
Iterator events = se.getProcess().events();
int c = 0;
plasmaSearchProcessing.Entry event;
serverProfiling.Entry event;
while (events.hasNext()) {
event = (plasmaSearchProcessing.Entry) events.next();
event = (serverProfiling.Entry) events.next();
prop.put("table_" + c + "_event", event.process);
prop.putNum("table_" + c + "_count", event.count);
prop.putNum("table_" + c + "_time", event.time);

@ -43,10 +43,10 @@ import de.anomic.net.natLib;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
@ -128,7 +128,7 @@ public final class search {
int indexabstractContainercount = 0;
int joincount = 0;
plasmaSearchQuery theQuery = null;
plasmaSearchProcessing localProcess = null;
serverProfiling localProcess = null;
ArrayList accu = null;
long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) {
@ -138,10 +138,12 @@ public final class search {
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
// prepare a search profile
localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults());
localProcess = new serverProfiling(theQuery.maximumTime, theQuery.displayResults());
//theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, sb.wordIndex, null);
Map[] containers = localProcess.localSearchContainers(theQuery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls));
localProcess.startTimer();
Map[] containers = sb.wordIndex.localSearchContainers(theQuery, plasmaSearchQuery.hashes2Set(urls));
localProcess.yield(plasmaSearchEvent.COLLECTION, containers[0].size());
if (containers != null) {
Iterator ci = containers[0].entrySet().iterator();
Map.Entry entry;
@ -151,7 +153,7 @@ public final class search {
wordhash = (String) entry.getKey();
indexContainer container = (indexContainer) entry.getValue();
indexabstractContainercount += container.size();
indexabstract.append("indexabstract." + wordhash + "=").append(plasmaSearchProcessing.compressIndex(container, null, 1000).toString()).append(serverCore.crlfString);
indexabstract.append("indexabstract." + wordhash + "=").append(indexContainer.compressIndex(container, null, 1000).toString()).append(serverCore.crlfString);
}
}
@ -168,7 +170,7 @@ public final class search {
// prepare a search profile
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile);
localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults());
localProcess = new serverProfiling(theQuery.maximumTime, theQuery.displayResults());
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, sb.wordIndex, null, true, abstractSet);
urlRetrievalAllTime = theSearch.getURLRetrievalTime();
snippetComputationAllTime = theSearch.getSnippetComputationTime();

@ -59,12 +59,12 @@ import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.yFormatter;
@ -268,7 +268,7 @@ public class yacysearch {
20,
constraint,
false);
plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * theQuery.maximumTime / 10, theQuery.displayResults());
serverProfiling localTiming = new serverProfiling(4 * theQuery.maximumTime / 10, theQuery.displayResults());
String client = (String) header.get("CLIENTIP"); // the search client who initiated the search

@ -30,12 +30,15 @@ import java.lang.reflect.Method;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverByteBuffer;
public class indexContainer extends kelondroRowSet {
@ -206,6 +209,23 @@ public class indexContainer extends kelondroRowSet {
}
}
/**
 * Joins all include-containers into one container and then removes the
 * references listed in the exclude-containers from that result.
 *
 * The join is a conjunction: when there is nothing to join, or the join
 * yields no container, an empty container is returned instead of null.
 *
 * @param includeContainers containers that are joined; may be null
 * @param excludeContainers containers whose references are removed from the join result
 * @param maxDistance passed through unchanged to joinContainers
 * @return the joined and filtered container, or an empty container
 */
public static indexContainer joinExcludeContainers(
        Collection includeContainers,
        Collection excludeContainers,
        int maxDistance) {
    // without include containers there is nothing that could be joined
    indexContainer joined = null;
    if (includeContainers != null) {
        joined = indexContainer.joinContainers(includeContainers, maxDistance);
    }

    // a missing join result is reported as an empty container
    if (joined == null) {
        return plasmaWordIndex.emptyContainer(null, 0);
    }

    // strip everything that is listed in the exclusion set
    excludeContainers(joined, excludeContainers);
    return joined;
}
public static indexContainer joinContainers(Collection containers, int maxDistance) {
// order entities by their size
@ -433,4 +453,71 @@ public class indexContainer extends kelondroRowSet {
return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
}
/**
 * Builds a compact textual abstract of the url hashes in a container.
 *
 * Every url hash is split into its first six characters and the remainder
 * (from index 6 on; the original code names this remainder "dom"). Hashes
 * that share the same remainder are grouped, and the result has the form
 * <code>{rest:head1head2...,rest:head1...}</code>.
 *
 * @param inputContainer   the container whose entries are abstracted; it is locked while it is read
 * @param excludeContainer entries whose url hash is found here are left out; may be null
 * @param maxtime          time budget in milliseconds; a negative value means no limit
 * @return the buffer holding the abstract; possibly incomplete if the time budget ran out
 */
public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
    final long deadline = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;

    // phase 1: group the six-character heads by the remainder of the url hash
    final TreeMap headsByRest = new TreeMap();
    synchronized (inputContainer) {
        for (Iterator refs = inputContainer.entries(); refs.hasNext();) {
            final indexRWIEntry ref = (indexRWIEntry) refs.next();
            final boolean excluded = (excludeContainer != null) && (excludeContainer.get(ref.urlHash()) != null);
            if (!excluded) {
                final String rest = ref.urlHash().substring(6);
                final String known = (String) headsByRest.get(rest);
                final String head = ref.urlHash().substring(0, 6);
                if (known == null) {
                    headsByRest.put(rest, head);
                } else {
                    headsByRest.put(rest, known + head);
                }
                // the time budget is only checked after an entry was actually added
                if (System.currentTimeMillis() > deadline) {
                    break;
                }
            }
        }
    }

    // phase 2: serialize the groups
    final serverByteBuffer abstractBuffer = new serverByteBuffer(inputContainer.size() * 6);
    abstractBuffer.append('{');
    for (Iterator groups = headsByRest.entrySet().iterator(); groups.hasNext();) {
        final Map.Entry group = (Map.Entry) groups.next();
        abstractBuffer.append((String) group.getKey());
        abstractBuffer.append(':');
        abstractBuffer.append((String) group.getValue());
        // on timeout the current group is still complete, only the rest is dropped
        if (System.currentTimeMillis() > deadline) {
            break;
        }
        if (groups.hasNext()) {
            abstractBuffer.append(',');
        }
    }
    abstractBuffer.append('}');
    return abstractBuffer;
}
/**
 * Parses an index abstract of the form produced by compressIndex
 * ("{rest:head1head2...,rest:head1...}") and records, for every url hash
 * found in it, that the given peer knows this url.
 *
 * @param target   mapping from url hash to a concatenated string of peer hashes; it is updated in place
 * @param ci       the abstract to parse; input not enclosed in '{' and '}' is ignored
 * @param peerhash the peer hash that is appended for every url hash found
 */
public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
// target is a mapping from url-hashes to a string of peer-hashes
if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
//System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
// presumably strips the enclosing braces -- TODO confirm the argument semantics of serverByteBuffer.trim(int, int)
ci = ci.trim(1, ci.length() - 2);
String dom, url, peers;
// a group needs at least a 6-character key, a ':' at offset 6 and one 6-character entry (13 bytes)
while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
// always true here because the loop condition already requires a length of at least 13
assert ci.length() >= 6 : "ci.length() = " + ci.length();
dom = ci.toString(0, 6);
// NOTE(review): the return value is ignored here while the two-argument trim above is re-assigned;
// presumably trim(int) modifies the buffer in place -- confirm against serverByteBuffer
ci.trim(7);
// consume 6-character entries until the group separator ',' or the end of the buffer
while ((ci.length() > 0) && (ci.byteAt(0) != ',')) {
assert ci.length() >= 6 : "ci.length() = " + ci.length();
// re-assemble the url hash: 6-character entry followed by the group key
url = ci.toString(0, 6) + dom;
ci.trim(6);
peers = (String) target.get(url);
if (peers == null) {
target.put(url, peerhash);
} else {
// url already known from other peers: append this peer to the list
target.put(url, peers + peerhash);
}
//System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
}
// NOTE(review): ci may be empty at this point; confirm that byteAt(0) is safe on an empty buffer
if (ci.byteAt(0) == ',') ci.trim(1);
}
}
}
}

@ -41,6 +41,7 @@ import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.serverProfiling;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyDHTAction;
@ -50,7 +51,13 @@ import de.anomic.yacy.yacyURL;
public final class plasmaSearchEvent {
public static int workerThreadCount = 10;
public static final String COLLECTION = "collection";
public static final String JOIN = "join";
public static final String PRESORT = "presort";
public static final String URLFETCH = "urlfetch";
public static final String NORMALIZING = "normalizing";
public static int workerThreadCount = 3;
public static String lastEventID = "";
private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests
public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes
@ -62,7 +69,7 @@ public final class plasmaSearchEvent {
private plasmaWordIndex wordIndex;
private plasmaSearchRankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
private plasmaSearchProcessing process;
private serverProfiling process;
private yacySearch[] primarySearchThreads, secondarySearchThreads;
private Thread localSearchThread;
private TreeMap preselectedPeerHashes;
@ -80,7 +87,7 @@ public final class plasmaSearchEvent {
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaSearchProcessing localTiming,
serverProfiling localTiming,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes,
boolean generateAbstracts,
@ -117,13 +124,13 @@ public final class plasmaSearchEvent {
long start = System.currentTimeMillis();
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
// do a global search
this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation);
int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds
if (fetchpeers > 50) fetchpeers = 50;
if (fetchpeers < 30) fetchpeers = 30;
// do a global search
// the result of the fetch is then in the rcGlobal
process.startTimer();
serverLog.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs");
@ -152,7 +159,10 @@ public final class plasmaSearchEvent {
// finished searching
serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
} else {
Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null);
// do a local search
process.startTimer();
Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null);
process.yield(COLLECTION, searchContainerMaps[0].size());
if (generateAbstracts) {
// compute index abstracts
@ -178,18 +188,21 @@ public final class plasmaSearchEvent {
IAneardhthash = wordhash;
}
IACount.put(wordhash, new Integer(container.size()));
IAResults.put(wordhash, plasmaSearchProcessing.compressIndex(container, null, 1000).toString());
IAResults.put(wordhash, indexContainer.compressIndex(container, null, 1000).toString());
}
process.yield("abstract generation", searchContainerMaps[0].size());
}
process.startTimer();
indexContainer rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
process.localSearchJoinExclude(
indexContainer.joinExcludeContainers(
searchContainerMaps[0].values(),
searchContainerMaps[1].values(),
query.maxDistance);
process.yield(JOIN, rcLocal.size());
this.localcount = rcLocal.size();
this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation);
this.rankedCache.insert(rcLocal, true);
@ -247,7 +260,9 @@ public final class plasmaSearchEvent {
public void run() {
// do a local search
Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null);
process.startTimer();
Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null);
process.yield(COLLECTION, searchContainerMaps[0].size());
// use the search containers to fill up rcAbstracts locally
/*
@ -275,13 +290,15 @@ public final class plasmaSearchEvent {
*/
// join and exclude the local result
process.startTimer();
indexContainer rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
process.localSearchJoinExclude(
indexContainer.joinExcludeContainers(
searchContainerMaps[0].values(),
searchContainerMaps[1].values(),
query.maxDistance);
process.yield(JOIN, rcLocal.size());
localcount = rcLocal.size();
// sort the local containers and truncate it to a limited count,
@ -454,7 +471,7 @@ public final class plasmaSearchEvent {
return ranking;
}
public plasmaSearchProcessing getProcess() {
public serverProfiling getProcess() {
return process;
}
@ -490,7 +507,7 @@ public final class plasmaSearchEvent {
public static plasmaSearchEvent getEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaSearchProcessing localTiming,
serverProfiling localTiming,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes,
boolean generateAbstracts,

@ -1,253 +0,0 @@
// plasmaSearchProcessing.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 17.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.server.serverByteBuffer;
/**
* This class provides the local search steps (container collection, join/exclude)
* and keeps a timing record of these steps.
* It is used to initiate a search and also to evaluate
* the timings that were actually obtained after a search was performed.
* It additionally hosts the static helpers that compress and decompress index abstracts.
*/
public class plasmaSearchProcessing implements Cloneable {
// meaning of the recorded time/count values per process step:
// collection:
// time = time to get a RWI out of RAM cache, assortments and WORDS files
// count = maximum number of RWI-entries that shall be collected
// join
// time = time to perform the join between all collected RWIs
// count = maximum number of entries that shall be joined
// presort:
// time = time to do a sort of the joined URL-records
// count = maximum number of entries that shall be pre-sorted
// urlfetch:
// time = time to fetch the real URLs from the LURL database
// count = maximum number of urls that shall be fetched
// postsort:
// time = time for final sort of URLs
// count = maximum number of URLs that shall be retrieved during sort
// snippetfetch:
// time = time to fetch snippets for selected URLs
// count = maximum number of snippets to be fetched
// names of the process steps as they are recorded in the event list
public static final String COLLECTION = "collection";
public static final String JOIN = "join";
public static final String PRESORT = "presort";
public static final String URLFETCH = "urlfetch";
// initial value of targetTime; the public constructor overwrites it with the given time
private static final long minimumTargetTime = 100;
private long targetTime;
private int targetCount;
// list of recorded Entry objects, in the order in which yield() was called
private ArrayList yield;
// start time in milliseconds as set by the latest startTimer() call
private long timer;
// sets defaults and creates the empty event list; only used by the public constructor
private plasmaSearchProcessing() {
targetTime = minimumTargetTime;
targetCount = 10;
yield = new ArrayList();
timer = 0;
}
/**
* @param time  the target time that is returned by getTargetTime()
* @param count the target count that is returned by getTargetCount()
*/
public plasmaSearchProcessing(long time, int count) {
this();
this.targetTime = time;
this.targetCount = count;
}
/**
* One recorded event: the name of a process step, a count value
* and the time in milliseconds that was measured for it.
*/
public static class Entry {
public String process;
public int count;
public long time;
public Entry(String process, int count, long time) {
this.process = process;
this.count = count;
this.time = time;
}
}
public int getTargetCount() {
return this.targetCount;
}
public long getTargetTime() {
return this.targetTime;
}
// remembers the current time as start of the next measured step
public void startTimer() {
this.timer = System.currentTimeMillis();
}
// records an event with the time that elapsed since the latest startTimer() call;
// if startTimer() was never called, timer is 0 and the recorded time is the current clock value
public void yield(String s, int count) {
long t = System.currentTimeMillis() - this.timer;
Entry e = new Entry(s, count, t);
yield.add(e);
}
public Iterator events() {
// iterates Entry-type Objects
return yield.iterator();
}
public int size() {
// returns number of events / Entry-Objects in yield array
return yield.size();
}
/**
* Fetches the containers for the query hashes and the exclude hashes of a query
* and records a COLLECTION event with the number of inclusion containers.
*
* @return a two-element array: [0] the inclusion containers, [1] the exclusion containers
*/
public Map[] localSearchContainers(
plasmaSearchQuery query,
plasmaWordIndex wordIndex,
Set urlselection) {
// search for the set of hashes and return a map of wordhash:indexContainer containing the search result
// retrieve entities that belong to the hashes
startTimer();
Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers(
query.queryHashes,
urlselection,
true,
true);
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned
// NOTE(review): the null check below comes too late, inclusionContainers was already dereferenced in the line above
// exclusion containers are only fetched if there is something they could be subtracted from
Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0)) ? new HashMap() : wordIndex.getContainers(
query.excludeHashes,
urlselection,
true,
true);
yield(plasmaSearchProcessing.COLLECTION, inclusionContainers.size());
return new Map[]{inclusionContainers, exclusionContainers};
}
/**
* Joins the include-containers, removes the references of the exclude-containers
* from the join result and records a JOIN event with the size of the result.
*
* @return the joined container; an empty container if there was nothing to join
*/
public indexContainer localSearchJoinExclude(
Collection includeContainers,
Collection excludeContainers,
int maxDistance) {
// join a search result and return the joined container
// since this is a conjunction we return an empty entity if any word is not known
// note: this early return does not record a JOIN event
if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
// join the result
startTimer();
indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance);
if (rcLocal != null) {
indexContainer.excludeContainers(rcLocal, excludeContainers);
}
if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null, 0);
yield(plasmaSearchProcessing.JOIN, rcLocal.size());
return rcLocal;
}
/**
* Builds an abstract of the url hashes in a container of the form
* "{rest:head1head2...,rest:head1...}", where head is the first six characters
* of a url hash and rest is the remainder from index 6 on.
*
* @param inputContainer   the container to abstract; it is locked while it is read
* @param excludeContainer entries whose url hash is found here are skipped; may be null
* @param maxtime          time budget in milliseconds; a negative value means no limit
*/
public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
// collect references according to domains
long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
TreeMap doms = new TreeMap();
synchronized (inputContainer) {
Iterator i = inputContainer.entries();
indexRWIEntry iEntry;
String dom, paths;
while (i.hasNext()) {
iEntry = (indexRWIEntry) i.next();
if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
// group key is the url hash from index 6 on; the value collects the 6-character heads
dom = iEntry.urlHash().substring(6);
if ((paths = (String) doms.get(dom)) == null) {
doms.put(dom, iEntry.urlHash().substring(0, 6));
} else {
doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
}
// stop collecting when the time budget is used up; the abstract is then incomplete
if (System.currentTimeMillis() > timeout)
break;
}
}
// construct a result string
serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6);
bb.append('{');
Iterator i = doms.entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
bb.append((String) entry.getKey());
bb.append(':');
bb.append((String) entry.getValue());
// on timeout the group written last stays complete and the closing brace is still appended
if (System.currentTimeMillis() > timeout)
break;
if (i.hasNext())
bb.append(',');
}
bb.append('}');
return bb;
}
/**
* Parses an abstract as produced by compressIndex and records, for every
* url hash found in it, that the given peer knows this url.
*
* @param target   mapping from url hash to a concatenated string of peer hashes; updated in place
* @param ci       the abstract to parse; input not enclosed in '{' and '}' is ignored
* @param peerhash the peer hash that is appended for every url hash found
*/
public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
// target is a mapping from url-hashes to a string of peer-hashes
if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
//System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
// presumably strips the enclosing braces -- TODO confirm the argument semantics of serverByteBuffer.trim(int, int)
ci = ci.trim(1, ci.length() - 2);
String dom, url, peers;
// a group needs a 6-character key, a ':' at offset 6 and at least one 6-character entry (13 bytes)
while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
assert ci.length() >= 6 : "ci.length() = " + ci.length();
dom = ci.toString(0, 6);
// NOTE(review): return value ignored; presumably trim(int) modifies the buffer in place -- confirm
ci.trim(7);
while ((ci.length() > 0) && (ci.byteAt(0) != ',')) {
assert ci.length() >= 6 : "ci.length() = " + ci.length();
// re-assemble the url hash: 6-character entry followed by the group key
url = ci.toString(0, 6) + dom;
ci.trim(6);
peers = (String) target.get(url);
if (peers == null) {
target.put(url, peerhash);
} else {
target.put(url, peers + peerhash);
}
//System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
}
// NOTE(review): ci may be empty here; confirm that byteAt(0) is safe on an empty buffer
if (ci.byteAt(0) == ',') ci.trim(1);
}
}
}
}

@ -44,6 +44,7 @@ import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacyURL;
public final class plasmaSearchRankingProcess {
@ -56,14 +57,14 @@ public final class plasmaSearchRankingProcess {
private plasmaSearchRankingProfile ranking;
private int filteredCount;
private indexRWIEntryOrder order;
private plasmaSearchProcessing process;
private serverProfiling process;
private int maxentries;
private int globalcount;
private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private int[] c; // flag counter
public plasmaSearchRankingProcess(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, int maxentries) {
public plasmaSearchRankingProcess(plasmaSearchQuery query, serverProfiling process, plasmaSearchRankingProfile ranking, int maxentries) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
this.pageAcc = new TreeMap();
@ -91,7 +92,7 @@ public final class plasmaSearchRankingProcess {
this.order = new indexRWIEntryOrder(ranking);
}
this.order.extend(container);
if (process != null) process.yield("normalizing", container.size());
if (process != null) process.yield(plasmaSearchEvent.NORMALIZING, container.size());
/*
container.setOrdering(o, 0);
@ -115,7 +116,7 @@ public final class plasmaSearchRankingProcess {
if (iEntry.flags().get(j)) {c[j]++;}
}
// kick out entries that are too bad acording to current findings
// kick out entries that are too bad according to current findings
r = new Long(order.cardinal(iEntry));
if ((maxentries >= 0) && (pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
@ -154,7 +155,7 @@ public final class plasmaSearchRankingProcess {
if (container.size() > query.neededResults()) remove(true, true);
if (process != null) process.yield(plasmaSearchProcessing.PRESORT, container.size());
if (process != null) process.yield(plasmaSearchEvent.PRESORT, container.size());
}
public class rIterator implements Iterator {

@ -389,6 +389,24 @@ public final class plasmaWordIndex implements indexRI {
return containers;
}
/**
 * Collects the index containers that belong to a query.
 *
 * The inclusion containers are fetched for the query hashes. If only a
 * subset of the query hashes produced a container, the inclusion result
 * is discarded completely. Exclusion containers are only fetched when
 * the inclusion result is not empty.
 *
 * @param query        supplies the query hashes and the exclude hashes
 * @param urlselection passed through unchanged to getContainers
 * @return a two-element array: [0] wordhash:indexContainer map of the inclusion
 *         containers, [1] the same for the exclusion containers
 */
public Map[] localSearchContainers(plasmaSearchQuery query, Set urlselection) {
    // step 1: containers for the words that must appear
    Map included;
    if (query.queryHashes.size() == 0) {
        included = new HashMap();
    } else {
        included = getContainers(query.queryHashes, urlselection, true, true);
    }

    // a partial hit cannot satisfy the conjunction, treat it as no hit at all
    boolean partialHit = (included.size() != 0) && (included.size() < query.queryHashes.size());
    if (partialHit) {
        included = new HashMap();
    }

    // step 2: containers for the words that must not appear
    Map excluded;
    if (included.size() == 0) {
        excluded = new HashMap();
    } else {
        excluded = getContainers(query.excludeHashes, urlselection, true, true);
    }

    return new Map[]{included, excluded};
}
public Finding retrieveURLs(plasmaSearchQuery query, boolean loadurl, int sortorder, plasmaSearchRankingProfile ranking) {
// search for a word hash and generate a list of url links
// sortorder: 0 = hash, 1 = url, 2 = ranking

@ -0,0 +1,93 @@
// serverProfiling.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 17.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.server;
import java.util.ArrayList;
import java.util.Iterator;
/**
 * Keeps a timing record of the steps of a process.
 *
 * A caller marks the start of a step with {@link #startTimer()} and closes
 * it with {@link #yield(String, int)}, which stores the step name, a count
 * value and the elapsed time as an {@link Entry}.
 *
 * One instance may be handed to several worker threads, so all access to
 * the event list is synchronized and {@link #events()} iterates over a
 * snapshot. Note that there is only one timer per instance: steps that are
 * timed concurrently on the same instance overwrite each other's start time.
 */
public class serverProfiling implements Cloneable {

    // initial value of targetTime; the public constructor replaces it with the given time
    private static final long minimumTargetTime = 100;

    private long targetTime;
    private int targetCount;
    private ArrayList eventLog; // the recorded Entry objects in recording order
    private long timer;         // start of the current step in milliseconds; 0 until startTimer() is called

    private serverProfiling() {
        targetTime = minimumTargetTime;
        targetCount = 10;
        eventLog = new ArrayList();
        timer = 0;
    }

    /**
     * @param time  the target time, returned by {@link #getTargetTime()}
     * @param count the target count, returned by {@link #getTargetCount()}
     */
    public serverProfiling(long time, int count) {
        this();
        this.targetTime = time;
        this.targetCount = count;
    }

    /**
     * One recorded step: its name, a count value and the measured time in milliseconds.
     */
    public static class Entry {
        public String process;
        public int count;
        public long time;

        public Entry(String process, int count, long time) {
            this.process = process;
            this.count = count;
            this.time = time;
        }
    }

    public int getTargetCount() {
        return this.targetCount;
    }

    public long getTargetTime() {
        return this.targetTime;
    }

    /**
     * Marks the current time as the start of the next step.
     */
    public synchronized void startTimer() {
        this.timer = System.currentTimeMillis();
    }

    /**
     * Records a step with the time that elapsed since the latest
     * {@link #startTimer()} call.
     *
     * @param s     name of the step
     * @param count count value that is stored with the step
     */
    public synchronized void yield(String s, int count) {
        long t = System.currentTimeMillis() - this.timer;
        eventLog.add(new Entry(s, count, t));
    }

    /**
     * @return an iterator over the Entry objects recorded so far; it walks a
     *         snapshot, so steps recorded while iterating neither show up
     *         nor cause a ConcurrentModificationException
     */
    public synchronized Iterator events() {
        return new ArrayList(eventLog).iterator();
    }

    /**
     * @return the number of recorded Entry objects
     */
    public synchronized int size() {
        return eventLog.size();
    }
}

@ -60,7 +60,6 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
@ -549,7 +548,7 @@ public final class yacyClient {
if (singleAbstract == null) singleAbstract = new TreeMap();
ci = new serverByteBuffer(((String) entry.getValue()).getBytes());
//System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
plasmaSearchProcessing.decompressIndex(singleAbstract, ci, target.hash);
indexContainer.decompressIndex(singleAbstract, ci, target.hash);
abstractCache.put(wordhash, singleAbstract);
}
}

@ -670,7 +670,7 @@ public class yacySeed {
return type.equals(yacySeed.PEERTYPE_SENIOR) || type.equals(yacySeed.PEERTYPE_PRINCIPAL);
}
public static final long minDHTNumber = kelondroBase64Order.enhancedCoder.cardinal("AAAAAAAAAAAA".getBytes());
public static final long minDHTNumber = kelondroBase64Order.enhancedCoder.cardinal(kelondroBase64Order.zero(12));
public static final long maxDHTDistance = Long.MAX_VALUE;
public double dhtPosition() {

Loading…
Cancel
Save