You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
256 lines
9.2 KiB
256 lines
9.2 KiB
// plasmaSearchProcessing.java
|
|
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 17.10.2005 on http://yacy.net
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
|
// $LastChangedRevision: 1986 $
|
|
// $LastChangedBy: orbiter $
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package de.anomic.plasma;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.Collection;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.TreeMap;
|
|
|
|
import de.anomic.index.indexContainer;
|
|
import de.anomic.index.indexRWIEntry;
|
|
import de.anomic.server.serverByteBuffer;
|
|
|
|
/**
|
|
*
|
|
* This class provides search processes and keeps a timing record of the processes
|
|
* It shall be used to initiate a search and also to evaluate
|
|
* the real obtained timings after a search is performed
|
|
*/
|
|
|
|
public class plasmaSearchProcessing implements Cloneable {
|
|
|
|
// collection:
|
|
// time = time to get a RWI out of RAM cache, assortments and WORDS files
|
|
// count = maximum number of RWI-entries that shall be collected
|
|
|
|
// join
|
|
// time = time to perform the join between all collected RWIs
|
|
// count = maximum number of entries that shall be joined
|
|
|
|
// presort:
|
|
// time = time to do a sort of the joined URL-records
|
|
// count = maximum number of entries that shall be pre-sorted
|
|
|
|
// urlfetch:
|
|
// time = time to fetch the real URLs from the LURL database
|
|
// count = maximum number of urls that shall be fetched
|
|
|
|
// postsort:
|
|
// time = time for final sort of URLs
|
|
// count = maximum number of URLs that shall be retrieved during sort
|
|
|
|
// snippetfetch:
|
|
// time = time to fetch snippets for selected URLs
|
|
// count = maximum number of snippets to be fetched
|
|
|
|
public static final String COLLECTION = "collection";
|
|
public static final String JOIN = "join";
|
|
public static final String PRESORT = "presort";
|
|
public static final String URLFETCH = "urlfetch";
|
|
|
|
private static final long minimumTargetTime = 100;
|
|
|
|
private long targetTime;
|
|
private int targetCount;
|
|
private ArrayList yield;
|
|
private long timer;
|
|
|
|
/**
 * Creates a profile with default targets: the minimum target time,
 * a target count of 10, an empty event list and a timer set to zero.
 */
private plasmaSearchProcessing() {
    this.targetTime = minimumTargetTime;
    this.targetCount = 10;
    this.yield = new ArrayList();
    this.timer = 0;
}
|
|
|
|
/**
 * Creates a profile with the given targets and an empty event list.
 *
 * @param time  the target time of the search
 * @param count the target number of results
 */
public plasmaSearchProcessing(long time, int count) {
    // the default constructor initializes the event list and the timer
    this();
    targetTime = time;
    targetCount = count;
}
|
|
|
|
public static class Entry {
|
|
public String process;
|
|
public int count;
|
|
public long time;
|
|
public Entry(String process, int count, long time) {
|
|
this.process = process;
|
|
this.count = count;
|
|
this.time = time;
|
|
}
|
|
}
|
|
|
|
public int getTargetCount() {
|
|
return this.targetCount;
|
|
}
|
|
|
|
public long getTargetTime() {
|
|
return this.targetTime;
|
|
}
|
|
|
|
public void startTimer() {
|
|
this.timer = System.currentTimeMillis();
|
|
}
|
|
|
|
public void yield(String s, int count) {
|
|
long t = System.currentTimeMillis() - this.timer;
|
|
Entry e = new Entry(s, count, t);
|
|
yield.add(e);
|
|
}
|
|
|
|
public Iterator events() {
|
|
// iteratese Entry-type Objects
|
|
return yield.iterator();
|
|
}
|
|
|
|
public int size() {
|
|
// returns number of events / Entry-Objects in yield array
|
|
return yield.size();
|
|
}
|
|
|
|
// collection
|
|
public Map[] localSearchContainers(
|
|
plasmaSearchQuery query,
|
|
plasmaWordIndex wordIndex,
|
|
Set urlselection) {
|
|
// search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result
|
|
|
|
// retrieve entities that belong to the hashes
|
|
startTimer();
|
|
Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers(
|
|
query.queryHashes,
|
|
urlselection,
|
|
true,
|
|
true);
|
|
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned
|
|
Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0)) ? new HashMap() : wordIndex.getContainers(
|
|
query.excludeHashes,
|
|
urlselection,
|
|
true,
|
|
true);
|
|
yield(plasmaSearchProcessing.COLLECTION, inclusionContainers.size());
|
|
|
|
return new Map[]{inclusionContainers, exclusionContainers};
|
|
}
|
|
|
|
// join
|
|
public indexContainer localSearchJoinExclude(
|
|
Collection includeContainers,
|
|
Collection excludeContainers,
|
|
int maxDistance) {
|
|
// join a search result and return the joincount (number of pages after join)
|
|
|
|
// since this is a conjunction we return an empty entity if any word is not known
|
|
if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
|
|
|
|
// join the result
|
|
startTimer();
|
|
indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance);
|
|
if (rcLocal != null) {
|
|
indexContainer.excludeContainers(rcLocal, excludeContainers);
|
|
}
|
|
if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null, 0);
|
|
yield(plasmaSearchProcessing.JOIN, rcLocal.size());
|
|
|
|
return rcLocal;
|
|
}
|
|
|
|
|
|
|
|
public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
|
|
// collect references according to domains
|
|
long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
|
|
TreeMap doms = new TreeMap();
|
|
synchronized (inputContainer) {
|
|
Iterator i = inputContainer.entries();
|
|
indexRWIEntry iEntry;
|
|
String dom, paths;
|
|
while (i.hasNext()) {
|
|
iEntry = (indexRWIEntry) i.next();
|
|
if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
|
|
dom = iEntry.urlHash().substring(6);
|
|
if ((paths = (String) doms.get(dom)) == null) {
|
|
doms.put(dom, iEntry.urlHash().substring(0, 6));
|
|
} else {
|
|
doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
|
|
}
|
|
if (System.currentTimeMillis() > timeout)
|
|
break;
|
|
}
|
|
}
|
|
// construct a result string
|
|
serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6);
|
|
bb.append('{');
|
|
Iterator i = doms.entrySet().iterator();
|
|
Map.Entry entry;
|
|
while (i.hasNext()) {
|
|
entry = (Map.Entry) i.next();
|
|
bb.append((String) entry.getKey());
|
|
bb.append(':');
|
|
bb.append((String) entry.getValue());
|
|
if (System.currentTimeMillis() > timeout)
|
|
break;
|
|
if (i.hasNext())
|
|
bb.append(',');
|
|
}
|
|
bb.append('}');
|
|
return bb;
|
|
}
|
|
|
|
public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
|
|
// target is a mapping from url-hashes to a string of peer-hashes
|
|
if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
|
|
//System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
|
|
ci = ci.trim(1, ci.length() - 2);
|
|
String dom, url, peers;
|
|
while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
|
|
assert ci.length() >= 6 : "ci.length() = " + ci.length();
|
|
dom = ci.toString(0, 6);
|
|
ci.trim(7);
|
|
while ((ci.length() > 0) && (ci.byteAt(0) != ',')) {
|
|
assert ci.length() >= 6 : "ci.length() = " + ci.length();
|
|
url = ci.toString(0, 6) + dom;
|
|
ci.trim(6);
|
|
peers = (String) target.get(url);
|
|
if (peers == null) {
|
|
target.put(url, peerhash);
|
|
} else {
|
|
target.put(url, peers + peerhash);
|
|
}
|
|
//System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
|
|
}
|
|
if (ci.byteAt(0) == ',') ci.trim(1);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
}
|