more refactoring of search processes; also some small speed enhancements

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4058 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent b3c830271c
commit ae86d010bb

@ -39,15 +39,11 @@ import de.anomic.kelondro.kelondroBitfield;
import de.anomic.index.indexContainer;
import de.anomic.net.natLib;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchPreOrder;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchPostOrder;
import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSearchResultAccumulator;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverCore;
@ -88,7 +84,7 @@ public final class search {
final int partitions = post.getInt("partitions", 30);
String profile = post.get("profile", ""); // remote profile hand-over
if (profile.length() > 0) profile = crypt.simpleDecode(profile, null);
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
//final boolean includesnippet = post.get("includesnippet", "false").equals("true");
final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -130,9 +126,9 @@ public final class search {
StringBuffer indexabstract = new StringBuffer();
int indexabstractContainercount = 0;
int joincount = 0;
plasmaSearchPostOrder acc = null;
plasmaSearchQuery squery = null;
//plasmaSearchEvent theSearch = null;
plasmaSearchResultAccumulator accu = null;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
squery = new plasmaSearchQuery(abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, plasmaSearchQuery.catchall_constraint);
@ -223,15 +219,11 @@ public final class search {
if (localResults == null) {
joincount = 0;
prop.put("joincount", 0);
acc = null;
} else {
joincount = localResults.size();
prop.putASIS("joincount", Integer.toString(joincount));
plasmaSearchPreOrder pre = localProcess.preSort(squery, rankingProfile, localResults);
acc = localProcess.urlFetch(squery, rankingProfile, sb.wordIndex, pre);
acc.localContributions = (localResults == null) ? 0 : localResults.size();
localProcess.postSort(true, acc);
localProcess.applyFilter(acc);
plasmaSearchPreOrder pre = new plasmaSearchPreOrder(squery, localProcess, rankingProfile, localResults);
accu = new plasmaSearchResultAccumulator(squery, localProcess, rankingProfile, pre, sb.wordIndex, plasmaSwitchboard.blueList, false);
}
// generate compressed index for maxcounthash
@ -262,7 +254,7 @@ public final class search {
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
String client = (String) header.get("CLIENTIP");
HashMap searchProfile = plasmaSearchEvent.resultProfile(squery, joincount, System.currentTimeMillis() - timestamp);
HashMap searchProfile = squery.resultProfile((accu == null) ? 0 : accu.resultCount(), System.currentTimeMillis() - timestamp);
searchProfile.put("host", client);
yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false);
searchProfile.put("peername", (remotepeer == null) ? "unknown" : remotepeer.getName());
@ -275,7 +267,7 @@ public final class search {
sb.remoteSearchTracker.put(client, handles);
// prepare result
if ((joincount == 0) || (acc == null)) {
if ((joincount == 0) || (accu == null)) {
// no results
prop.putASIS("links", "");
@ -284,33 +276,21 @@ public final class search {
} else {
// result is a List of urlEntry elements
int i = 0;
StringBuffer links = new StringBuffer();
String resource = null;
indexURLEntry urlentry;
plasmaSnippetCache.TextSnippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = (indexURLEntry) acc.nextElement();
if (includesnippet) {
snippet = plasmaSnippetCache.retrieveTextSnippet(urlentry.comp().url(), squery.queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000);
} else {
snippet = null;
}
if ((snippet != null) && (snippet.exists())) {
resource = urlentry.toString(snippet.getLineRaw());
} else {
resource = urlentry.toString();
}
plasmaSearchResultAccumulator.Entry entry;
for (int i = 0; i < accu.resultCount(); i++) {
entry = accu.resultEntry(i);
resource = entry.resource();
if (resource != null) {
links.append("resource").append(i).append('=').append(resource).append(serverCore.crlfString);
i++;
}
}
prop.putASIS("links", new String(links));
prop.putASIS("linkcount", Integer.toString(i));
prop.put("linkcount", accu.resultCount());
// prepare reference hints
Object[] ws = acc.getReferences(16);
Object[] ws = accu.references();
StringBuffer refstr = new StringBuffer();
for (int j = 0; j < ws.length; j++)
refstr.append(",").append((String) ws[j]);

@ -53,7 +53,6 @@ import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.regex.PatternSyntaxException;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpHeader;
@ -66,7 +65,6 @@ import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchImages;
import de.anomic.plasma.plasmaSearchPostOrder;
import de.anomic.plasma.plasmaSearchPreOrder;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -272,7 +270,7 @@ public class yacysearch {
// do the search
TreeSet queryHashes = plasmaCondenser.words2hashes(query[0]);
plasmaSearchQuery thisSearch = new plasmaSearchQuery(
plasmaSearchQuery theQuery = new plasmaSearchQuery(
querystring,
queryHashes,
plasmaCondenser.words2hashes(query[1]),
@ -288,57 +286,51 @@ public class yacysearch {
20,
constraint);
plasmaSearchRankingProfile ranking = (sb.getConfig("rankingProfile", "").length() == 0) ? new plasmaSearchRankingProfile(contentdomString) : new plasmaSearchRankingProfile("", crypt.simpleDecode(sb.getConfig("rankingProfile", ""), null));
plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
plasmaSearchProcessing remoteTiming = new plasmaSearchProcessing(6 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * theQuery.maximumTime / 10, theQuery.wantedResults);
plasmaSearchProcessing remoteTiming = new plasmaSearchProcessing(6 * theQuery.maximumTime / 10, theQuery.wantedResults);
//**
//searchFromLocal(thisSearch, ranking, localTiming, remoteTiming, true, (String) header.get("CLIENTIP"))
String client = (String) header.get("CLIENTIP"); // the search client who initiated the search
// tell all threads to do nothing for a specific time
sb.intermissionAllThreads(2 * thisSearch.maximumTime);
sb.intermissionAllThreads(2 * theQuery.maximumTime);
// filter out words that appear in bluelist
thisSearch.filterOut(plasmaSwitchboard.blueList);
theQuery.filterOut(plasmaSwitchboard.blueList);
// log
serverLog.logInfo("LOCAL_SEARCH", "INIT WORD SEARCH: " + thisSearch.queryString + ":" + thisSearch.queryHashes + " - " + thisSearch.wantedResults + " links, " + (thisSearch.maximumTime / 1000) + " seconds");
serverLog.logInfo("LOCAL_SEARCH", "INIT WORD SEARCH: " + theQuery.queryString + ":" + theQuery.queryHashes + " - " + theQuery.wantedResults + " links, " + (theQuery.maximumTime / 1000) + " seconds");
long timestamp = System.currentTimeMillis();
// create a new search event
String wrongregex = null;
plasmaSearchEvent theSearch = new plasmaSearchEvent(thisSearch, ranking, localTiming, remoteTiming, true, sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null);
plasmaSearchPostOrder acc;
try{
acc = theSearch.search();
} catch(PatternSyntaxException e){
wrongregex = e.getPattern();
acc = new plasmaSearchPostOrder(thisSearch, ranking);
}
plasmaSearchEvent theSearch = new plasmaSearchEvent(theQuery, ranking, localTiming, remoteTiming, sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null);
plasmaSearchPreOrder preorder = theSearch.search();
// fetch snippets
serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
plasmaSearchResultAccumulator accu = new plasmaSearchResultAccumulator(theSearch, sb.wordIndex, plasmaSwitchboard.blueList);
plasmaSearchResultAccumulator accu = new plasmaSearchResultAccumulator(theQuery, localTiming, ranking, preorder, sb.wordIndex, plasmaSwitchboard.blueList, true);
serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
// calc some more cross-reference
long remainingTime = thisSearch.maximumTime - (System.currentTimeMillis() - timestamp);
long remainingTime = theQuery.maximumTime - (System.currentTimeMillis() - timestamp);
if (remainingTime < 0) remainingTime = 1000;
//Object[] ws = acc.getReferences(16);
serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER XREF PREPARATION: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
// log
serverLog.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " + thisSearch.queryString + " - " +
acc.getTotalCount() + " links found, " +
acc.filteredResults + " links filtered, " +
acc.sizeOrdered() + " links ordered, " +
serverLog.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " + theQuery.queryString + " - " +
(theSearch.getLocalCount() + theSearch.getGlobalCount()) + " links found, " +
preorder.filteredCount() + " links filtered, " +
accu.resultCount() + " links ordered, " +
((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
HashMap searchProfile = theSearch.resultProfile();
searchProfile.put("querystring", thisSearch.queryString);
HashMap searchProfile = theQuery.resultProfile(accu.resultCount(), System.currentTimeMillis() - timestamp);
searchProfile.put("querystring", theQuery.queryString);
searchProfile.put("time", trackerHandle);
searchProfile.put("host", client);
searchProfile.put("offset", new Integer(0));
@ -354,18 +346,18 @@ public class yacysearch {
prop=new serverObjects();
//prop.put("references", 0);
URL wordURL=null;
prop.put("num-results_totalcount", acc.getTotalCount());
prop.put("num-results_filteredcount", acc.filteredResults);
prop.put("num-results_orderedcount", acc.sizeOrdered());
prop.put("num-results_globalresults", acc.globalContributions);
prop.put("num-results_totalcount", theSearch.getLocalCount() + theSearch.getGlobalCount());
prop.put("num-results_filteredcount", preorder.filteredCount());
prop.put("num-results_orderedcount", accu.resultCount());
prop.put("num-results_globalresults", theSearch.getGlobalCount());
prop.put("num-results_linkcount", 0);
prop.put("type_results", 0);
for (int i = 0; i < accu.resultCount(); i++) {
plasmaSearchResultAccumulator.Entry result = accu.resultEntry(i);
prop.put("type_results_" + i + "_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? 1 : 0);
prop.put("type_results_" + i + "_authorized_recommend_deletelink", "/yacysearch.html?search=" + thisSearch.queryString + "&Enter=Search&count=" + thisSearch.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_authorized_recommend_recommendlink", "/yacysearch.html?search=" + thisSearch.queryString + "&Enter=Search&count=" + thisSearch.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_authorized_urlhash", result.hash());
prop.put("type_results_" + i + "_description", result.title());
prop.put("type_results_" + i + "_url", result.urlstring());
@ -393,7 +385,7 @@ public class yacysearch {
// adding snippet if available
if (result.hasSnippet()) {
prop.put("type_results_" + i + "_snippet", 1);
prop.putASIS("type_results_" + i + "_snippet_text", result.textSnippet().getLineMarked(theSearch.getQuery().queryHashes));//FIXME: the ASIS should not be needed, if there is no html in .java
prop.putASIS("type_results_" + i + "_snippet_text", result.textSnippet().getLineMarked(theQuery.queryHashes));//FIXME: the ASIS should not be needed, if there is no html in .java
} else {
if (post.containsKey("fetchSnippet")) {
// snippet fetch timeout
@ -535,7 +527,7 @@ public class yacysearch {
}
}
prop.put("type", (thisSearch.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) ? 0 : ((thisSearch.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 2 : 1));
prop.put("type", (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) ? 0 : ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 2 : 1));
if (prop.getInt("type", 0) == 1) prop.put("type_mediatype", contentdomString);
prop.put("input_cat", "href");
prop.put("input_depth", "0");

@ -26,7 +26,6 @@
package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
@ -47,17 +46,14 @@ public final class plasmaSearchEvent {
private indexContainer rcContainers; // cache for results
private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
private plasmaSearchProcessing profileLocal, profileGlobal;
private boolean postsort;
private yacySearch[] primarySearchThreads, secondarySearchThreads;
private long searchtime;
private int searchcount;
private TreeMap preselectedPeerHashes;
private int localcount, globalcount;
public plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaSearchProcessing localTiming,
plasmaSearchProcessing remoteTiming,
boolean postsort,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes) {
this.wordIndex = wordIndex;
@ -67,18 +63,21 @@ public final class plasmaSearchEvent {
this.rcAbstracts = (query.queryHashes.size() > 1) ? new TreeMap() : null; // generate abstracts only for combined searches
this.profileLocal = localTiming;
this.profileGlobal = remoteTiming;
this.postsort = postsort;
this.primarySearchThreads = null;
this.secondarySearchThreads = null;
this.searchtime = -1;
this.searchcount = -1;
this.preselectedPeerHashes = preselectedPeerHashes;
this.localcount = 0;
this.globalcount = 0;
}
/**
 * Returns the query held by this search event.
 *
 * @return the current value of the <code>query</code> field
 */
public plasmaSearchQuery getQuery() {
    return this.query;
}
/**
 * Returns the ranking profile held by this search event.
 *
 * @return the current value of the <code>ranking</code> field
 */
public plasmaSearchRankingProfile getRanking() {
    return this.ranking;
}
/**
 * Returns the processing profile used for the local part of the search.
 * The constructor stores its <code>localTiming</code> argument in this field.
 *
 * @return the current value of the <code>profileLocal</code> field
 */
public plasmaSearchProcessing getLocalTiming() {
    return this.profileLocal;
}
@ -90,28 +89,19 @@ public final class plasmaSearchEvent {
return secondarySearchThreads;
}
public HashMap resultProfile() {
// generate statistics about search: query, time, etc
return resultProfile(this.query, this.searchcount, this.searchtime);
public int getLocalCount() {
return this.localcount;
}
public static HashMap resultProfile(plasmaSearchQuery query, int searchcount, long searchtime) {
// generate statistics about search: query, time, etc
HashMap r = new HashMap();
r.put("queryhashes", query.queryHashes);
r.put("querystring", query.queryString);
r.put("querycount", new Integer(query.wantedResults));
r.put("querytime", new Long(query.maximumTime));
r.put("resultcount", new Integer(searchcount));
r.put("resulttime", new Long(searchtime));
return r;
public int getGlobalCount() {
return this.globalcount;
}
public plasmaSearchPostOrder search() {
public plasmaSearchPreOrder search() {
// combine all threads
long start = System.currentTimeMillis();
plasmaSearchPostOrder result;
plasmaSearchPreOrder pre;
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds
@ -195,8 +185,6 @@ public final class plasmaSearchEvent {
((secondarySearchThreads == null) || (yacySearch.remainingWaiting(secondarySearchThreads) == 0))) break; // all threads have finished
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
int globalContributions = rcContainers.size();
// finished searching
serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
@ -207,16 +195,9 @@ public final class plasmaSearchEvent {
searchResult.addAllUnique(rcContainers);
searchResult.sort();
searchResult.uniq(1000);
plasmaSearchPreOrder pre = profileLocal.preSort(query, ranking, searchResult);
result = profileLocal.urlFetch(query, ranking, wordIndex, pre);
result.localContributions = (rcLocal == null) ? 0 : rcLocal.size();
profileLocal.postSort(postsort, result);
profileLocal.applyFilter(result);
if (result != null) {
result.globalContributions = globalContributions;
}
localcount = rcLocal.size();
globalcount = rcContainers.size();
pre = new plasmaSearchPreOrder(query, profileLocal, ranking, searchResult);
} else {
Map[] searchContainerMaps = profileLocal.localSearchContainers(query, wordIndex, null);
@ -230,13 +211,8 @@ public final class plasmaSearchEvent {
0 :
profileLocal.getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size()),
query.maxDistance);
plasmaSearchPreOrder pre = profileLocal.preSort(query, ranking, rcLocal);
result = profileLocal.urlFetch(query, ranking, wordIndex, pre);
result.localContributions = (rcLocal == null) ? 0 : rcLocal.size();
profileLocal.postSort(postsort, result);
profileLocal.applyFilter(result);
result.globalContributions = 0;
this.localcount = rcLocal.size();
pre = new plasmaSearchPreOrder(query, profileLocal, ranking, rcLocal);
}
// log the event
@ -244,11 +220,9 @@ public final class plasmaSearchEvent {
// prepare values for statistics
lastEvent = this;
this.searchtime = System.currentTimeMillis() - start;
this.searchcount = result.filteredResults;
// return search result
return result;
return pre;
}
private void prepareSecondarySearch() {

@ -65,9 +65,6 @@ public final class plasmaSearchPostOrder {
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
public int globalContributions;
public int localContributions;
public int filteredResults;
public plasmaSearchPostOrder(plasmaSearchQuery query, plasmaSearchRankingProfile ranking) {
this.pageAcc = new TreeMap();
@ -75,9 +72,6 @@ public final class plasmaSearchPostOrder {
this.results = new ArrayList();
this.query = query;
this.ranking = ranking;
this.globalContributions = 0;
this.localContributions = 0;
this.filteredResults = 0;
}
public plasmaSearchPostOrder cloneSmart() {
@ -101,10 +95,6 @@ public final class plasmaSearchPostOrder {
return pageAcc.size() > 0;
}
/**
 * Returns the combined number of contributions recorded on this object.
 *
 * @return <code>localContributions + globalContributions</code>
 */
public int getTotalCount() {
    final int local = this.localContributions;
    final int global = this.globalContributions;
    return local + global;
}
public indexURLEntry nextElement() {
Object top = pageAcc.firstKey();
//System.out.println("postorder-key: " + ((String) top));

@ -76,18 +76,27 @@ public final class plasmaSearchPreOrder {
this.ranking = null;
}
public plasmaSearchPreOrder(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, indexContainer container, long maxTime) {
public plasmaSearchPreOrder(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, indexContainer container) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
this.query = query;
this.ranking = ranking;
assert (container != null);
long maxTime = process.getTargetTime(plasmaSearchProcessing.PROCESS_PRESORT);
process.startTimer();
// set limit time for interruption
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
indexRWIEntry iEntry;
// first pass: find min/max to obtain limits for normalization
Iterator i = container.entries();
int count = 0;
this.entryMin = null;
this.entryMax = null;
indexRWIEntry iEntry;
while (i.hasNext()) {
if (System.currentTimeMillis() > limitTime) break;
iEntry = (indexRWIEntry) i.next();
@ -101,6 +110,7 @@ public final class plasmaSearchPreOrder {
this.pageAcc = new TreeMap();
TreeSet searchWords = plasmaSearchQuery.cleanQuery(query.queryString)[0];
for (int j = 0; j < count; j++) {
if (System.currentTimeMillis() > limitTime) break;
iEntry = (indexRWIEntry) i.next();
if (iEntry.urlHash().length() != container.row().width(container.row().primaryKey())) continue;
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint
@ -113,13 +123,18 @@ public final class plasmaSearchPreOrder {
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry, this.entryMin, this.entryMax, searchWords), 16) + iEntry.urlHash(), iEntry);
}
this.filteredCount = pageAcc.size();
if (container.size() > query.wantedResults) remove(true, true);
process.setYieldTime(plasmaSearchProcessing.PROCESS_PRESORT);
process.setYieldCount(plasmaSearchProcessing.PROCESS_PRESORT, container.size());
}
/**
 * Returns the entry count captured by the constructor.
 * The constructor assigns <code>pageAcc.size()</code> to this field right
 * after its ranking loop, i.e. before the optional <code>remove(true, true)</code>
 * pruning is applied.
 *
 * @return the current value of the <code>filteredCount</code> field
 */
public int filteredCount() {
    return filteredCount;
}
public void remove(boolean rootDomExt, boolean doubleDom) {
private void remove(boolean rootDomExt, boolean doubleDom) {
// this removes all refererences to urls that are extended paths of existing 'RootDom'-urls
if (pageAcc.size() <= query.wantedResults) return;
HashSet rootDoms = new HashSet();

@ -28,15 +28,10 @@ package de.anomic.plasma;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog;
/**
*
@ -331,128 +326,4 @@ public class plasmaSearchProcessing implements Cloneable {
return rcLocal;
}
// presort
/**
 * Pre-sorting step: normalizes the result index (sort + uniq) and builds a
 * {@link plasmaSearchPreOrder} from it within the time budget configured
 * for <code>PROCESS_PRESORT</code>.
 *
 * Attention: if minEntries is too high, this method will not terminate
 * within the maxTime.
 *
 * @param query       the query; <code>query.wantedResults</code> decides whether pruning is applied
 * @param ranking     ranking profile handed to the pre-order
 * @param resultIndex container of index entries; must not be null; it is sorted and made unique in place
 * @return the pre-ordered result
 */
public plasmaSearchPreOrder preSort(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, indexContainer resultIndex) {
    assert (resultIndex != null);

    final long budget = this.getTargetTime(plasmaSearchProcessing.PROCESS_PRESORT);
    this.startTimer();

    // the sort/uniq pass is charged against the presort budget
    final long sortStart = System.currentTimeMillis();
    resultIndex.sort();
    resultIndex.uniq(1000);
    final long sortCost = System.currentTimeMillis() - sortStart;

    // whatever is left goes to the pre-order; a used-up budget falls back to 200 ms
    long remaining = budget - sortCost;
    if (remaining < 0) {
        remaining = 200;
    }

    final plasmaSearchPreOrder ordered = new plasmaSearchPreOrder(query, ranking, resultIndex, remaining);
    if (resultIndex.size() > query.wantedResults) {
        // more candidates than requested: drop root-domain extensions and double domains
        ordered.remove(true, true);
    }

    this.setYieldTime(plasmaSearchProcessing.PROCESS_PRESORT);
    this.setYieldCount(plasmaSearchProcessing.PROCESS_PRESORT, resultIndex.size());
    return ordered;
}
// urlfetch
/**
 * URL-fetch step: walks the pre-ordered index entries, loads the URL entry
 * for each of them and adds the ones that pass all filters to a new
 * {@link plasmaSearchPostOrder}.
 *
 * The loop ends when the pre-order is exhausted, when the time limit is
 * reached, or when five times the target count has been fetched.
 *
 * Side effect: when the query constraint carries the "index of" flag and a
 * page title does not start with "Index of", the page's references are
 * removed from the word index for every query hash.
 *
 * A kelondroException raised while loading is logged and ends the loop; the
 * entries accumulated so far are still returned. An invalid
 * <code>query.urlMask</code> is not handled here (String.matches throws
 * PatternSyntaxException).
 *
 * @param query     the query (exclude hashes, url mask, constraint, content domain)
 * @param ranking   ranking profile handed to the created post-order
 * @param wordIndex index used to load URL entries and to remove bad references
 * @param preorder  source of (index entry, pre-ranking) pairs
 * @return the filled post-order accumulator, with <code>filteredResults</code>
 *         set to <code>preorder.filteredCount()</code>
 */
public plasmaSearchPostOrder urlFetch(
plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaWordIndex wordIndex,
plasmaSearchPreOrder preorder) {
// start url-fetch
// NOTE(review): time and count limits are read from PROCESS_POSTSORT while the
// yield below is recorded under PROCESS_URLFETCH - confirm this is intended
long postorderTime = getTargetTime(plasmaSearchProcessing.PROCESS_POSTSORT);
//System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime);
// a negative target time means: no time limit
long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime);
startTimer();
plasmaSearchPostOrder acc = new plasmaSearchPostOrder(query, ranking);
indexRWIEntry entry;
indexURLEntry page;
Long preranking;
Object[] preorderEntry;
indexURLEntry.Components comp;
String pagetitle, pageurl, pageauthor;
int minEntries = getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT);
try {
ordering: while (preorder.hasNext()) {
// stop on timeout or once 5x the target count has been fetched
if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= 5 * minEntries)) break;
// each pre-order element is a pair: [0] = index entry, [1] = pre-ranking value
preorderEntry = preorder.next();
entry = (indexRWIEntry) preorderEntry[0];
// load only urls if there was not yet a root url of that hash
preranking = (Long) preorderEntry[1];
// find the url entry
page = wordIndex.loadedURL.load(entry.urlHash(), entry);
// entries whose url cannot be loaded are silently skipped
if (page != null) {
comp = page.comp();
pagetitle = comp.title().toLowerCase();
if (comp.url() == null) continue ordering; // rare case where the url is corrupted
pageurl = comp.url().toString().toLowerCase();
pageauthor = comp.author().toLowerCase();
// check exclusion
// the lower-cased title, url and author are each tested against the exclude hashes
if (plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) continue ordering;
if (plasmaSearchQuery.matches(pageurl, query.excludeHashes)) continue ordering;
if (plasmaSearchQuery.matches(pageauthor, query.excludeHashes)) continue ordering;
// check url mask
// String.matches: the mask is a regular expression that must match the whole lower-cased url
if (!(pageurl.matches(query.urlMask))) continue ordering;
// check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(comp.title().startsWith("Index of")))) {
serverLog.logFine("PLASMA", "filtered out " + comp.url().toString());
// filter out bad results
// the page is not added, and its reference is deleted from the index for every query word
Iterator wi = query.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash());
} else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
// media search: accept the page only if its counter for the requested media type is positive
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addPage(page, preranking);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addPage(page, preranking);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addPage(page, preranking);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addPage(page, preranking);
} else {
// text search: every page that passed the filters is accepted
acc.addPage(page, preranking);
}
}
}
} catch (kelondroException ee) {
// a database failure ends the fetch loop; what was accumulated so far is kept
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
}
setYieldTime(plasmaSearchProcessing.PROCESS_URLFETCH);
setYieldCount(plasmaSearchProcessing.PROCESS_URLFETCH, acc.sizeFetched());
// hand the pre-order's filtered count over to the accumulator
acc.filteredResults = preorder.filteredCount();
return acc;
}
//acc.localContributions = (resultIndex == null) ? 0 : resultIndex.size();
// postsort
/**
 * Post-sorting step: sorts the pages of the accumulator and records the
 * time and the ordered count under <code>PROCESS_POSTSORT</code>.
 *
 * @param postsort passed through to <code>acc.sortPages(...)</code>
 * @param acc      the accumulator to sort; it is modified in place
 */
public void postSort(boolean postsort, plasmaSearchPostOrder acc) {
    // measure only the sorting itself
    this.startTimer();
    acc.sortPages(postsort);
    this.setYieldTime(plasmaSearchProcessing.PROCESS_POSTSORT);
    this.setYieldCount(plasmaSearchProcessing.PROCESS_POSTSORT, acc.sizeOrdered());
}
// filter
/**
 * Filter step: removes redundant entries from the accumulator and records
 * the time and the remaining ordered count under <code>PROCESS_FILTER</code>.
 *
 * @param acc the accumulator to filter; it is modified in place
 */
public void applyFilter(plasmaSearchPostOrder acc) {
    // measure only the redundancy removal
    this.startTimer();
    acc.removeRedundant();
    this.setYieldTime(plasmaSearchProcessing.PROCESS_FILTER);
    this.setYieldCount(plasmaSearchProcessing.PROCESS_FILTER, acc.sizeOrdered());
}
}

@ -42,6 +42,7 @@
package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
@ -215,4 +216,16 @@ public final class plasmaSearchQuery {
return new String(sb);
}
/**
 * Builds a statistics record about a finished search: the query parameters
 * of this object plus the result count and time supplied by the caller.
 *
 * Keys written: "queryhashes", "querystring", "querycount" (Integer),
 * "querytime" (Long), "resultcount" (Integer), "resulttime" (Long).
 *
 * @param searchcount number of results, stored under "resultcount"
 * @param searchtime  time value, stored under "resulttime"
 * @return a new map holding the six entries listed above
 */
public HashMap resultProfile(int searchcount, long searchtime) {
    final HashMap profile = new HashMap();

    // what was asked for
    profile.put("queryhashes", this.queryHashes);
    profile.put("querystring", this.queryString);
    profile.put("querycount", new Integer(this.wantedResults));
    profile.put("querytime", new Long(this.maximumTime));

    // what came out of it
    profile.put("resultcount", new Integer(searchcount));
    profile.put("resulttime", new Long(searchtime));

    return profile;
}
}

@ -30,29 +30,117 @@ package de.anomic.plasma;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroException;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public class plasmaSearchResultAccumulator {
private ArrayList hits;
private Object[] references;
public plasmaSearchResultAccumulator(plasmaSearchEvent theSearch, plasmaWordIndex wordIndex, TreeSet blueList) {
public plasmaSearchResultAccumulator(
plasmaSearchQuery theQuery,
plasmaSearchProcessing process,
plasmaSearchRankingProfile ranking,
plasmaSearchPreOrder pre,
plasmaWordIndex wordIndex,
TreeSet blueList,
boolean overfetch) {
hits = new ArrayList();
plasmaSearchPostOrder acc = theSearch.search();
// fetch urls
// start url-fetch
long postorderTime = process.getTargetTime(plasmaSearchProcessing.PROCESS_POSTSORT);
//System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime);
long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime);
process.startTimer();
plasmaSearchPostOrder acc = new plasmaSearchPostOrder(theQuery, ranking);
indexRWIEntry rwientry;
indexURLEntry page;
Long preranking;
Object[] preorderEntry;
indexURLEntry.Components comp;
String pagetitle, pageurl, pageauthor;
int minEntries = process.getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT);
try {
ordering: while (pre.hasNext()) {
if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= ((overfetch) ? 4 : 1) * minEntries)) break;
preorderEntry = pre.next();
rwientry = (indexRWIEntry) preorderEntry[0];
// load only urls if there was not yet a root url of that hash
preranking = (Long) preorderEntry[1];
// find the url entry
page = wordIndex.loadedURL.load(rwientry.urlHash(), rwientry);
if (page != null) {
comp = page.comp();
pagetitle = comp.title().toLowerCase();
if (comp.url() == null) continue ordering; // rare case where the url is corrupted
pageurl = comp.url().toString().toLowerCase();
pageauthor = comp.author().toLowerCase();
// check exclusion
if (plasmaSearchQuery.matches(pagetitle, theQuery.excludeHashes)) continue ordering;
if (plasmaSearchQuery.matches(pageurl, theQuery.excludeHashes)) continue ordering;
if (plasmaSearchQuery.matches(pageauthor, theQuery.excludeHashes)) continue ordering;
// check url mask
if (!(pageurl.matches(theQuery.urlMask))) continue ordering;
// check constraints
if ((!(theQuery.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
(theQuery.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(comp.title().startsWith("Index of")))) {
serverLog.logFine("PLASMA", "filtered out " + comp.url().toString());
// filter out bad results
Iterator wi = theQuery.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash());
} else if (theQuery.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addPage(page, preranking);
else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addPage(page, preranking);
else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addPage(page, preranking);
else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addPage(page, preranking);
} else {
acc.addPage(page, preranking);
}
}
}
} catch (kelondroException ee) {
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
}
process.setYieldTime(plasmaSearchProcessing.PROCESS_URLFETCH);
process.setYieldCount(plasmaSearchProcessing.PROCESS_URLFETCH, acc.sizeFetched());
// start postsorting
process.startTimer();
acc.sortPages(true);
process.setYieldTime(plasmaSearchProcessing.PROCESS_POSTSORT);
process.setYieldCount(plasmaSearchProcessing.PROCESS_POSTSORT, acc.sizeOrdered());
// apply filter
process.startTimer();
acc.removeRedundant();
process.setYieldTime(plasmaSearchProcessing.PROCESS_FILTER);
process.setYieldCount(plasmaSearchProcessing.PROCESS_FILTER, acc.sizeOrdered());
// generate references
references = acc.getReferences(16);
// generate Result.Entry objects and optionally fetch snippets
int i = 0;
Entry entry;
boolean includeSnippets = false;
while ((acc.hasMoreElements()) && (i < theSearch.getQuery().wantedResults)) {
while ((acc.hasMoreElements()) && (i < theQuery.wantedResults)) {
try {
entry = new Entry(acc.nextElement(), wordIndex);
} catch (RuntimeException e) {
@ -69,7 +157,7 @@ public class plasmaSearchResultAccumulator {
*/
if (includeSnippets) {
entry.setSnippet(plasmaSnippetCache.retrieveTextSnippet(
entry.url(), theSearch.getQuery().queryHashes, false,
entry.url(), theQuery.queryHashes, false,
entry.flags().get(plasmaCondenser.flag_cat_indexof), 260,
1000));
// snippet =
@ -96,6 +184,14 @@ public class plasmaSearchResultAccumulator {
*/
}
// filter
/**
 * Currently a no-op: the body is empty and the argument is not used.
 * NOTE(review): the constructor of this class already calls
 * acc.removeRedundant() inline, so this looks like a leftover stub -
 * confirm whether any caller still needs it.
 *
 * @param acc ignored
 */
public void applyFilter(
plasmaSearchPostOrder acc) {
// intentionally left without statements
}
public int resultCount() {
return hits.size();
@ -105,6 +201,10 @@ public class plasmaSearchResultAccumulator {
return (Entry) hits.get(i);
}
/**
 * Returns the reference hints collected by the constructor
 * (the result of <code>acc.getReferences(16)</code>).
 * The internal array itself is returned, not a copy.
 *
 * @return the current value of the <code>references</code> field
 */
public Object[] references() {
    return references;
}
public static class Entry {
private indexURLEntry urlentry;
private indexURLEntry.Components urlcomps; // buffer for components
@ -186,6 +286,14 @@ public class plasmaSearchResultAccumulator {
public plasmaSnippetCache.TextSnippet textSnippet() {
return null;
}
/**
 * Generates the transport resource string for this entry.
 * When a snippet is set and reports that it exists, its raw line is passed
 * to <code>urlentry.toString(...)</code>; otherwise the plain
 * <code>urlentry.toString()</code> form is used.
 *
 * @return the string form of the url entry, with the snippet line if available
 */
public String resource() {
    final boolean withSnippet = (snippet != null) && (snippet.exists());
    return withSnippet
            ? urlentry.toString(snippet.getLineRaw())
            : urlentry.toString();
}
}
}

Loading…
Cancel
Save