steps to enhance remote search performance:

- added a file size limit that disallows parsing of large documents during (offline) remote search; the effect of this limit is sketched at the end of the diff below
- added profiling information to the search result computation, visible in the search access tracker; it shows the time used for URL fetching and snippet computation (the timing pattern is sketched below, before the first hunk)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4112 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 18 years ago
parent 2f1ff048ba
commit 341f7cb327
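For orientation, the profiling added by this commit uses a plain wall-clock pattern: System.currentTimeMillis() is read before and after each phase (URL entry retrieval from the local database, snippet computation), the per-result deltas are stored in the ResultEntry, and the search event sums them up. The following is a minimal, self-contained sketch of that pattern only; the field and getter names mirror plasmaSearchEvent in the diff below, while the class name and the placeholder methods are illustrative and not part of the commit.

// Sketch of the timing pattern introduced in this commit (illustrative, not part of the diff).
public class PhaseTimingSketch {

    // per-search totals, named as in plasmaSearchEvent
    private long urlRetrievalAllTime = 0;
    private long snippetComputationAllTime = 0;

    public void timeOneResult() {
        long startTime = System.currentTimeMillis();
        fetchUrlEntry();                  // placeholder for reading the URL entry from the index/database
        long dbRetrievalTime = System.currentTimeMillis() - startTime;

        startTime = System.currentTimeMillis();
        computeSnippet();                 // placeholder for text/media snippet computation
        long snippetComputationTime = System.currentTimeMillis() - startTime;

        // the real code carries both values in the ResultEntry and accumulates them per search event
        urlRetrievalAllTime += dbRetrievalTime;
        snippetComputationAllTime += snippetComputationTime;
    }

    public long getURLRetrievalTime() { return urlRetrievalAllTime; }
    public long getSnippetComputationTime() { return snippetComputationAllTime; }

    private void fetchUrlEntry() { /* no-op in this sketch */ }
    private void computeSnippet() { /* no-op in this sketch */ }
}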

@@ -61,8 +61,10 @@
<td>Offset</td>
<td>Expected Results</td>
<td>Returned Results</td>
<td>Expected Time (in ms)</td>
<td>Used Time (in ms)</td>
<td>Expected Time (ms)</td>
<td>Used Time (ms)</td>
<td>URL fetch (ms)</td>
<td>Snippet comp (ms)</td>
<td>Query</td>
</tr>
#{list}#
@@ -74,6 +76,8 @@
<td>#[resultcount]#</td>
<td>#[querytime]#</td>
<td>#[resulttime]#</td>
<td>#[urltime]#</td>
<td>#[snippettime]#</td>
<td>#[querystring]#</td>
</tr>
#{/list}#
@@ -109,8 +113,10 @@
<td>Date</td>
<td>Expected Results</td>
<td>Returned Results</td>
<td>Expected Time (in ms)</td>
<td>Used Time (in ms)</td>
<td>Expected Time (ms)</td>
<td>Used Time (ms)</td>
<td>URL fetch (ms)</td>
<td>Snippet comp (ms)</td>
<td>Search Word Hashes</td>
</tr>
#{list}#
@@ -122,6 +128,8 @@
<td>#[resultcount]#</td>
<td>#[querytime]#</td>
<td>#[resulttime]#</td>
<td>#[urltime]#</td>
<td>#[snippettime]#</td>
<td>#[queryhashes]#</td>
</tr>
#{/list}#

@@ -150,6 +150,8 @@ public class AccessTracker_p {
prop.put("page_list_" + entCount + "_querycount", ((Integer) searchProfile.get("querycount")).toString());
prop.put("page_list_" + entCount + "_querytime", ((Long) searchProfile.get("querytime")).toString());
prop.put("page_list_" + entCount + "_resultcount", ((Integer) searchProfile.get("resultcount")).toString());
prop.put("page_list_" + entCount + "_urltime", ((Long) searchProfile.get("resulturltime")).toString());
prop.put("page_list_" + entCount + "_snippettime", ((Long) searchProfile.get("resultsnippettime")).toString());
prop.put("page_list_" + entCount + "_resulttime", ((Long) searchProfile.get("resulttime")).toString());
}
prop.put("page_list", m);

@@ -28,6 +28,8 @@
<resultcount>#[resultcount]#</resultcount>
<querytime>#[querytime]#</querytime>
<resulttime>#[resulttime]#</resulttime>
<urltime>#[urltime]#</urltime>
<snippettime>#[snippettime]#</snippettime>
<querystring>#[querystring]#</querystring>
</entry>
#{/list}#</localSearchLog>
@@ -54,6 +56,8 @@
<resultcount>#[resultcount]#</resultcount>
<querytime>#[querytime]#</querytime>
<resulttime>#[resulttime]#</resulttime>
<urltime>#[urltime]#</urltime>
<snippettime>#[snippettime]#</snippettime>
<queryhashes>#[queryhashes]#</queryhashes>
</entry>
#{/list}#</remoteSearchLog>

@@ -130,6 +130,7 @@ public final class search {
plasmaSearchQuery theQuery = null;
plasmaSearchProcessing localProcess = null;
ArrayList accu = null;
long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint);
@@ -169,6 +170,8 @@ public final class search {
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile);
localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults());
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, sb.wordIndex, null, true, abstractSet);
urlRetrievalAllTime = theSearch.getURLRetrievalTime();
snippetComputationAllTime = theSearch.getSnippetComputationTime();
// set statistic details of search result and find best result index set
if (theSearch.getLocalCount() == 0) {
@@ -271,7 +274,7 @@ public final class search {
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp);
HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp, urlRetrievalAllTime, snippetComputationAllTime);
String client = (String) header.get("CLIENTIP");
searchProfile.put("host", client);
yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false);

@@ -304,7 +304,7 @@ public class yacysearch {
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp);
HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp, theSearch.getURLRetrievalTime(), theSearch.getSnippetComputationTime());
searchProfile.put("querystring", theQuery.queryString);
searchProfile.put("time", trackerHandle);
searchProfile.put("host", client);

@@ -74,6 +74,8 @@ public final class plasmaSearchEvent {
//private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again
private HashMap failedURLs; // a mapping from a urlhash to a fail reason string
TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
private long urlRetrievalAllTime;
private long snippetComputationAllTime;
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
@@ -96,6 +98,8 @@ public final class plasmaSearchEvent {
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localcount = 0;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
this.workerThreads = null;
this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets
//this.resultListLock = 0; // no locked elements until now
@@ -248,6 +252,7 @@ public final class plasmaSearchEvent {
// prepare result vector directly without worker threads
int rankedIndex = 0;
process.startTimer();
while ((rankedIndex < rankedCache.container().size()) && (resultList.size() < (query.neededResults()))) {
// fetch next entry to work on
indexContainer c = rankedCache.container();
@@ -261,6 +266,8 @@
ResultEntry resultEntry = obtainResultEntry(page, false);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
@@ -312,6 +319,7 @@
// load only urls if there was not yet a root url of that hash
// find the url entry
long startTime = System.currentTimeMillis();
indexURLEntry.Components comp = page.comp();
String pagetitle = comp.title().toLowerCase();
if (comp.url() == null) {
@@ -320,7 +328,8 @@
}
String pageurl = comp.url().toString().toLowerCase();
String pageauthor = comp.author().toLowerCase();
long dbRetrievalTime = System.currentTimeMillis() - startTime;
// check exclusion
if ((plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) ||
(plasmaSearchQuery.matches(pageurl, query.excludeHashes)) ||
@@ -363,14 +372,17 @@
// load snippet
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// attach text snippet
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 260, 6000);
startTime = System.currentTimeMillis();
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 180, 3000, (fetchSnippetOnline) ? Integer.MAX_VALUE : 100000);
long snippetComputationTime = System.currentTimeMillis() - startTime;
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, wordIndex, snippet, null); // result with snippet attached
return new ResultEntry(page, wordIndex, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (!fetchSnippetOnline) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, wordIndex, null, null); // result without snippet
return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + comp.url());
@@ -379,12 +391,15 @@
}
} else {
// attach media information
startTime = System.currentTimeMillis();
ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, fetchSnippetOnline, 6000);
long snippetComputationTime = System.currentTimeMillis() - startTime;
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
return new ResultEntry(page, wordIndex, null, mediaSnippets);
return new ResultEntry(page, wordIndex, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (!fetchSnippetOnline) {
return new ResultEntry(page, wordIndex, null, null);
return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + comp.url());
@@ -444,6 +459,14 @@
public int getGlobalCount() {
return this.rankedCache.getGlobalCount();
}
public long getURLRetrievalTime() {
return this.urlRetrievalAllTime;
}
public long getSnippetComputationTime() {
return this.snippetComputationAllTime;
}
public static plasmaSearchEvent getEvent(String eventID) {
synchronized (lastEvents) {
@@ -545,6 +568,8 @@
ResultEntry resultEntry = obtainResultEntry(page, true);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
@@ -853,6 +878,7 @@
}
public static class ResultEntry {
// payload objects
private indexURLEntry urlentry;
private indexURLEntry.Components urlcomps; // buffer for components
private String alternative_urlstring;
@@ -860,13 +886,19 @@
private plasmaSnippetCache.TextSnippet textSnippet;
private ArrayList /* of plasmaSnippetCache.MediaSnippet */ mediaSnippets;
public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets) {
// statistic objects
public long dbRetrievalTime, snippetComputationTime;
public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets,
long dbRetrievalTime, long snippetComputationTime) {
this.urlentry = urlentry;
this.urlcomps = urlentry.comp();
this.alternative_urlstring = null;
this.alternative_urlname = null;
this.textSnippet = textSnippet;
this.mediaSnippets = mediaSnippets;
this.dbRetrievalTime = dbRetrievalTime;
this.snippetComputationTime = snippetComputationTime;
String host = urlcomps.url().getHost();
if (host.endsWith(".yacyh")) {
// translate host into current IP

@@ -239,7 +239,7 @@ public final class plasmaSearchQuery {
return hashSet2hashString(this.queryHashes) + "-" + hashSet2hashString(this.excludeHashes) + ":" + this.contentdom;
}
public HashMap resultProfile(int searchcount, long searchtime) {
public HashMap resultProfile(int searchcount, long searchtime, long urlretrieval, long snippetcomputation) {
// generate statistics about search: query, time, etc
HashMap r = new HashMap();
r.put("queryhashes", queryHashes);
@@ -248,6 +248,8 @@
r.put("querytime", new Long(maximumTime));
r.put("resultcount", new Integer(searchcount));
r.put("resulttime", new Long(searchtime));
r.put("resulturltime", new Long(urlretrieval));
r.put("resultsnippettime", new Long(snippetcomputation));
return r;
}
}

@@ -246,7 +246,7 @@ public class plasmaSnippetCache {
return retrieveFromCache(hashes, url.hash()) != null;
}
public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout) {
public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
@@ -276,7 +276,11 @@
if (resContent != null) {
// if the content was found
resContentLength = plasmaHTCache.getResourceContentLength(url);
} else if (fetchOnline) {
if ((resContentLength > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
}
} else if (fetchOnline) {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
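Putting the file size limit together: the new maxDocLen parameter only takes effect when fetchOnline is false, i.e. when a snippet has to be computed from the locally cached document during a remote search; documents whose cached size exceeds the limit are not parsed and an ERROR_SOURCE_LOADING snippet is returned instead. The caller in plasmaSearchEvent (see the retrieveTextSnippet hunk further up) passes Integer.MAX_VALUE for online fetches and 100000 bytes otherwise. Below is a minimal, self-contained sketch of just this size gate; the helper method and class names are illustrative, while the condition and the 100000 / Integer.MAX_VALUE values come from the hunks above.

// Sketch only (not part of the diff): the size gate added to snippet computation.
public class SnippetSizeGateSketch {

    // mirrors the new check in plasmaSnippetCache.retrieveTextSnippet(...):
    // when the snippet must be computed offline (fetchOnline == false),
    // documents larger than maxDocLen bytes are not parsed at all.
    static boolean mayParseForSnippet(long resContentLength, boolean fetchOnline, int maxDocLen) {
        if ((resContentLength > maxDocLen) && (!fetchOnline)) return false; // too large, skip snippet computation
        return true;
    }

    public static void main(String[] args) {
        // the caller in plasmaSearchEvent chooses the limit like this (values from the diff):
        boolean fetchSnippetOnline = false; // e.g. snippet computation for a remote search request
        int maxDocLen = (fetchSnippetOnline) ? Integer.MAX_VALUE : 100000;
        System.out.println(mayParseForSnippet(50000, fetchSnippetOnline, maxDocLen));   // true: small enough to parse
        System.out.println(mayParseForSnippet(5000000, fetchSnippetOnline, maxDocLen)); // false: "resource available, but too large"
    }
}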
