diff --git a/htroot/AccessTracker_p.html b/htroot/AccessTracker_p.html index 9fbf4c3dc..a5360ebb5 100644 --- a/htroot/AccessTracker_p.html +++ b/htroot/AccessTracker_p.html @@ -61,8 +61,10 @@ Offset Expected Results Returned Results - Expected Time (in ms) - Used Time (in ms) + Expected Time (ms) + Used Time (ms) + URL fetch (ms) + Snippet comp (ms) Query #{list}# @@ -74,6 +76,8 @@ #[resultcount]# #[querytime]# #[resulttime]# + #[urltime]# + #[snippettime]# #[querystring]# #{/list}# @@ -109,8 +113,10 @@ Date Expected Results Returned Results - Expected Time (in ms) - Used Time (in ms) + Expected Time (ms) + Used Time (ms) + URL fetch (ms) + Snippet comp (ms) Search Word Hashes #{list}# @@ -122,6 +128,8 @@ #[resultcount]# #[querytime]# #[resulttime]# + #[urltime]# + #[snippettime]# #[queryhashes]# #{/list}# diff --git a/htroot/AccessTracker_p.java b/htroot/AccessTracker_p.java index 9a49d0d20..2a3120759 100644 --- a/htroot/AccessTracker_p.java +++ b/htroot/AccessTracker_p.java @@ -150,6 +150,8 @@ public class AccessTracker_p { prop.put("page_list_" + entCount + "_querycount", ((Integer) searchProfile.get("querycount")).toString()); prop.put("page_list_" + entCount + "_querytime", ((Long) searchProfile.get("querytime")).toString()); prop.put("page_list_" + entCount + "_resultcount", ((Integer) searchProfile.get("resultcount")).toString()); + prop.put("page_list_" + entCount + "_urltime", ((Long) searchProfile.get("resulturltime")).toString()); + prop.put("page_list_" + entCount + "_snippettime", ((Long) searchProfile.get("resultsnippettime")).toString()); prop.put("page_list_" + entCount + "_resulttime", ((Long) searchProfile.get("resulttime")).toString()); } prop.put("page_list", m); diff --git a/htroot/AccessTracker_p.xml b/htroot/AccessTracker_p.xml index 0558e58bc..d8b2022fc 100644 --- a/htroot/AccessTracker_p.xml +++ b/htroot/AccessTracker_p.xml @@ -28,6 +28,8 @@ #[resultcount]# #[querytime]# #[resulttime]# + #[urltime]# + #[snippettime]# #[querystring]# #{/list}# @@ -54,6 +56,8 @@ #[resultcount]# #[querytime]# #[resulttime]# + #[urltime]# + #[snippettime]# #[queryhashes]# #{/list}# diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index d11ab08cc..8666da623 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -130,6 +130,7 @@ public final class search { plasmaSearchQuery theQuery = null; plasmaSearchProcessing localProcess = null; ArrayList accu = null; + long urlRetrievalAllTime = 0, snippetComputationAllTime = 0; if ((query.length() == 0) && (abstractSet != null)) { // this is _not_ a normal search, only a request for index abstracts theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint); @@ -169,6 +170,8 @@ public final class search { plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile); localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults()); plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, sb.wordIndex, null, true, abstractSet); + urlRetrievalAllTime = theSearch.getURLRetrievalTime(); + snippetComputationAllTime = theSearch.getSnippetComputationTime(); // set statistic details of search result and find best result index set if (theSearch.getLocalCount() == 0) { @@ -271,7 +274,7 @@ public final class search { // prepare search statistics Long trackerHandle = new Long(System.currentTimeMillis()); - HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp); + HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp, urlRetrievalAllTime, snippetComputationAllTime); String client = (String) header.get("CLIENTIP"); searchProfile.put("host", client); yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 4b132ca48..d1192fc97 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -304,7 +304,7 @@ public class yacysearch { // prepare search statistics Long trackerHandle = new Long(System.currentTimeMillis()); - HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp); + HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp, theSearch.getURLRetrievalTime(), theSearch.getSnippetComputationTime()); searchProfile.put("querystring", theQuery.queryString); searchProfile.put("time", trackerHandle); searchProfile.put("host", client); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 3e05eaa49..6acef49a7 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -74,6 +74,8 @@ public final class plasmaSearchEvent { //private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again private HashMap failedURLs; // a mapping from a urlhash to a fail reason string TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets + private long urlRetrievalAllTime; + private long snippetComputationAllTime; private plasmaSearchEvent(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, @@ -96,6 +98,8 @@ public final class plasmaSearchEvent { this.IAmaxcounthash = null; this.IAneardhthash = null; this.localcount = 0; + this.urlRetrievalAllTime = 0; + this.snippetComputationAllTime = 0; this.workerThreads = null; this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets //this.resultListLock = 0; // no locked elements until now @@ -248,6 +252,7 @@ public final class plasmaSearchEvent { // prepare result vector directly without worker threads int rankedIndex = 0; process.startTimer(); + while ((rankedIndex < rankedCache.container().size()) && (resultList.size() < (query.neededResults()))) { // fetch next entry to work on indexContainer c = rankedCache.container(); @@ -261,6 +266,8 @@ public final class plasmaSearchEvent { ResultEntry resultEntry = obtainResultEntry(page, false); if (resultEntry == null) continue; // the entry had some problems, cannot be used + urlRetrievalAllTime += resultEntry.dbRetrievalTime; + snippetComputationAllTime += resultEntry.snippetComputationTime; // place the result to the result vector synchronized (resultList) { @@ -312,6 +319,7 @@ public final class plasmaSearchEvent { // load only urls if there was not yet a root url of that hash // find the url entry + long startTime = System.currentTimeMillis(); indexURLEntry.Components comp = page.comp(); String pagetitle = comp.title().toLowerCase(); if (comp.url() == null) { @@ -320,7 +328,8 @@ public final class plasmaSearchEvent { } String pageurl = comp.url().toString().toLowerCase(); String pageauthor = comp.author().toLowerCase(); - + long dbRetrievalTime = System.currentTimeMillis() - startTime; + // check exclusion if ((plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) || (plasmaSearchQuery.matches(pageurl, query.excludeHashes)) || @@ -363,14 +372,17 @@ public final class plasmaSearchEvent { // load snippet if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) { // attach text snippet - plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 260, 6000); + startTime = System.currentTimeMillis(); + plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 180, 3000, (fetchSnippetOnline) ? Integer.MAX_VALUE : 100000); + long snippetComputationTime = System.currentTimeMillis() - startTime; + if (snippet.getErrorCode() < 11) { // we loaded the file and found the snippet - return new ResultEntry(page, wordIndex, snippet, null); // result with snippet attached + return new ResultEntry(page, wordIndex, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached } else if (!fetchSnippetOnline) { // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result // this may happen during a remote search, because snippet loading is omitted to retrieve results faster - return new ResultEntry(page, wordIndex, null, null); // result without snippet + return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet } else { // problems with snippet fetch registerFailure(page.hash(), "no text snippet for URL " + comp.url()); @@ -379,12 +391,15 @@ public final class plasmaSearchEvent { } } else { // attach media information + startTime = System.currentTimeMillis(); ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, fetchSnippetOnline, 6000); + long snippetComputationTime = System.currentTimeMillis() - startTime; + if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) { // found media snippets, return entry - return new ResultEntry(page, wordIndex, null, mediaSnippets); + return new ResultEntry(page, wordIndex, null, mediaSnippets, dbRetrievalTime, snippetComputationTime); } else if (!fetchSnippetOnline) { - return new ResultEntry(page, wordIndex, null, null); + return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime); } else { // problems with snippet fetch registerFailure(page.hash(), "no media snippet for URL " + comp.url()); @@ -444,6 +459,14 @@ public final class plasmaSearchEvent { public int getGlobalCount() { return this.rankedCache.getGlobalCount(); } + + public long getURLRetrievalTime() { + return this.urlRetrievalAllTime; + } + + public long getSnippetComputationTime() { + return this.snippetComputationAllTime; + } public static plasmaSearchEvent getEvent(String eventID) { synchronized (lastEvents) { @@ -545,6 +568,8 @@ public final class plasmaSearchEvent { ResultEntry resultEntry = obtainResultEntry(page, true); if (resultEntry == null) continue; // the entry had some problems, cannot be used + urlRetrievalAllTime += resultEntry.dbRetrievalTime; + snippetComputationAllTime += resultEntry.snippetComputationTime; // place the result to the result vector synchronized (resultList) { @@ -853,6 +878,7 @@ public final class plasmaSearchEvent { } public static class ResultEntry { + // payload objects private indexURLEntry urlentry; private indexURLEntry.Components urlcomps; // buffer for components private String alternative_urlstring; @@ -860,13 +886,19 @@ public final class plasmaSearchEvent { private plasmaSnippetCache.TextSnippet textSnippet; private ArrayList /* of plasmaSnippetCache.MediaSnippet */ mediaSnippets; - public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets) { + // statistic objects + public long dbRetrievalTime, snippetComputationTime; + + public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets, + long dbRetrievalTime, long snippetComputationTime) { this.urlentry = urlentry; this.urlcomps = urlentry.comp(); this.alternative_urlstring = null; this.alternative_urlname = null; this.textSnippet = textSnippet; this.mediaSnippets = mediaSnippets; + this.dbRetrievalTime = dbRetrievalTime; + this.snippetComputationTime = snippetComputationTime; String host = urlcomps.url().getHost(); if (host.endsWith(".yacyh")) { // translate host into current IP diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 0c9fd9c20..53a607978 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -239,7 +239,7 @@ public final class plasmaSearchQuery { return hashSet2hashString(this.queryHashes) + "-" + hashSet2hashString(this.excludeHashes) + ":" + this.contentdom; } - public HashMap resultProfile(int searchcount, long searchtime) { + public HashMap resultProfile(int searchcount, long searchtime, long urlretrieval, long snippetcomputation) { // generate statistics about search: query, time, etc HashMap r = new HashMap(); r.put("queryhashes", queryHashes); @@ -248,6 +248,8 @@ public final class plasmaSearchQuery { r.put("querytime", new Long(maximumTime)); r.put("resultcount", new Integer(searchcount)); r.put("resulttime", new Long(searchtime)); + r.put("resulturltime", new Long(urlretrieval)); + r.put("resultsnippettime", new Long(snippetcomputation)); return r; } } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index f13cc1711..13999e78f 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -246,7 +246,7 @@ public class plasmaSnippetCache { return retrieveFromCache(hashes, url.hash()) != null; } - public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout) { + public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) { // heise = "0OQUNU3JSs05" if (queryhashes.size() == 0) { @@ -276,7 +276,11 @@ public class plasmaSnippetCache { if (resContent != null) { // if the content was found resContentLength = plasmaHTCache.getResourceContentLength(url); - } else if (fetchOnline) { + if ((resContentLength > maxDocLen) && (!fetchOnline)) { + // content may be too large to be parsed here. To be fast, we omit calculation of snippet here + return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes"); + } + } else if (fetchOnline) { // if not found try to download it // download resource using the crawler and keep resource in memory if possible