diff --git a/htroot/AccessTracker_p.html b/htroot/AccessTracker_p.html
index 9fbf4c3dc..a5360ebb5 100644
--- a/htroot/AccessTracker_p.html
+++ b/htroot/AccessTracker_p.html
@@ -61,8 +61,10 @@
Offset |
Expected Results |
Returned Results |
- Expected Time (in ms) |
- Used Time (in ms) |
+ Expected Time (ms) |
+ Used Time (ms) |
+ URL Fetch (ms) |
+ Snippet Computation (ms) |
Query |
#{list}#
@@ -74,6 +76,8 @@
#[resultcount]# |
#[querytime]# |
#[resulttime]# |
+ #[urltime]# |
+ #[snippettime]# |
#[querystring]# |
#{/list}#
@@ -109,8 +113,10 @@
Date |
Expected Results |
Returned Results |
- Expected Time (in ms) |
- Used Time (in ms) |
+ Expected Time (ms) |
+ Used Time (ms) |
+ URL Fetch (ms) |
+ Snippet Computation (ms) |
Search Word Hashes |
#{list}#
@@ -122,6 +128,8 @@
#[resultcount]# |
#[querytime]# |
#[resulttime]# |
+ #[urltime]# |
+ #[snippettime]# |
#[queryhashes]# |
#{/list}#
diff --git a/htroot/AccessTracker_p.java b/htroot/AccessTracker_p.java
index 9a49d0d20..2a3120759 100644
--- a/htroot/AccessTracker_p.java
+++ b/htroot/AccessTracker_p.java
@@ -150,6 +150,8 @@ public class AccessTracker_p {
prop.put("page_list_" + entCount + "_querycount", ((Integer) searchProfile.get("querycount")).toString());
prop.put("page_list_" + entCount + "_querytime", ((Long) searchProfile.get("querytime")).toString());
prop.put("page_list_" + entCount + "_resultcount", ((Integer) searchProfile.get("resultcount")).toString());
+ prop.put("page_list_" + entCount + "_urltime", ((Long) searchProfile.get("resulturltime")).toString());
+ prop.put("page_list_" + entCount + "_snippettime", ((Long) searchProfile.get("resultsnippettime")).toString());
prop.put("page_list_" + entCount + "_resulttime", ((Long) searchProfile.get("resulttime")).toString());
}
prop.put("page_list", m);
diff --git a/htroot/AccessTracker_p.xml b/htroot/AccessTracker_p.xml
index 0558e58bc..d8b2022fc 100644
--- a/htroot/AccessTracker_p.xml
+++ b/htroot/AccessTracker_p.xml
@@ -28,6 +28,8 @@
#[resultcount]#
#[querytime]#
#[resulttime]#
+ #[urltime]#
+ #[snippettime]#
#[querystring]#
#{/list}#
@@ -54,6 +56,8 @@
#[resultcount]#
#[querytime]#
#[resulttime]#
+ #[urltime]#
+ #[snippettime]#
#[queryhashes]#
#{/list}#
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index d11ab08cc..8666da623 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -130,6 +130,7 @@ public final class search {
plasmaSearchQuery theQuery = null;
plasmaSearchProcessing localProcess = null;
ArrayList accu = null;
+ long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint);
@@ -169,6 +170,8 @@ public final class search {
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile);
localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults());
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, sb.wordIndex, null, true, abstractSet);
+ urlRetrievalAllTime = theSearch.getURLRetrievalTime();
+ snippetComputationAllTime = theSearch.getSnippetComputationTime();
// set statistic details of search result and find best result index set
if (theSearch.getLocalCount() == 0) {
@@ -271,7 +274,7 @@ public final class search {
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
- HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp);
+ HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp, urlRetrievalAllTime, snippetComputationAllTime);
String client = (String) header.get("CLIENTIP");
searchProfile.put("host", client);
yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false);
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 4b132ca48..d1192fc97 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -304,7 +304,7 @@ public class yacysearch {
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
- HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp);
+ HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp, theSearch.getURLRetrievalTime(), theSearch.getSnippetComputationTime());
searchProfile.put("querystring", theQuery.queryString);
searchProfile.put("time", trackerHandle);
searchProfile.put("host", client);
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 3e05eaa49..6acef49a7 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -74,6 +74,8 @@ public final class plasmaSearchEvent {
//private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again
private HashMap failedURLs; // a mapping from a urlhash to a fail reason string
TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
+ private long urlRetrievalAllTime; // sum of the time spent fetching URL entries from the index (ms)
+ private long snippetComputationAllTime; // sum of the time spent computing snippets (ms)
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
@@ -96,6 +98,8 @@ public final class plasmaSearchEvent {
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localcount = 0;
+ this.urlRetrievalAllTime = 0;
+ this.snippetComputationAllTime = 0;
this.workerThreads = null;
this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets
//this.resultListLock = 0; // no locked elements until now
@@ -248,6 +252,7 @@ public final class plasmaSearchEvent {
// prepare result vector directly without worker threads
int rankedIndex = 0;
process.startTimer();
+
while ((rankedIndex < rankedCache.container().size()) && (resultList.size() < (query.neededResults()))) {
// fetch next entry to work on
indexContainer c = rankedCache.container();
@@ -261,6 +266,8 @@ public final class plasmaSearchEvent {
ResultEntry resultEntry = obtainResultEntry(page, false);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
+ urlRetrievalAllTime += resultEntry.dbRetrievalTime;
+ snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
@@ -312,6 +319,7 @@ public final class plasmaSearchEvent {
// load only urls if there was not yet a root url of that hash
// find the url entry
+ long startTime = System.currentTimeMillis();
indexURLEntry.Components comp = page.comp();
String pagetitle = comp.title().toLowerCase();
if (comp.url() == null) {
@@ -320,7 +328,8 @@ public final class plasmaSearchEvent {
}
String pageurl = comp.url().toString().toLowerCase();
String pageauthor = comp.author().toLowerCase();
-
+ long dbRetrievalTime = System.currentTimeMillis() - startTime;
+
// check exclusion
if ((plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) ||
(plasmaSearchQuery.matches(pageurl, query.excludeHashes)) ||
@@ -363,14 +372,17 @@ public final class plasmaSearchEvent {
// load snippet
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// attach text snippet
- plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 260, 6000);
+ startTime = System.currentTimeMillis();
+ plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 180, 3000, (fetchSnippetOnline) ? Integer.MAX_VALUE : 100000);
+ long snippetComputationTime = System.currentTimeMillis() - startTime;
+
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
- return new ResultEntry(page, wordIndex, snippet, null); // result with snippet attached
+ return new ResultEntry(page, wordIndex, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (!fetchSnippetOnline) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
- return new ResultEntry(page, wordIndex, null, null); // result without snippet
+ return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + comp.url());
@@ -379,12 +391,15 @@ public final class plasmaSearchEvent {
}
} else {
// attach media information
+ startTime = System.currentTimeMillis();
ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, fetchSnippetOnline, 6000);
+ long snippetComputationTime = System.currentTimeMillis() - startTime;
+
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
- return new ResultEntry(page, wordIndex, null, mediaSnippets);
+ return new ResultEntry(page, wordIndex, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (!fetchSnippetOnline) {
- return new ResultEntry(page, wordIndex, null, null);
+ return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + comp.url());
@@ -444,6 +459,14 @@ public final class plasmaSearchEvent {
public int getGlobalCount() {
return this.rankedCache.getGlobalCount();
}
+
+ public long getURLRetrievalTime() {
+ return this.urlRetrievalAllTime;
+ }
+
+ public long getSnippetComputationTime() {
+ return this.snippetComputationAllTime;
+ }
public static plasmaSearchEvent getEvent(String eventID) {
synchronized (lastEvents) {
@@ -545,6 +568,8 @@ public final class plasmaSearchEvent {
ResultEntry resultEntry = obtainResultEntry(page, true);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
+ urlRetrievalAllTime += resultEntry.dbRetrievalTime;
+ snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
@@ -853,6 +878,7 @@ public final class plasmaSearchEvent {
}
public static class ResultEntry {
+ // payload objects
private indexURLEntry urlentry;
private indexURLEntry.Components urlcomps; // buffer for components
private String alternative_urlstring;
@@ -860,13 +886,19 @@ public final class plasmaSearchEvent {
private plasmaSnippetCache.TextSnippet textSnippet;
private ArrayList /* of plasmaSnippetCache.MediaSnippet */ mediaSnippets;
- public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets) {
+ // statistic objects
+ public long dbRetrievalTime, snippetComputationTime;
+
+ public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets,
+ long dbRetrievalTime, long snippetComputationTime) {
this.urlentry = urlentry;
this.urlcomps = urlentry.comp();
this.alternative_urlstring = null;
this.alternative_urlname = null;
this.textSnippet = textSnippet;
this.mediaSnippets = mediaSnippets;
+ this.dbRetrievalTime = dbRetrievalTime;
+ this.snippetComputationTime = snippetComputationTime;
String host = urlcomps.url().getHost();
if (host.endsWith(".yacyh")) {
// translate host into current IP
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index 0c9fd9c20..53a607978 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -239,7 +239,7 @@ public final class plasmaSearchQuery {
return hashSet2hashString(this.queryHashes) + "-" + hashSet2hashString(this.excludeHashes) + ":" + this.contentdom;
}
- public HashMap resultProfile(int searchcount, long searchtime) {
+ public HashMap resultProfile(int searchcount, long searchtime, long urlretrieval, long snippetcomputation) {
// generate statistics about search: query, time, etc
HashMap r = new HashMap();
r.put("queryhashes", queryHashes);
@@ -248,6 +248,8 @@ public final class plasmaSearchQuery {
r.put("querytime", new Long(maximumTime));
r.put("resultcount", new Integer(searchcount));
r.put("resulttime", new Long(searchtime));
+ r.put("resulturltime", new Long(urlretrieval));
+ r.put("resultsnippettime", new Long(snippetcomputation));
return r;
}
}
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index f13cc1711..13999e78f 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -246,7 +246,7 @@ public class plasmaSnippetCache {
return retrieveFromCache(hashes, url.hash()) != null;
}
- public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout) {
+ public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
@@ -276,7 +276,11 @@ public class plasmaSnippetCache {
if (resContent != null) {
// if the content was found
resContentLength = plasmaHTCache.getResourceContentLength(url);
- } else if (fetchOnline) {
+ if ((resContentLength > maxDocLen) && (!fetchOnline)) {
+ // the content may be too large to be parsed here; to be fast, we omit the snippet computation
+ return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
+ }
+ } else if (fetchOnline) {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible