steps to enhance remote search performance:

- added a file size limit that disallows parsing of large documents during (offline) remote search; the effect of this limit is sketched at the end of the diff below
- added profiling information to the search result computation, visible in the search access tracker; it shows the time used for URL fetching and snippet computation (the timing pattern is sketched below, before the first hunk)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4112 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 18 years ago
parent 2f1ff048ba
commit 341f7cb327
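For orientation, the profiling added by this commit uses a plain wall-clock pattern: System.currentTimeMillis() is read before and after each phase (URL entry retrieval from the local database, snippet computation), the per-result deltas are stored in the ResultEntry, and the search event sums them up. The following is a minimal, self-contained sketch of that pattern only; the field and getter names mirror plasmaSearchEvent in the diff below, while the class name and the placeholder methods are illustrative and not part of the commit.

// Sketch of the timing pattern introduced in this commit (illustrative, not part of the diff).
public class PhaseTimingSketch {

    // per-search totals, named as in plasmaSearchEvent
    private long urlRetrievalAllTime = 0;
    private long snippetComputationAllTime = 0;

    public void timeOneResult() {
        long startTime = System.currentTimeMillis();
        fetchUrlEntry();                  // placeholder for reading the URL entry from the index/database
        long dbRetrievalTime = System.currentTimeMillis() - startTime;

        startTime = System.currentTimeMillis();
        computeSnippet();                 // placeholder for text/media snippet computation
        long snippetComputationTime = System.currentTimeMillis() - startTime;

        // the real code carries both values in the ResultEntry and accumulates them per search event
        urlRetrievalAllTime += dbRetrievalTime;
        snippetComputationAllTime += snippetComputationTime;
    }

    public long getURLRetrievalTime() { return urlRetrievalAllTime; }
    public long getSnippetComputationTime() { return snippetComputationAllTime; }

    private void fetchUrlEntry() { /* no-op in this sketch */ }
    private void computeSnippet() { /* no-op in this sketch */ }
}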

@@ -61,8 +61,10 @@
<td>Offset</td>
<td>Expected Results</td>
<td>Returned Results</td>
<td>Expected Time (in ms)</td>
<td>Used Time (in ms)</td>
<td>Expected Time (ms)</td>
<td>Used Time (ms)</td>
<td>URL fetch (ms)</td>
<td>Snippet comp (ms)</td>
<td>Query</td>
</tr>
#{list}#
@@ -74,6 +76,8 @@
<td>#[resultcount]#</td>
<td>#[querytime]#</td>
<td>#[resulttime]#</td>
<td>#[urltime]#</td>
<td>#[snippettime]#</td>
<td>#[querystring]#</td>
</tr>
#{/list}#
@@ -109,8 +113,10 @@
<td>Date</td>
<td>Expected Results</td>
<td>Returned Results</td>
<td>Expected Time (in ms)</td>
<td>Used Time (in ms)</td>
<td>Expected Time (ms)</td>
<td>Used Time (ms)</td>
<td>URL fetch (ms)</td>
<td>Snippet comp (ms)</td>
<td>Search Word Hashes</td>
</tr>
#{list}#
@@ -122,6 +128,8 @@
<td>#[resultcount]#</td>
<td>#[querytime]#</td>
<td>#[resulttime]#</td>
<td>#[urltime]#</td>
<td>#[snippettime]#</td>
<td>#[queryhashes]#</td>
</tr>
#{/list}#

@@ -150,6 +150,8 @@ public class AccessTracker_p {
prop.put("page_list_" + entCount + "_querycount", ((Integer) searchProfile.get("querycount")).toString());
prop.put("page_list_" + entCount + "_querytime", ((Long) searchProfile.get("querytime")).toString());
prop.put("page_list_" + entCount + "_resultcount", ((Integer) searchProfile.get("resultcount")).toString());
prop.put("page_list_" + entCount + "_urltime", ((Long) searchProfile.get("resulturltime")).toString());
prop.put("page_list_" + entCount + "_snippettime", ((Long) searchProfile.get("resultsnippettime")).toString());
prop.put("page_list_" + entCount + "_resulttime", ((Long) searchProfile.get("resulttime")).toString());
}
prop.put("page_list", m);

@@ -28,6 +28,8 @@
<resultcount>#[resultcount]#</resultcount>
<querytime>#[querytime]#</querytime>
<resulttime>#[resulttime]#</resulttime>
<urltime>#[urltime]#</urltime>
<snippettime>#[snippettime]#</snippettime>
<querystring>#[querystring]#</querystring>
</entry>
#{/list}#</localSearchLog>
@@ -54,6 +56,8 @@
<resultcount>#[resultcount]#</resultcount>
<querytime>#[querytime]#</querytime>
<resulttime>#[resulttime]#</resulttime>
<urltime>#[urltime]#</urltime>
<snippettime>#[snippettime]#</snippettime>
<queryhashes>#[queryhashes]#</queryhashes>
</entry>
#{/list}#</remoteSearchLog>

@@ -130,6 +130,7 @@ public final class search {
plasmaSearchQuery theQuery = null;
plasmaSearchProcessing localProcess = null;
ArrayList accu = null;
long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint);
@@ -169,6 +170,8 @@ public final class search {
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile);
localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.displayResults());
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, sb.wordIndex, null, true, abstractSet);
urlRetrievalAllTime = theSearch.getURLRetrievalTime();
snippetComputationAllTime = theSearch.getSnippetComputationTime();
// set statistic details of search result and find best result index set
if (theSearch.getLocalCount() == 0) {
@@ -271,7 +274,7 @@ public final class search {
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp);
HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp, urlRetrievalAllTime, snippetComputationAllTime);
String client = (String) header.get("CLIENTIP");
searchProfile.put("host", client);
yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false);

@@ -304,7 +304,7 @@ public class yacysearch {
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp);
HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp, theSearch.getURLRetrievalTime(), theSearch.getSnippetComputationTime());
searchProfile.put("querystring", theQuery.queryString);
searchProfile.put("time", trackerHandle);
searchProfile.put("host", client);

@@ -74,6 +74,8 @@ public final class plasmaSearchEvent {
//private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again
private HashMap failedURLs; // a mapping from a urlhash to a fail reason string
TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
private long urlRetrievalAllTime;
private long snippetComputationAllTime;
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
@@ -96,6 +98,8 @@ public final class plasmaSearchEvent {
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localcount = 0;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
this.workerThreads = null;
this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets
//this.resultListLock = 0; // no locked elements until now
@@ -248,6 +252,7 @@ public final class plasmaSearchEvent {
// prepare result vector directly without worker threads
int rankedIndex = 0;
process.startTimer();
while ((rankedIndex < rankedCache.container().size()) && (resultList.size() < (query.neededResults()))) {
// fetch next entry to work on
indexContainer c = rankedCache.container();
@@ -261,6 +266,8 @@
ResultEntry resultEntry = obtainResultEntry(page, false);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
@@ -312,6 +319,7 @@
// load only urls if there was not yet a root url of that hash
// find the url entry
long startTime = System.currentTimeMillis();
indexURLEntry.Components comp = page.comp();
String pagetitle = comp.title().toLowerCase();
if (comp.url() == null) {
@@ -320,7 +328,8 @@
}
String pageurl = comp.url().toString().toLowerCase();
String pageauthor = comp.author().toLowerCase();
long dbRetrievalTime = System.currentTimeMillis() - startTime;
// check exclusion
if ((plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) ||
(plasmaSearchQuery.matches(pageurl, query.excludeHashes)) ||
@@ -363,14 +372,17 @@
// load snippet
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// attach text snippet
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 260, 6000);
startTime = System.currentTimeMillis();
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 180, 3000, (fetchSnippetOnline) ? Integer.MAX_VALUE : 100000);
long snippetComputationTime = System.currentTimeMillis() - startTime;
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, wordIndex, snippet, null); // result with snippet attached
return new ResultEntry(page, wordIndex, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (!fetchSnippetOnline) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, wordIndex, null, null); // result without snippet
return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + comp.url());
@@ -379,12 +391,15 @@
}
} else {
// attach media information
startTime = System.currentTimeMillis();
ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, fetchSnippetOnline, 6000);
long snippetComputationTime = System.currentTimeMillis() - startTime;
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
return new ResultEntry(page, wordIndex, null, mediaSnippets);
return new ResultEntry(page, wordIndex, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (!fetchSnippetOnline) {
return new ResultEntry(page, wordIndex, null, null);
return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + comp.url());
@@ -444,6 +459,14 @@
public int getGlobalCount() {
return this.rankedCache.getGlobalCount();
}
public long getURLRetrievalTime() {
return this.urlRetrievalAllTime;
}
public long getSnippetComputationTime() {
return this.snippetComputationAllTime;
}
public static plasmaSearchEvent getEvent(String eventID) {
synchronized (lastEvents) {
@@ -545,6 +568,8 @@
ResultEntry resultEntry = obtainResultEntry(page, true);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
@@ -853,6 +878,7 @@
}
public static class ResultEntry {
// payload objects
private indexURLEntry urlentry;
private indexURLEntry.Components urlcomps; // buffer for components
private String alternative_urlstring;
@@ -860,13 +886,19 @@
private plasmaSnippetCache.TextSnippet textSnippet;
private ArrayList /* of plasmaSnippetCache.MediaSnippet */ mediaSnippets;
public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets) {
// statistic objects
public long dbRetrievalTime, snippetComputationTime;
public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets,
long dbRetrievalTime, long snippetComputationTime) {
this.urlentry = urlentry;
this.urlcomps = urlentry.comp();
this.alternative_urlstring = null;
this.alternative_urlname = null;
this.textSnippet = textSnippet;
this.mediaSnippets = mediaSnippets;
this.dbRetrievalTime = dbRetrievalTime;
this.snippetComputationTime = snippetComputationTime;
String host = urlcomps.url().getHost();
if (host.endsWith(".yacyh")) {
// translate host into current IP

@@ -239,7 +239,7 @@ public final class plasmaSearchQuery {
return hashSet2hashString(this.queryHashes) + "-" + hashSet2hashString(this.excludeHashes) + ":" + this.contentdom;
}
public HashMap resultProfile(int searchcount, long searchtime) {
public HashMap resultProfile(int searchcount, long searchtime, long urlretrieval, long snippetcomputation) {
// generate statistics about search: query, time, etc
HashMap r = new HashMap();
r.put("queryhashes", queryHashes);
@@ -248,6 +248,8 @@
r.put("querytime", new Long(maximumTime));
r.put("resultcount", new Integer(searchcount));
r.put("resulttime", new Long(searchtime));
r.put("resulturltime", new Long(urlretrieval));
r.put("resultsnippettime", new Long(snippetcomputation));
return r;
}
}

@@ -246,7 +246,7 @@ public class plasmaSnippetCache {
return retrieveFromCache(hashes, url.hash()) != null;
}
public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout) {
public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
@@ -276,7 +276,11 @@
if (resContent != null) {
// if the content was found
resContentLength = plasmaHTCache.getResourceContentLength(url);
} else if (fetchOnline) {
if ((resContentLength > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
}
} else if (fetchOnline) {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
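Putting the file size limit together: the new maxDocLen parameter only takes effect when fetchOnline is false, i.e. when a snippet has to be computed from the locally cached document during a remote search; documents whose cached size exceeds the limit are not parsed and an ERROR_SOURCE_LOADING snippet is returned instead. The caller in plasmaSearchEvent (see the retrieveTextSnippet hunk further up) passes Integer.MAX_VALUE for online fetches and 100000 bytes otherwise. Below is a minimal, self-contained sketch of just this size gate; the helper method and class names are illustrative, while the condition and the 100000 / Integer.MAX_VALUE values come from the hunks above.

// Sketch only (not part of the diff): the size gate added to snippet computation.
public class SnippetSizeGateSketch {

    // mirrors the new check in plasmaSnippetCache.retrieveTextSnippet(...):
    // when the snippet must be computed offline (fetchOnline == false),
    // documents larger than maxDocLen bytes are not parsed at all.
    static boolean mayParseForSnippet(long resContentLength, boolean fetchOnline, int maxDocLen) {
        if ((resContentLength > maxDocLen) && (!fetchOnline)) return false; // too large, skip snippet computation
        return true;
    }

    public static void main(String[] args) {
        // the caller in plasmaSearchEvent chooses the limit like this (values from the diff):
        boolean fetchSnippetOnline = false; // e.g. snippet computation for a remote search request
        int maxDocLen = (fetchSnippetOnline) ? Integer.MAX_VALUE : 100000;
        System.out.println(mayParseForSnippet(50000, fetchSnippetOnline, maxDocLen));   // true: small enough to parse
        System.out.println(mayParseForSnippet(5000000, fetchSnippetOnline, maxDocLen)); // false: "resource available, but too large"
    }
}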
