From 00746ca232bdbc1b331d17b880cfa053aed5fafc Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 2 Oct 2006 01:15:02 +0000 Subject: [PATCH] identified and fixed a search performance problem caused by snippet loading. Some accesses to the header-db had been performed twice or even more often in some cases. Snippet resource loading fixed. Furthermore the snippet loading during remote search within the remote peer has been disabled, but can be switched on remotely by the new flag 'includesnippet=true' git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2688 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/search.java | 11 +++++--- .../de/anomic/plasma/plasmaSnippetCache.java | 25 ++++++++----------- .../de/anomic/plasma/plasmaSwitchboard.java | 13 +++++++--- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index c621a92ea..e5d9469aa 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -91,6 +91,7 @@ public final class search { final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); final String prefer = post.get("prefer", ""); final String filter = post.get("filter", ".*"); + final boolean includesnippet = post.get("includesnippet", "false").equals("true"); // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time @@ -200,11 +201,15 @@ public final class search { plasmaSnippetCache.Snippet snippet; while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { urlentry = acc.nextElement(); - snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260); - if (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH) { + if (includesnippet) { + snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260); + } else { + snippet = null; + } + 
if ((snippet != null) && (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH)) { // suppress line: there is no match in that resource } else { - if (snippet.exists()) { + if ((snippet != null) && (snippet.exists())) { resource = urlentry.toString(snippet.getLineRaw()); } else { resource = urlentry.toString(); diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 10e06f0bf..c0ad8218a 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -192,7 +192,6 @@ public class plasmaSnippetCache { try { // trying to load the resource from the cache resource = this.cacheManager.loadResourceContent(url); - docInfo = this.cacheManager.loadResourceInfo(url); // if not found try to download it if ((resource == null) && (fetchOnline)) { @@ -200,22 +199,21 @@ public class plasmaSnippetCache { plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); // getting resource metadata (e.g. 
the http headers for http resources) - if (entry != null) { - docInfo = entry.getDocumentInfo(); - } + if (entry != null) docInfo = entry.getDocumentInfo(); - // now the resource should be stored in the cache, load body - resource = this.cacheManager.loadResourceContent(url); + // read resource body + resource = entry.cacheArray(); if (resource == null) { - //System.out.println("cannot load document for URL " + url); - return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL"); + return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL"); } source = SOURCE_WEB; } } catch (Exception e) { if (!(e instanceof plasmaCrawlerException)) e.printStackTrace(); - return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource from web: " + e.getMessage()); + return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage()); } + + if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available"); /* =========================================================================== * PARSING RESOURCE @@ -459,11 +457,12 @@ public class plasmaSnippetCache { docInfo = this.cacheManager.loadResourceInfo(url); } catch (Exception e) { // ignore this. resource info loading failed - } - + } + } + // TODO: we need a better solution here // encapsulate this in the crawlLoader class - if (url.getProtocol().startsWith("http")) { + if ((docInfo == null) && (url.getProtocol().startsWith("http"))) { // getting URL mimeType try { httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig); @@ -472,8 +471,6 @@ public class plasmaSnippetCache { // ingore this. 
http header download failed } } - - } if (docInfo == null) { String filename = this.cacheManager.getCachePath(url).getName(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 798e5e8fd..8b33ed27a 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2033,9 +2033,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String host, hash, address, descr = ""; yacySeed seed; plasmaSnippetCache.Snippet snippet; + boolean includeSnippets = false; String formerSearch = query.words(" "); long targetTime = timestamp + query.maximumTime; - if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 5000; + if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000; while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) { urlentry = acc.nextElement(); url = urlentry.url(); @@ -2076,8 +2077,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //addScoreForked(ref, gs, urlstring.split("/")); URL wordURL; if (urlstring.matches(query.urlMask)) { //.* is default - snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260); - if (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH) { + if (includeSnippets) { + snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260); + } else { + snippet = null; + } + if ((snippet != null) && (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH)) { // suppress line: there is no match in that resource } else { prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0); @@ -2097,7 +2102,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ((indexURL.probablyRootURL(urlhash)) ? 
", probablyRootURL" : "") + (((wordURL = indexURL.probablyWordURL(urlhash, query.words(""))) != null) ? ", probablyWordURL=" + wordURL.toNormalform() : "")); // adding snippet if available - if (snippet.exists()) { + if ((snippet != null) && (snippet.exists())) { prop.put("type_results_" + i + "_snippet", 1); prop.put("type_results_" + i + "_snippet_text", snippet.getLineMarked(query.queryHashes)); } else {