Identified and fixed a search performance problem caused by

snippet loading. Some accesses to the header-db were performed twice, or even
more often in some cases. Snippet resource loading has been fixed.
Furthermore, snippet loading during a remote search on the
remote peer has been disabled, but it can be switched on remotely with the
new flag 'includesnippet=true'.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2688 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 4d9e1b43dd
commit 00746ca232

@ -91,6 +91,7 @@ public final class search {
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
final String prefer = post.get("prefer", ""); final String prefer = post.get("prefer", "");
final String filter = post.get("filter", ".*"); final String filter = post.get("filter", ".*");
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -200,11 +201,15 @@ public final class search {
plasmaSnippetCache.Snippet snippet; plasmaSnippetCache.Snippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = acc.nextElement(); urlentry = acc.nextElement();
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260); if (includesnippet) {
if (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH) { snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260);
} else {
snippet = null;
}
if ((snippet != null) && (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH)) {
// suppress line: there is no match in that resource // suppress line: there is no match in that resource
} else { } else {
if (snippet.exists()) { if ((snippet != null) && (snippet.exists())) {
resource = urlentry.toString(snippet.getLineRaw()); resource = urlentry.toString(snippet.getLineRaw());
} else { } else {
resource = urlentry.toString(); resource = urlentry.toString();

@ -192,7 +192,6 @@ public class plasmaSnippetCache {
try { try {
// trying to load the resource from the cache // trying to load the resource from the cache
resource = this.cacheManager.loadResourceContent(url); resource = this.cacheManager.loadResourceContent(url);
docInfo = this.cacheManager.loadResourceInfo(url);
// if not found try to download it // if not found try to download it
if ((resource == null) && (fetchOnline)) { if ((resource == null) && (fetchOnline)) {
@ -200,22 +199,21 @@ public class plasmaSnippetCache {
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
// getting resource metadata (e.g. the http headers for http resources) // getting resource metadata (e.g. the http headers for http resources)
if (entry != null) { if (entry != null) docInfo = entry.getDocumentInfo();
docInfo = entry.getDocumentInfo();
}
// now the resource should be stored in the cache, load body // read resource body
resource = this.cacheManager.loadResourceContent(url); resource = entry.cacheArray();
if (resource == null) { if (resource == null) {
//System.out.println("cannot load document for URL " + url); return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
} }
source = SOURCE_WEB; source = SOURCE_WEB;
} }
} catch (Exception e) { } catch (Exception e) {
if (!(e instanceof plasmaCrawlerException)) e.printStackTrace(); if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource from web: " + e.getMessage()); return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
} }
if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available");
/* =========================================================================== /* ===========================================================================
* PARSING RESOURCE * PARSING RESOURCE
@ -459,11 +457,12 @@ public class plasmaSnippetCache {
docInfo = this.cacheManager.loadResourceInfo(url); docInfo = this.cacheManager.loadResourceInfo(url);
} catch (Exception e) { } catch (Exception e) {
// ignore this. resource info loading failed // ignore this. resource info loading failed
} }
}
// TODO: we need a better solution here // TODO: we need a better solution here
// encapsulate this in the crawlLoader class // encapsulate this in the crawlLoader class
if (url.getProtocol().startsWith("http")) { if ((docInfo == null) && (url.getProtocol().startsWith("http"))) {
// getting URL mimeType // getting URL mimeType
try { try {
httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig); httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig);
@ -472,8 +471,6 @@ public class plasmaSnippetCache {
// ingore this. http header download failed // ingore this. http header download failed
} }
} }
}
if (docInfo == null) { if (docInfo == null) {
String filename = this.cacheManager.getCachePath(url).getName(); String filename = this.cacheManager.getCachePath(url).getName();

@ -2033,9 +2033,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String host, hash, address, descr = ""; String host, hash, address, descr = "";
yacySeed seed; yacySeed seed;
plasmaSnippetCache.Snippet snippet; plasmaSnippetCache.Snippet snippet;
boolean includeSnippets = false;
String formerSearch = query.words(" "); String formerSearch = query.words(" ");
long targetTime = timestamp + query.maximumTime; long targetTime = timestamp + query.maximumTime;
if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 5000; if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000;
while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) { while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) {
urlentry = acc.nextElement(); urlentry = acc.nextElement();
url = urlentry.url(); url = urlentry.url();
@ -2076,8 +2077,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//addScoreForked(ref, gs, urlstring.split("/")); //addScoreForked(ref, gs, urlstring.split("/"));
URL wordURL; URL wordURL;
if (urlstring.matches(query.urlMask)) { //.* is default if (urlstring.matches(query.urlMask)) { //.* is default
snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260); if (includeSnippets) {
if (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH) { snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260);
} else {
snippet = null;
}
if ((snippet != null) && (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH)) {
// suppress line: there is no match in that resource // suppress line: there is no match in that resource
} else { } else {
prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0); prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0);
@ -2097,7 +2102,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
((indexURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") + ((indexURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") +
(((wordURL = indexURL.probablyWordURL(urlhash, query.words(""))) != null) ? ", probablyWordURL=" + wordURL.toNormalform() : "")); (((wordURL = indexURL.probablyWordURL(urlhash, query.words(""))) != null) ? ", probablyWordURL=" + wordURL.toNormalform() : ""));
// adding snippet if available // adding snippet if available
if (snippet.exists()) { if ((snippet != null) && (snippet.exists())) {
prop.put("type_results_" + i + "_snippet", 1); prop.put("type_results_" + i + "_snippet", 1);
prop.put("type_results_" + i + "_snippet_text", snippet.getLineMarked(query.queryHashes)); prop.put("type_results_" + i + "_snippet_text", snippet.getLineMarked(query.queryHashes));
} else { } else {

Loading…
Cancel
Save