From 8a4f297324ed33abed7b05914fe00303064d3335 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 30 Jun 2005 00:01:53 +0000 Subject: [PATCH] fixed/enhanced snippet error-handling; suppression of results where no snippet exists git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@347 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../de/anomic/plasma/plasmaSnippetCache.java | 32 +++++++----- .../de/anomic/plasma/plasmaSwitchboard.java | 50 +++++++++++-------- 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index d3bdbfdf3..dfe9b1a36 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -56,11 +56,17 @@ import de.anomic.yacy.yacySearch; public class plasmaSnippetCache { private static final int maxCache = 500; + public static final int SOURCE_CACHE = 0; - public static final int SOURCE_FILE = 0; - public static final int SOURCE_WEB = 0; - public static final int SOURCE_ERROR = 0; + public static final int SOURCE_FILE = 1; + public static final int SOURCE_WEB = 2; + public static final int ERROR_NO_HASH_GIVEN = 11; + public static final int ERROR_SOURCE_LOADING = 12; + public static final int ERROR_RESOURCE_LOADING = 13; + public static final int ERROR_PARSER_FAILED = 14; + public static final int ERROR_PARSER_NO_LINES = 15; + public static final int ERROR_NO_MATCH = 16; private int snippetsScoreCounter; private kelondroMScoreCluster snippetsScore; @@ -90,10 +96,10 @@ public class plasmaSnippetCache { public String line; public String error; public int source; - public result(String line, int source, String error) { + public result(String line, int source, String errortext) { this.line = line; this.source = source; - this.error = error; + this.error = errortext; } public String toString() { return line; @@ -108,21 +114,21 @@ public class plasmaSnippetCache { // heise = "0OQUNU3JSs05" if (queryhashes.size() == 0) { //System.out.println("found no queryhashes for url retrieve " + url); - return new result(null, SOURCE_ERROR, "no query hashes given"); + return new result(null, ERROR_NO_HASH_GIVEN, "no query hashes given"); } String urlhash = plasmaURL.urlHash(url); // try to get snippet from snippetCache + int source = SOURCE_CACHE; String wordhashes = yacySearch.set2string(queryhashes); String line = retrieveFromCache(wordhashes, urlhash); if (line != null) { //System.out.println("found snippet for url " + url + " in cache: " + line); - return new result(line, SOURCE_CACHE, null); + return new result(line, source, null); } // if the snippet is not in the cache, we can try to get it from the htcache byte[] resource = null; - int source = SOURCE_CACHE; try { resource = cacheManager.loadResource(url); if ((fetchOnline) && (resource == null)) { @@ -131,27 +137,27 @@ public class plasmaSnippetCache { source = SOURCE_WEB; } } catch (IOException e) { - return new result(null, SOURCE_ERROR, "error loading resource from web: " + e.getMessage()); + return new result(null, ERROR_SOURCE_LOADING, "error loading resource from web: " + e.getMessage()); } if (resource == null) { //System.out.println("cannot load document for url " + url); - return new result(null, SOURCE_ERROR, "error loading resource from web, cacheManager returned NULL"); + return new result(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL"); } plasmaParserDocument document = parseDocument(url, resource); - if (document == null) return new result(null, SOURCE_ERROR, "parser error/failed"); // cannot be parsed + if (document == null) return new result(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed //System.out.println("loaded document for url " + url); String[] sentences = document.getSentences(); //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]); if ((sentences == null) || (sentences.length == 0)) { //System.out.println("found no sentences in url " + url); - return new result(null, SOURCE_ERROR, "parser returned no sentences"); + return new result(null, ERROR_PARSER_NO_LINES, "parser returned no sentences"); } // we have found a parseable non-empty file: use the lines line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120); //System.out.println("loaded snippet for url " + url + ": " + line); - if (line == null) return new result(null, SOURCE_ERROR, "no matching snippet found"); + if (line == null) return new result(null, ERROR_NO_MATCH, "no matching snippet found"); if (line.length() > 120) line = line.substring(0, 120); // finally store this snippet in our own cache diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 1c4ec2b46..639131e7b 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -374,9 +374,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // test routine for snippet fetch // url = /www.heise.de/mobil/newsticker/meldung/mail/54980 - Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise' + //Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise' //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true); - plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true); + //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true); } private static String ppRamString(int bytes) { @@ -1200,7 +1200,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logDebug("snippetFetcher: try to get URL " + url); plasmaSnippetCache.result snippet = snippetCache.retrieve(url, queryhashes, true); if (snippet.line == null) - log.logDebug("snippetFetcher: cannot get URL " + url + ". error: " + snippet.error); + log.logDebug("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error); else log.logDebug("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source); } @@ -1313,20 +1313,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //addScoreForked(ref, gs, descr.split(" ")); //addScoreForked(ref, gs, urlstring.split("/")); if (urlstring.matches(urlmask)) { //.* is default - prop.put("results_" + i + "_description", descr); - prop.put("results_" + i + "_url", urlstring); - prop.put("results_" + i + "_urlname", urlname); - prop.put("results_" + i + "_date", dateString(urlentry.moddate())); - prop.put("results_" + i + "_size", Long.toString(urlentry.size())); snippet = snippetCache.retrieve(url, queryhashes, false); - if (snippet.line == null) { - prop.put("results_" + i + "_snippet", 0); - prop.put("results_" + i + "_snippet_text", ""); + if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) { + // suppress line: there is no match in that resource } else { - prop.put("results_" + i + "_snippet", 1); - prop.put("results_" + i + "_snippet_text", snippet.line); + prop.put("results_" + i + "_description", descr); + prop.put("results_" + i + "_url", urlstring); + prop.put("results_" + i + "_urlname", urlname); + prop.put("results_" + i + "_date", dateString(urlentry.moddate())); + prop.put("results_" + i + "_size", Long.toString(urlentry.size())); + if (snippet.line == null) { + prop.put("results_" + i + "_snippet", 0); + prop.put("results_" + i + "_snippet_text", ""); + } else { + prop.put("results_" + i + "_snippet", 1); + prop.put("results_" + i + "_snippet_text", snippet.line); + } + i++; } - i++; } } log.logDebug("SEARCH TIME AFTER RESULT PREPARATION: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); @@ -1396,14 +1400,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser while ((acc.hasMoreElements()) && (i < count)) { urlentry = acc.nextElement(); snippet = snippetCache.retrieve(urlentry.url(), hashes, false); - if (snippet.line == null) { - resource = urlentry.toString(); + if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) { + // suppress line: there is no match in that resource } else { - resource = urlentry.toString(snippet.line); - } - if (resource != null) { - links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString); - i++; + if (snippet.line == null) { + resource = urlentry.toString(); + } else { + resource = urlentry.toString(snippet.line); + } + if (resource != null) { + links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString); + i++; + } } } prop.put("links", links.toString());