From 9f929b5438cb5a39f186dfa81aa7a738e0d66988 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 13 Mar 2007 22:18:36 +0000 Subject: [PATCH] better snippet handling in case of snippet load fail see also http://www.yacy-forum.de/viewtopic.php?p=31096#31096 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3475 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/DetailedSearch.java | 2 +- htroot/xml/snippet.java | 16 +- htroot/yacysearch.java | 2 +- .../de/anomic/kelondro/kelondroMSetTools.java | 10 ++ .../de/anomic/plasma/plasmaSearchQuery.java | 13 -- .../de/anomic/plasma/plasmaSnippetCache.java | 155 ++++++++---------- .../de/anomic/plasma/plasmaSwitchboard.java | 4 +- source/de/anomic/plasma/plasmaWordIndex.java | 18 +- 8 files changed, 106 insertions(+), 114 deletions(-) diff --git a/htroot/DetailedSearch.java b/htroot/DetailedSearch.java index cd25283fd..bb9ae7d5f 100644 --- a/htroot/DetailedSearch.java +++ b/htroot/DetailedSearch.java @@ -221,7 +221,7 @@ public class DetailedSearch { return prop; } final String delHash = post.get("deleteref", ""); - sb.wordIndex.removeReferences(query, delHash); + sb.wordIndex.removeWordReferences(query, delHash); } // prepare search order diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java index 316b4ba79..22f5b3dd8 100644 --- a/htroot/xml/snippet.java +++ b/htroot/xml/snippet.java @@ -15,7 +15,6 @@ import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import de.anomic.server.logging.serverLog; public class snippet { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException { @@ -54,23 +53,20 @@ public class snippet { final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords); if (filtered.size() > 0) { kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords); - } + } // find snippet if (media.equals("text")) { // attach text snippet plasmaSnippetCache.TextSnippet snippet = switchboard.snippetCache.retrieveTextSnippet(url, queryHashes, true, pre, 260, textsnippet_timeout); - prop.put("status",snippet.getSource()); - if (snippet.getSource() < 11) { + prop.put("status",snippet.getErrorCode()); + if (snippet.getErrorCode() < 11) { + // no problems occurred //prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files } else { - String error = snippet.getError(); - if ((remove) && (error.equals("no matching snippet found"))) { - serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform()); - switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url)); - } - prop.put("text", error); + // problems with snippet fetch + prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, query) : snippet.getError()); } prop.put("link", 0); prop.put("links", 0); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 24003a452..4f8cd191f 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -210,7 +210,7 @@ public class yacysearch { // delete the index entry locally final String delHash = post.get("deleteref", ""); // urlhash - sb.wordIndex.removeReferences(query, delHash); + sb.wordIndex.removeWordReferences(query, delHash); // make new news message with negative voting HashMap map = new HashMap(); diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java index 302ee339f..877baf023 100644 --- a/source/de/anomic/kelondro/kelondroMSetTools.java +++ b/source/de/anomic/kelondro/kelondroMSetTools.java @@ -423,6 +423,16 @@ public class kelondroMSetTools { return list; } + public static String setToString(Set set, char separator) { + Iterator i = set.iterator(); + StringBuffer sb = new StringBuffer(set.size() * 7); + if (i.hasNext()) sb.append(i.next().toString()); + while (i.hasNext()) { + sb.append(separator).append(i.next().toString()); + } + return new String(sb); + } + // ------------------------------------------------------------------------------------------------ diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 4fb9d26d0..a92895811 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -178,19 +178,6 @@ public final class plasmaSearchQuery { return result.toString(); } - /* - public String hashes(String separator) { - StringBuffer result = new StringBuffer(8 * queryHashes.size()); - Iterator i = queryHashes.iterator(); - if (i.hasNext()) result.append((String) i.next()); - while (i.hasNext()) { - result.append(separator); - result.append((String) i.next()); - } - return result.toString(); - } - */ - public void filterOut(Set blueList) { // filter out words that appear in this set Iterator it = queryWords.iterator(); diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index ce20ef7d5..ab4c16ef2 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -62,6 +62,7 @@ import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.plasma.plasmaURL; import de.anomic.kelondro.kelondroMScoreCluster; +import de.anomic.kelondro.kelondroMSetTools; import de.anomic.net.URL; import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.crawler.plasmaCrawlerException; @@ -109,13 +110,20 @@ public class plasmaSnippetCache { } public class TextSnippet { + private URL url; private String line; private String error; - private int source; - public TextSnippet(String line, int source, String errortext) { + private int errorCode; + private Set remaingHashes; + public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) { + this.url = url; this.line = line; - this.source = source; + this.errorCode = errorCode; this.error = errortext; + this.remaingHashes = remaingHashes; + } + public URL getUrl() { + return this.url; } public boolean exists() { return line != null; @@ -129,6 +137,12 @@ public class plasmaSnippetCache { public String getError() { return (error == null) ? "" : error.trim(); } + public int getErrorCode() { + return errorCode; + } + public Set getRemainingHashes() { + return this.remaingHashes; + } public String getLineMarked(Set queryHashes) { if (line == null) return ""; if ((queryHashes == null) || (queryHashes.size() == 0)) return line.trim(); @@ -199,9 +213,6 @@ public class plasmaSnippetCache { } return l.toString().trim(); } - public int getSource() { - return source; - } } public class MediaSnippet { @@ -225,7 +236,7 @@ public class plasmaSnippetCache { // heise = "0OQUNU3JSs05" if (queryhashes.size() == 0) { //System.out.println("found no queryhashes for URL retrieve " + url); - return new TextSnippet(null, ERROR_NO_HASH_GIVEN, "no query hashes given"); + return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given"); } String urlhash = plasmaURL.urlHash(url); @@ -235,7 +246,7 @@ public class plasmaSnippetCache { String line = retrieveFromCache(wordhashes, urlhash); if (line != null) { //System.out.println("found snippet for URL " + url + " in cache: " + line); - return new TextSnippet(line, source, null); + return new TextSnippet(url, line, source, null, null); } /* =========================================================================== @@ -273,15 +284,15 @@ public class plasmaSnippetCache { } // if it is still not available, report an error - if (resContent == null) return new TextSnippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL"); + if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource, plasmaHTCache.Entry cache is NULL"); source = SOURCE_WEB; } else { - return new TextSnippet(null, ERROR_SOURCE_LOADING, "no resource available"); + return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "no resource available"); } } catch (Exception e) { if (!(e instanceof plasmaCrawlerException)) e.printStackTrace(); - return new TextSnippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage()); + return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "error loading resource: " + e.getMessage()); } /* =========================================================================== @@ -291,11 +302,11 @@ public class plasmaSnippetCache { try { document = parseDocument(url, resContentLength, resContent, resInfo); } catch (ParserException e) { - return new TextSnippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed + return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed } finally { try { resContent.close(); } catch (Exception e) {/* ignore this */} } - if (document == null) return new TextSnippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed + if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed /* =========================================================================== @@ -305,8 +316,10 @@ public class plasmaSnippetCache { // compute snippet from text final Iterator sentences = document.getSentences(pre); - if (sentences == null) return new TextSnippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences"); - String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength); + if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences"); + Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength); + String textline = (tsr == null) ? null : (String) tsr[0]; + Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1]; // compute snippet from media String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes); @@ -322,13 +335,13 @@ public class plasmaSnippetCache { //if (hrefline != null) line += (line.length() == 0) ? hrefline : "
" + hrefline; if (textline != null) line += (line.length() == 0) ? textline : "
" + textline; - if ((line == null) || (line.length() < 3 /*snippetMinLength*/)) return new TextSnippet(null, ERROR_NO_MATCH, "no matching snippet found"); + if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found"); if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength); // finally store this snippet in our own cache storeToCache(wordhashes, urlhash, line); document.close(); - return new TextSnippet(line, source, null); + return new TextSnippet(url, line, source, null, null); } /** @@ -458,34 +471,25 @@ public class plasmaSnippetCache { return result.substring(6); } - private String computeTextSnippet(Iterator sentences, Set queryhashes, int minLength, int maxLength) { + private Object[] /*{String - the snippet, Set - remaining hashes}*/ + computeTextSnippet(Iterator sentences, Set queryhashes, int maxLength) { try { if (sentences == null) return null; if ((queryhashes == null) || (queryhashes.size() == 0)) return null; Iterator j; HashMap hs; - String hash; StringBuffer sentence; TreeMap os = new TreeMap(); int uniqCounter = 9999; int score; while (sentences.hasNext()) { sentence = (StringBuffer) sentences.next(); - //System.out.println("Snippet-Sentence :" + sentence); // DEBUG - if (sentence.length() > minLength) { - hs = hashSentence(sentence.toString()); - j = queryhashes.iterator(); - score = 0; - while (j.hasNext()) { - hash = (String) j.next(); - if (hs.containsKey(hash)) { - //System.out.println("hash " + hash + " appears in line " + i); - score++; - } - } - if (score > 0) { - os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence); - } + hs = hashSentence(sentence.toString()); + j = queryhashes.iterator(); + score = 0; + while (j.hasNext()) {if (hs.containsKey((String) j.next())) score++;} + if (score > 0) { + os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence); } } @@ -493,21 +497,24 @@ public class plasmaSnippetCache { Set remaininghashes; while (os.size() > 0) { sentence = (StringBuffer) os.remove((Integer) os.lastKey()); // sentence with the biggest score - result = computeTextSnippet(sentence.toString(), queryhashes, minLength, maxLength); + Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength); + if (tsr == null) continue; + result = (String) tsr[0]; if ((result != null) && (result.length() > 0)) { - remaininghashes = removeAppearanceHashes(result, queryhashes); + remaininghashes = (Set) tsr[1]; if (remaininghashes.size() == 0) { // we have found the snippet - return result; + return new Object[]{result, remaininghashes}; } else if (remaininghashes.size() < queryhashes.size()) { // the result has not all words in it. // find another sentence that represents the missing other words // and find recursively more sentences maxLength = maxLength - result.length(); if (maxLength < 20) maxLength = 20; - String nextSnippet = computeTextSnippet(os.values().iterator(), remaininghashes, minLength / 2, maxLength); - if ((nextSnippet == null) || (nextSnippet.length() < (minLength / 2))) return null; // no success - return result + (" / " + nextSnippet); + tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength); + String nextSnippet = (String) tsr[0]; + if (nextSnippet == null) return tsr; + return new Object[]{result + (" / " + nextSnippet), tsr[1]}; } else { // error //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'"; @@ -518,11 +525,12 @@ public class plasmaSnippetCache { return null; } catch (IndexOutOfBoundsException e) { log.logSevere("computeSnippet: error with string generation", e); - return ""; + return new Object[]{null, queryhashes}; } } - private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) { + private Object[] /*{String - the snippet, Set - remaining hashes}*/ + computeTextSnippet(String sentence, Set queryhashes, int maxLength) { try { if (sentence == null) return null; if ((queryhashes == null) || (queryhashes.size() == 0)) return null; @@ -535,10 +543,13 @@ public class plasmaSnippetCache { j = queryhashes.iterator(); Integer pos; int p, minpos = sentence.length(), maxpos = -1; + HashSet remainingHashes = new HashSet(); while (j.hasNext()) { hash = (String) j.next(); pos = (Integer) hs.get(hash); - if (pos != null) { + if (pos == null) { + remainingHashes.add(hash); + } else { p = pos.intValue(); if (p > maxpos) maxpos = p; if (p < minpos) minpos = p; @@ -579,7 +590,7 @@ public class plasmaSnippetCache { // trim sentence, 3rd step (cut in the middle) sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim(); } - return sentence; + return new Object[] {sentence, remainingHashes}; } catch (IndexOutOfBoundsException e) { log.logSevere("computeSnippet: error with string generation", e); return null; @@ -838,46 +849,24 @@ public class plasmaSnippetCache { return result; } - /* - public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) { - // fetch snippets - int i = 0; - indexURLEntry urlentry; - String urlstring; - long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; - while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) { - urlentry = acc.nextElement(); - indexURLEntry.Components comp = urlentry.comp(); - if (comp.url().getHost().endsWith(".yacyh")) continue; - urlstring = comp.url().toNormalform(); - if ((urlstring.matches(urlmask)) && - (!(existsInCache(comp.url(), queryhashes)))) { - new Fetcher(comp.url(), queryhashes, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), (int) maxTime).start(); - i++; - } - } - } - - public class Fetcher extends Thread { - URL url; - Set queryhashes; - int timeout; - boolean pre; - public Fetcher(URL url, Set queryhashes, boolean pre, int timeout) { - if (url.getHost().endsWith(".yacyh")) return; - this.url = url; - this.queryhashes = queryhashes; - this.timeout = timeout; - this.pre = pre; + + public String failConsequences(TextSnippet snippet, Set queryhashes) { + // problems with snippet fetch + String urlHash = plasmaURL.urlHash(snippet.getUrl()); + String querystring = kelondroMSetTools.setToString(snippet.getRemainingHashes(), ' '); + if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) || + (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) || + (snippet.getErrorCode() == ERROR_PARSER_FAILED) || + (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) { + log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError()); + sb.wordIndex.loadedURL.remove(urlHash); + sb.wordIndex.removeHashReferences(queryhashes, urlHash); } - public void run() { - log.logFine("snippetFetcher: try to get URL " + url); - plasmaSnippetCache.TextSnippet snippet = retrieveTextSnippet(url, queryhashes, true, pre, 260, timeout); - if (snippet.line == null) - log.logFine("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error); - else - log.logFine("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source); + if (snippet.getErrorCode() == ERROR_NO_MATCH) { + log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError()); + sb.wordIndex.removeHashReferences(snippet.remaingHashes, urlHash); } + return snippet.getError(); } - */ + } \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d3816746f..cba80353d 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2741,7 +2741,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser filename = comp.url().getFile(); if ((seed == null) || ((address = seed.getAddress()) == null)) { // seed is not known from here - wordIndex.removeReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash()); + wordIndex.removeWordReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash()); wordIndex.loadedURL.remove(urlentry.hash()); // clean up continue; // next result } @@ -2887,7 +2887,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // delete all word references int count = 0; - if (words != null) count = wordIndex.removeReferences(words, urlhash); + if (words != null) count = wordIndex.removeWordReferences(words, urlhash); // finally delete the url entry itself wordIndex.loadedURL.remove(urlhash); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 577a9fe76..b0d2ef52b 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -426,16 +426,26 @@ public final class plasmaWordIndex implements indexRI { return removed; } - public int removeReferences(Set words, String urlhash) { + public int removeWordReferences(Set words, String urlhash) { // sequentially delete all word references // returns number of deletions Iterator iter = words.iterator(); - String word; int count = 0; while (iter.hasNext()) { - word = (String) iter.next(); // delete the URL reference in this word index - if (removeEntry(plasmaCondenser.word2hash(word), urlhash)) count++; + if (removeEntry(plasmaCondenser.word2hash((String) iter.next()), urlhash)) count++; + } + return count; + } + + public int removeHashReferences(Set hashes, String urlhash) { + // sequentially delete all word references + // returns number of deletions + Iterator iter = hashes.iterator(); + int count = 0; + while (iter.hasNext()) { + // delete the URL reference in this word index + if (removeEntry((String) iter.next(), urlhash)) count++; } return count; }