From 9f929b5438cb5a39f186dfa81aa7a738e0d66988 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Tue, 13 Mar 2007 22:18:36 +0000
Subject: [PATCH] better snippet handling in case of snippet load failure; see also
 http://www.yacy-forum.de/viewtopic.php?p=31096#31096

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3475 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/DetailedSearch.java                    |   2 +-
 htroot/xml/snippet.java                       |  16 +-
 htroot/yacysearch.java                        |   2 +-
 .../de/anomic/kelondro/kelondroMSetTools.java |  10 ++
 .../de/anomic/plasma/plasmaSearchQuery.java   |  13 --
 .../de/anomic/plasma/plasmaSnippetCache.java  | 155 ++++++++----------
 .../de/anomic/plasma/plasmaSwitchboard.java   |   4 +-
 source/de/anomic/plasma/plasmaWordIndex.java  |  18 +-
 8 files changed, 106 insertions(+), 114 deletions(-)

diff --git a/htroot/DetailedSearch.java b/htroot/DetailedSearch.java
index cd25283fd..bb9ae7d5f 100644
--- a/htroot/DetailedSearch.java
+++ b/htroot/DetailedSearch.java
@@ -221,7 +221,7 @@ public class DetailedSearch {
                 return prop;
             }
             final String delHash = post.get("deleteref", "");
-            sb.wordIndex.removeReferences(query, delHash);
+            sb.wordIndex.removeWordReferences(query, delHash);
         }
         
         // prepare search order
diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java
index 316b4ba79..22f5b3dd8 100644
--- a/htroot/xml/snippet.java
+++ b/htroot/xml/snippet.java
@@ -15,7 +15,6 @@ import de.anomic.plasma.plasmaSnippetCache;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
-import de.anomic.server.logging.serverLog;
 
 public class snippet {
     public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException {
@@ -54,23 +53,20 @@ public class snippet {
         final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
         if (filtered.size() > 0) {
             kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
-        }        
+        }
         
         // find snippet
         if (media.equals("text")) {
             // attach text snippet
             plasmaSnippetCache.TextSnippet snippet = switchboard.snippetCache.retrieveTextSnippet(url, queryHashes, true, pre, 260, textsnippet_timeout);
-            prop.put("status",snippet.getSource());
-            if (snippet.getSource() < 11) {
+            prop.put("status",snippet.getErrorCode());
+            if (snippet.getErrorCode() < 11) {
+                // no problems occurred
                 //prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
                 prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
             } else {
-                String error = snippet.getError();
-                if ((remove) && (error.equals("no matching snippet found"))) {
-                    serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform());
-                    switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url));
-                }
-                prop.put("text", error);
+                // problems with snippet fetch; failConsequences removes by word hash, so it needs the hashes, not the plain words
+                prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, queryHashes) : snippet.getError());
             }
             prop.put("link", 0);
             prop.put("links", 0);
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 24003a452..4f8cd191f 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -210,7 +210,7 @@ public class yacysearch {
                 
             // delete the index entry locally
             final String delHash = post.get("deleteref", ""); // urlhash
-            sb.wordIndex.removeReferences(query, delHash);
+            sb.wordIndex.removeWordReferences(query, delHash);
 
             // make new news message with negative voting
             HashMap map = new HashMap();
diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java
index 302ee339f..877baf023 100644
--- a/source/de/anomic/kelondro/kelondroMSetTools.java
+++ b/source/de/anomic/kelondro/kelondroMSetTools.java
@@ -423,6 +423,16 @@ public class kelondroMSetTools {
         return list;
     }
     
+    public static String setToString(Set set, char separator) {
+        // joins the string forms of all members of the set, divided by the separator;
+        // append(Object) renders a null member as "null" instead of throwing
+        final StringBuffer joined = new StringBuffer(set.size() * 7);
+        final Iterator members = set.iterator();
+        if (members.hasNext()) joined.append(members.next());
+        while (members.hasNext()) joined.append(separator).append(members.next());
+        return joined.toString();
+    }
+    
     // ------------------------------------------------------------------------------------------------
 
     
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index 4fb9d26d0..a92895811 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -178,19 +178,6 @@ public final class plasmaSearchQuery {
     		return result.toString();
     }
     
-    /*
-    public String hashes(String separator) {
-		StringBuffer result = new StringBuffer(8 * queryHashes.size());
-		Iterator i = queryHashes.iterator();
-		if (i.hasNext()) result.append((String) i.next());
-		while (i.hasNext()) {
-			result.append(separator);
-			result.append((String) i.next());
-		}
-		return result.toString();
-    }
-   */
-    
     public void filterOut(Set blueList) {
         // filter out words that appear in this set
         Iterator it = queryWords.iterator();
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index ce20ef7d5..ab4c16ef2 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -62,6 +62,7 @@ import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
 import de.anomic.plasma.plasmaURL;
 import de.anomic.kelondro.kelondroMScoreCluster;
+import de.anomic.kelondro.kelondroMSetTools;
 import de.anomic.net.URL;
 import de.anomic.plasma.cache.IResourceInfo;
 import de.anomic.plasma.crawler.plasmaCrawlerException;
@@ -109,13 +110,20 @@ public class plasmaSnippetCache {
     }
     
     public class TextSnippet {
+        private URL url;
         private String line;
         private String error;
-        private int source;
-        public TextSnippet(String line, int source, String errortext) {
+        private int errorCode;
+        private Set remaingHashes;
+        public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) {
+            this.url = url;
             this.line = line;
-            this.source = source;
+            this.errorCode = errorCode;
             this.error = errortext;
+            this.remaingHashes = remaingHashes;
+        }
+        public URL getUrl() {
+            return this.url;
         }
         public boolean exists() {
             return line != null;
@@ -129,6 +137,12 @@ public class plasmaSnippetCache {
         public String getError() {
             return (error == null) ? "" : error.trim();
         }
+        public int getErrorCode() {
+            return errorCode;
+        }
+        public Set getRemainingHashes() {
+            return this.remaingHashes;
+        }
         public String getLineMarked(Set queryHashes) {
             if (line == null) return "";
             if ((queryHashes == null) || (queryHashes.size() == 0)) return line.trim();
@@ -199,9 +213,6 @@ public class plasmaSnippetCache {
             }
             return l.toString().trim();
         }
-        public int getSource() {
-            return source;
-        }
     }
     
     public class MediaSnippet {
@@ -225,7 +236,7 @@ public class plasmaSnippetCache {
         // heise = "0OQUNU3JSs05"
         if (queryhashes.size() == 0) {
             //System.out.println("found no queryhashes for URL retrieve " + url);
-            return new TextSnippet(null, ERROR_NO_HASH_GIVEN, "no query hashes given");
+            return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given");
         }
         String urlhash = plasmaURL.urlHash(url);
         
@@ -235,7 +246,7 @@ public class plasmaSnippetCache {
         String line = retrieveFromCache(wordhashes, urlhash);
         if (line != null) {
             //System.out.println("found snippet for URL " + url + " in cache: " + line);
-            return new TextSnippet(line, source, null);
+            return new TextSnippet(url, line, source, null, null);
         }
         
         /* ===========================================================================
@@ -273,15 +284,15 @@ public class plasmaSnippetCache {
                 }
                 
                 // if it is still not available, report an error
-                if (resContent == null) return new TextSnippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");                
+                if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource, plasmaHTCache.Entry cache is NULL");                
                 
                 source = SOURCE_WEB;
             } else {
-                return new TextSnippet(null, ERROR_SOURCE_LOADING, "no resource available");
+                return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "no resource available");
             }
         } catch (Exception e) {
             if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
-            return new TextSnippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
+            return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "error loading resource: " + e.getMessage());
         } 
 
         /* ===========================================================================
@@ -291,11 +302,11 @@ public class plasmaSnippetCache {
         try {
              document = parseDocument(url, resContentLength, resContent, resInfo);            
         } catch (ParserException e) {
-            return new TextSnippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
+            return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
         } finally {
             try { resContent.close(); } catch (Exception e) {/* ignore this */}
         }
-        if (document == null) return new TextSnippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
+        if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed
         
         
         /* ===========================================================================
@@ -305,8 +316,10 @@ public class plasmaSnippetCache {
 
         // compute snippet from text
         final Iterator sentences = document.getSentences(pre);
-        if (sentences == null) return new TextSnippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
-        String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
+        if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences");
+        Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
+        String textline = (tsr == null) ? null : (String) tsr[0];
+        Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
         
         // compute snippet from media
         String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
@@ -322,13 +335,13 @@ public class plasmaSnippetCache {
         //if (hrefline  != null) line += (line.length() == 0) ? hrefline  : "<br />" + hrefline;
         if (textline  != null) line += (line.length() == 0) ? textline  : "<br />" + textline;
         
-        if ((line == null) || (line.length() < 3 /*snippetMinLength*/)) return new TextSnippet(null, ERROR_NO_MATCH, "no matching snippet found");
+        if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found");
         if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
 
         // finally store this snippet in our own cache
         storeToCache(wordhashes, urlhash, line);
         document.close();
-        return new TextSnippet(line, source, null);
+        return new TextSnippet(url, line, source, null, null);
     }
 
     /**
@@ -458,34 +471,25 @@ public class plasmaSnippetCache {
         return result.substring(6);
     }
     
-    private String computeTextSnippet(Iterator sentences, Set queryhashes, int minLength, int maxLength) {
+    private Object[] /*{String - the snippet, Set - remaining hashes}*/
+            computeTextSnippet(Iterator sentences, Set queryhashes, int maxLength) {
         try {
             if (sentences == null) return null;
             if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
             Iterator j;
             HashMap hs;
-            String hash;
             StringBuffer sentence;
             TreeMap os = new TreeMap();
             int uniqCounter = 9999;
             int score;
             while (sentences.hasNext()) {
                 sentence = (StringBuffer) sentences.next();
-                //System.out.println("Snippet-Sentence :" + sentence); // DEBUG
-                if (sentence.length() > minLength) {
-                    hs = hashSentence(sentence.toString());
-                    j = queryhashes.iterator();
-                    score = 0;
-                    while (j.hasNext()) {
-                        hash = (String) j.next();
-                        if (hs.containsKey(hash)) {
-                            //System.out.println("hash " + hash + " appears in line " + i);
-                            score++;
-                        }
-                    }
-                    if (score > 0) {
-                        os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
-                    }
+                hs = hashSentence(sentence.toString());
+                j = queryhashes.iterator();
+                score = 0;
+                while (j.hasNext()) {if (hs.containsKey((String) j.next())) score++;}
+                if (score > 0) {
+                    os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
                 }
             }
             
@@ -493,21 +497,24 @@ public class plasmaSnippetCache {
             Set remaininghashes;
             while (os.size() > 0) {
                 sentence = (StringBuffer) os.remove((Integer) os.lastKey()); // sentence with the biggest score
-                result = computeTextSnippet(sentence.toString(), queryhashes, minLength, maxLength);
+                Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength);
+                if (tsr == null) continue;
+                result = (String) tsr[0];
                 if ((result != null) && (result.length() > 0)) {
-                    remaininghashes = removeAppearanceHashes(result, queryhashes);
+                    remaininghashes = (Set) tsr[1];
                     if (remaininghashes.size() == 0) {
                         // we have found the snippet
-                        return result;
+                        return new Object[]{result, remaininghashes};
                     } else if (remaininghashes.size() < queryhashes.size()) {
                         // the result has not all words in it.
                         // find another sentence that represents the missing other words
                         // and find recursively more sentences
                         maxLength = maxLength - result.length();
                         if (maxLength < 20) maxLength = 20;
-                        String nextSnippet = computeTextSnippet(os.values().iterator(), remaininghashes, minLength / 2, maxLength);
-                        if ((nextSnippet == null) || (nextSnippet.length() < (minLength / 2))) return null; // no success
-                        return result + (" / " + nextSnippet);
+                        tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength);
+                        if (tsr == null) return new Object[]{null, remaininghashes}; // no other sentence covers the missing words
+                        if (tsr[0] == null) return tsr;
+                        return new Object[]{result + (" / " + (String) tsr[0]), tsr[1]};
                     } else {
                         // error
                         //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
@@ -518,11 +525,12 @@ public class plasmaSnippetCache {
             return null;
         } catch (IndexOutOfBoundsException e) {
             log.logSevere("computeSnippet: error with string generation", e);
-            return "";
+            return new Object[]{null, queryhashes};
         }
     }
     
-    private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
+    private Object[] /*{String - the snippet, Set - remaining hashes}*/
+            computeTextSnippet(String sentence, Set queryhashes, int maxLength) {
         try {
             if (sentence == null) return null;
             if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@@ -535,10 +543,13 @@ public class plasmaSnippetCache {
             j = queryhashes.iterator();
             Integer pos;
             int p, minpos = sentence.length(), maxpos = -1;
+            HashSet remainingHashes = new HashSet();
             while (j.hasNext()) {
                 hash = (String) j.next();
                 pos = (Integer) hs.get(hash);
-                if (pos != null) {
+                if (pos == null) {
+                    remainingHashes.add(hash);
+                } else {
                     p = pos.intValue();
                     if (p > maxpos) maxpos = p;
                     if (p < minpos) minpos = p;
@@ -579,7 +590,7 @@ public class plasmaSnippetCache {
                 // trim sentence, 3rd step (cut in the middle)
                 sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
             }
-            return sentence;
+            return new Object[] {sentence, remainingHashes};
         } catch (IndexOutOfBoundsException e) {
             log.logSevere("computeSnippet: error with string generation", e);
             return null;
@@ -838,46 +849,24 @@ public class plasmaSnippetCache {
         
         return result;
     }
-    /*
-    public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
-        // fetch snippets
-        int i = 0;
-        indexURLEntry urlentry;
-        String urlstring;
-        long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
-        while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
-            urlentry = acc.nextElement();
-            indexURLEntry.Components comp = urlentry.comp();
-            if (comp.url().getHost().endsWith(".yacyh")) continue;
-            urlstring = comp.url().toNormalform();
-            if ((urlstring.matches(urlmask)) &&
-                (!(existsInCache(comp.url(), queryhashes)))) {
-                new Fetcher(comp.url(), queryhashes, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), (int) maxTime).start();
-                i++;
-            }
-        }
-    }
-        
-    public class Fetcher extends Thread {
-        URL url;
-        Set queryhashes;
-        int timeout;
-        boolean pre;
-        public Fetcher(URL url, Set queryhashes, boolean pre, int timeout) {
-            if (url.getHost().endsWith(".yacyh")) return;
-            this.url = url;
-            this.queryhashes = queryhashes;
-            this.timeout = timeout;
-            this.pre = pre;
+    
+    public String failConsequences(TextSnippet snippet, Set queryhashes) {
+        // problems with snippet fetch
+        String urlHash = plasmaURL.urlHash(snippet.getUrl());
+        String querystring = kelondroMSetTools.setToString(snippet.getRemainingHashes(), ' ');
+        if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
+            (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
+            (snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
+            (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
+            log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError());
+            sb.wordIndex.loadedURL.remove(urlHash);
+            sb.wordIndex.removeHashReferences(queryhashes, urlHash);
         }
-        public void run() {
-            log.logFine("snippetFetcher: try to get URL " + url);
-            plasmaSnippetCache.TextSnippet snippet = retrieveTextSnippet(url, queryhashes, true, pre, 260, timeout);
-            if (snippet.line == null)
-                log.logFine("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error);
-            else
-                log.logFine("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source);
+        if (snippet.getErrorCode() == ERROR_NO_MATCH) {
+            log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError());
+            sb.wordIndex.removeHashReferences(snippet.getRemainingHashes(), urlHash); // only the hashes the snippet reported as remaining
         }
+        return snippet.getError();
     }
-    */
+    
 }
\ No newline at end of file
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index d3816746f..cba80353d 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -2741,7 +2741,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                         filename = comp.url().getFile();
                         if ((seed == null) || ((address = seed.getAddress()) == null)) {
                             // seed is not known from here
-                            wordIndex.removeReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash());
+                            wordIndex.removeWordReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash());
                             wordIndex.loadedURL.remove(urlentry.hash()); // clean up
                             continue; // next result
                         }
@@ -2887,7 +2887,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             
             // delete all word references
             int count = 0;
-            if (words != null) count = wordIndex.removeReferences(words, urlhash);
+            if (words != null) count = wordIndex.removeWordReferences(words, urlhash);
             
             // finally delete the url entry itself
             wordIndex.loadedURL.remove(urlhash);
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 577a9fe76..b0d2ef52b 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -426,16 +426,26 @@ public final class plasmaWordIndex implements indexRI {
         return removed;
     }
     
-    public int removeReferences(Set words, String urlhash) {
+    public int removeWordReferences(Set words, String urlhash) {
         // sequentially delete all word references
         // returns number of deletions
         Iterator iter = words.iterator();
-        String word;
         int count = 0;
         while (iter.hasNext()) {
-            word = (String) iter.next();
             // delete the URL reference in this word index
-            if (removeEntry(plasmaCondenser.word2hash(word), urlhash)) count++;
+            if (removeEntry(plasmaCondenser.word2hash((String) iter.next()), urlhash)) count++;
+        }
+        return count;
+    }
+    
+    public int removeHashReferences(Set hashes, String urlhash) {
+        // sequentially delete the reference to urlhash from the index of every given word hash
+        // returns number of deletions
+        Iterator iter = hashes.iterator();
+        int count = 0;
+        while (iter.hasNext()) {
+            // the set members are passed on as word hashes directly (no word2hash conversion here)
+            if (removeEntry((String) iter.next(), urlhash)) count++;
         }
         return count;
     }