Better snippet handling when loading a snippet fails

See also: http://www.yacy-forum.de/viewtopic.php?p=31096#31096
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3475 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · 9f929b5438
parent d451ad48d3
commit 9f929b5438
8 changed files with 106 additions and 114 deletions
--- a/htroot/DetailedSearch.java
+++ b/htroot/DetailedSearch.java
@ -221,7 +221,7 @@ public class DetailedSearch {
                return prop;
            }
            final String delHash = post.get("deleteref", "");
-            sb.wordIndex.removeReferences(query, delHash);
+            sb.wordIndex.removeWordReferences(query, delHash);
        }
        
        // prepare search order
--- a/htroot/xml/snippet.java
+++ b/htroot/xml/snippet.java
@ -15,7 +15,6 @@ import de.anomic.plasma.plasmaSnippetCache;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
-import de.anomic.server.logging.serverLog;

 public class snippet {
    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException {
@ -54,23 +53,20 @@ public class snippet {
        final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
        if (filtered.size() > 0) {
            kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
-        }        
+        }
        
        // find snippet
        if (media.equals("text")) {
            // attach text snippet
            plasmaSnippetCache.TextSnippet snippet = switchboard.snippetCache.retrieveTextSnippet(url, queryHashes, true, pre, 260, textsnippet_timeout);
-            prop.put("status",snippet.getSource());
-            if (snippet.getSource() < 11) {
+            prop.put("status",snippet.getErrorCode());
+            if (snippet.getErrorCode() < 11) {
+                // no problems occurred
                //prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
                prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
            } else {
-                String error = snippet.getError();
-                if ((remove) && (error.equals("no matching snippet found"))) {
-                    serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform());
-                    switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url));
-                }
-                prop.put("text", error);
+                // problems with snippet fetch
+               prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, query) : snippet.getError());
            }
            prop.put("link", 0);
            prop.put("links", 0);
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@ -210,7 +210,7 @@ public class yacysearch {
                
            // delete the index entry locally
            final String delHash = post.get("deleteref", ""); // urlhash
-            sb.wordIndex.removeReferences(query, delHash);
+            sb.wordIndex.removeWordReferences(query, delHash);

            // make new news message with negative voting
            HashMap map = new HashMap();
--- a/source/de/anomic/kelondro/kelondroMSetTools.java
+++ b/source/de/anomic/kelondro/kelondroMSetTools.java
@ -423,6 +423,16 @@ public class kelondroMSetTools {
        return list;
    }
    
+    public static String setToString(Set set, char separator) { // joins the toString() of every element of 'set', in the set's iteration order, with 'separator' between elements
+        Iterator i = set.iterator(); // NOTE(review): no null check - a null 'set' fails here with a NullPointerException
+        StringBuffer sb = new StringBuffer(set.size() * 7); // initial capacity only; presumably ~7 chars per element is a sizing guess - the buffer grows as needed
+        if (i.hasNext()) sb.append(i.next().toString()); // the first element is written without a leading separator
+        while (i.hasNext()) {
+            sb.append(separator).append(i.next().toString()); // every further element is preceded by the separator
+        }
+        return new String(sb); // an empty set yields the empty string
+    }
+    
    // ------------------------------------------------------------------------------------------------

    
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@ -178,19 +178,6 @@ public final class plasmaSearchQuery {
    		return result.toString();
    }
    
-    /*
-    public String hashes(String separator) {
-		StringBuffer result = new StringBuffer(8 * queryHashes.size());
-		Iterator i = queryHashes.iterator();
-		if (i.hasNext()) result.append((String) i.next());
-		while (i.hasNext()) {
-			result.append(separator);
-			result.append((String) i.next());
-		}
-		return result.toString();
-    }
-   */
-    
    public void filterOut(Set blueList) {
        // filter out words that appear in this set
        Iterator it = queryWords.iterator();
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@ -62,6 +62,7 @@ import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
 import de.anomic.plasma.plasmaURL;
 import de.anomic.kelondro.kelondroMScoreCluster;
+import de.anomic.kelondro.kelondroMSetTools;
 import de.anomic.net.URL;
 import de.anomic.plasma.cache.IResourceInfo;
 import de.anomic.plasma.crawler.plasmaCrawlerException;
@ -109,13 +110,20 @@ public class plasmaSnippetCache {
    }
    
    public class TextSnippet {
+        private URL url;
        private String line;
        private String error;
-        private int source;
-        public TextSnippet(String line, int source, String errortext) {
+        private int errorCode;
+        private Set remaingHashes;
+        public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) {
+            this.url = url;
            this.line = line;
-            this.source = source;
+            this.errorCode = errorCode;
            this.error = errortext;
+            this.remaingHashes = remaingHashes;
+        }
+        public URL getUrl() {
+            return this.url;
        }
        public boolean exists() {
            return line != null;
@ -129,6 +137,12 @@ public class plasmaSnippetCache {
        public String getError() {
            return (error == null) ? "" : error.trim();
        }
+        public int getErrorCode() {
+            return errorCode;
+        }
+        public Set getRemainingHashes() {
+            return this.remaingHashes;
+        }
        public String getLineMarked(Set queryHashes) {
            if (line == null) return "";
            if ((queryHashes == null) || (queryHashes.size() == 0)) return line.trim();
@ -199,9 +213,6 @@ public class plasmaSnippetCache {
            }
            return l.toString().trim();
        }
-        public int getSource() {
-            return source;
-        }
    }
    
    public class MediaSnippet {
@ -225,7 +236,7 @@ public class plasmaSnippetCache {
        // heise = "0OQUNU3JSs05"
        if (queryhashes.size() == 0) {
            //System.out.println("found no queryhashes for URL retrieve " + url);
-            return new TextSnippet(null, ERROR_NO_HASH_GIVEN, "no query hashes given");
+            return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given");
        }
        String urlhash = plasmaURL.urlHash(url);
        
@ -235,7 +246,7 @@ public class plasmaSnippetCache {
        String line = retrieveFromCache(wordhashes, urlhash);
        if (line != null) {
            //System.out.println("found snippet for URL " + url + " in cache: " + line);
-            return new TextSnippet(line, source, null);
+            return new TextSnippet(url, line, source, null, null);
        }
        
        /* ===========================================================================
@ -273,15 +284,15 @@ public class plasmaSnippetCache {
                }
                
                // if it is still not available, report an error
-                if (resContent == null) return new TextSnippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");                
+                if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource, plasmaHTCache.Entry cache is NULL");                
                
                source = SOURCE_WEB;
            } else {
-                return new TextSnippet(null, ERROR_SOURCE_LOADING, "no resource available");
+                return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "no resource available");
            }
        } catch (Exception e) {
            if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
-            return new TextSnippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
+            return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "error loading resource: " + e.getMessage());
        } 

        /* ===========================================================================
@ -291,11 +302,11 @@ public class plasmaSnippetCache {
        try {
             document = parseDocument(url, resContentLength, resContent, resInfo);            
        } catch (ParserException e) {
-            return new TextSnippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
+            return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
        } finally {
            try { resContent.close(); } catch (Exception e) {/* ignore this */}
        }
-        if (document == null) return new TextSnippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
+        if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed
        
        
        /* ===========================================================================
@ -305,8 +316,10 @@ public class plasmaSnippetCache {

        // compute snippet from text
        final Iterator sentences = document.getSentences(pre);
-        if (sentences == null) return new TextSnippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
-        String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
+        if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences");
+        Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
+        String textline = (tsr == null) ? null : (String) tsr[0];
+        Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
        
        // compute snippet from media
        String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
@ -322,13 +335,13 @@ public class plasmaSnippetCache {
        //if (hrefline  != null) line += (line.length() == 0) ? hrefline  : "<br />" + hrefline;
        if (textline  != null) line += (line.length() == 0) ? textline  : "<br />" + textline;
        
-        if ((line == null) || (line.length() < 3 /*snippetMinLength*/)) return new TextSnippet(null, ERROR_NO_MATCH, "no matching snippet found");
+        if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found");
        if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);

        // finally store this snippet in our own cache
        storeToCache(wordhashes, urlhash, line);
        document.close();
-        return new TextSnippet(line, source, null);
+        return new TextSnippet(url, line, source, null, null);
    }

    /**
@ -458,34 +471,25 @@ public class plasmaSnippetCache {
        return result.substring(6);
    }
    
-    private String computeTextSnippet(Iterator sentences, Set queryhashes, int minLength, int maxLength) {
+    private Object[] /*{String - the snippet, Set - remaining hashes}*/
+            computeTextSnippet(Iterator sentences, Set queryhashes, int maxLength) {
        try {
            if (sentences == null) return null;
            if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
            Iterator j;
            HashMap hs;
-            String hash;
            StringBuffer sentence;
            TreeMap os = new TreeMap();
            int uniqCounter = 9999;
            int score;
            while (sentences.hasNext()) {
                sentence = (StringBuffer) sentences.next();
-                //System.out.println("Snippet-Sentence :" + sentence); // DEBUG
-                if (sentence.length() > minLength) {
-                    hs = hashSentence(sentence.toString());
-                    j = queryhashes.iterator();
-                    score = 0;
-                    while (j.hasNext()) {
-                        hash = (String) j.next();
-                        if (hs.containsKey(hash)) {
-                            //System.out.println("hash " + hash + " appears in line " + i);
-                            score++;
-                        }
-                    }
-                    if (score > 0) {
-                        os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
-                    }
+                hs = hashSentence(sentence.toString());
+                j = queryhashes.iterator();
+                score = 0;
+                while (j.hasNext()) {if (hs.containsKey((String) j.next())) score++;}
+                if (score > 0) {
+                    os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
                }
            }
            
@ -493,21 +497,24 @@ public class plasmaSnippetCache {
            Set remaininghashes;
            while (os.size() > 0) {
                sentence = (StringBuffer) os.remove((Integer) os.lastKey()); // sentence with the biggest score
-                result = computeTextSnippet(sentence.toString(), queryhashes, minLength, maxLength);
+                Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength);
+                if (tsr == null) continue;
+                result = (String) tsr[0];
                if ((result != null) && (result.length() > 0)) {
-                    remaininghashes = removeAppearanceHashes(result, queryhashes);
+                    remaininghashes = (Set) tsr[1];
                    if (remaininghashes.size() == 0) {
                        // we have found the snippet
-                        return result;
+                        return new Object[]{result, remaininghashes};
                    } else if (remaininghashes.size() < queryhashes.size()) {
                        // the result has not all words in it.
                        // find another sentence that represents the missing other words
                        // and find recursively more sentences
                        maxLength = maxLength - result.length();
                        if (maxLength < 20) maxLength = 20;
-                        String nextSnippet = computeTextSnippet(os.values().iterator(), remaininghashes, minLength / 2, maxLength);
-                        if ((nextSnippet == null) || (nextSnippet.length() < (minLength / 2))) return null; // no success
-                        return result + (" / " + nextSnippet);
+                        tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength);
+                        String nextSnippet = (String) tsr[0];
+                        if (nextSnippet == null) return tsr;
+                        return new Object[]{result + (" / " + nextSnippet), tsr[1]};
                    } else {
                        // error
                        //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
@ -518,11 +525,12 @@ public class plasmaSnippetCache {
            return null;
        } catch (IndexOutOfBoundsException e) {
            log.logSevere("computeSnippet: error with string generation", e);
-            return "";
+            return new Object[]{null, queryhashes};
        }
    }
    
-    private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
+    private Object[] /*{String - the snippet, Set - remaining hashes}*/
+            computeTextSnippet(String sentence, Set queryhashes, int maxLength) {
        try {
            if (sentence == null) return null;
            if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@ -535,10 +543,13 @@ public class plasmaSnippetCache {
            j = queryhashes.iterator();
            Integer pos;
            int p, minpos = sentence.length(), maxpos = -1;
+            HashSet remainingHashes = new HashSet();
            while (j.hasNext()) {
                hash = (String) j.next();
                pos = (Integer) hs.get(hash);
-                if (pos != null) {
+                if (pos == null) {
+                    remainingHashes.add(hash);
+                } else {
                    p = pos.intValue();
                    if (p > maxpos) maxpos = p;
                    if (p < minpos) minpos = p;
@ -579,7 +590,7 @@ public class plasmaSnippetCache {
                // trim sentence, 3rd step (cut in the middle)
                sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
            }
-            return sentence;
+            return new Object[] {sentence, remainingHashes};
        } catch (IndexOutOfBoundsException e) {
            log.logSevere("computeSnippet: error with string generation", e);
            return null;
@ -838,46 +849,24 @@ public class plasmaSnippetCache {
        
        return result;
    }
-    /*
-    public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
-        // fetch snippets
-        int i = 0;
-        indexURLEntry urlentry;
-        String urlstring;
-        long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
-        while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
-            urlentry = acc.nextElement();
-            indexURLEntry.Components comp = urlentry.comp();
-            if (comp.url().getHost().endsWith(".yacyh")) continue;
-            urlstring = comp.url().toNormalform();
-            if ((urlstring.matches(urlmask)) &&
-                (!(existsInCache(comp.url(), queryhashes)))) {
-                new Fetcher(comp.url(), queryhashes, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), (int) maxTime).start();
-                i++;
-            }
-        }
-    }
-        
-    public class Fetcher extends Thread {
-        URL url;
-        Set queryhashes;
-        int timeout;
-        boolean pre;
-        public Fetcher(URL url, Set queryhashes, boolean pre, int timeout) {
-            if (url.getHost().endsWith(".yacyh")) return;
-            this.url = url;
-            this.queryhashes = queryhashes;
-            this.timeout = timeout;
-            this.pre = pre;
+    
+    public String failConsequences(TextSnippet snippet, Set queryhashes) { // cleans up index entries for a snippet whose retrieval failed and returns the snippet's error text
+        // problems with snippet fetch
+        String urlHash = plasmaURL.urlHash(snippet.getUrl());
+        String querystring = kelondroMSetTools.setToString(snippet.getRemainingHashes(), ' '); // used only in the ERROR_NO_MATCH log line below. NOTE(review): computed unconditionally; snippets built with a null hash set (the success paths of retrieveTextSnippet) would fail inside setToString - callers should pass failed snippets only
+        if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
+            (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
+            (snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
+            (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) { // the resource could not be loaded or parsed at all
+            log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError()); // NOTE(review): the message contains snippet.getError() twice
+            sb.wordIndex.loadedURL.remove(urlHash); // drop the URL entry itself ...
+            sb.wordIndex.removeHashReferences(queryhashes, urlHash); // ... and the references to it for every hash passed in 'queryhashes'
         }
-        public void run() {
-            log.logFine("snippetFetcher: try to get URL " + url);
-            plasmaSnippetCache.TextSnippet snippet = retrieveTextSnippet(url, queryhashes, true, pre, 260, timeout);
-            if (snippet.line == null)
-                log.logFine("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error);
-            else
-                log.logFine("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source);
+        if (snippet.getErrorCode() == ERROR_NO_MATCH) { // the document was loaded and parsed, but not all hashes were found in it
+            log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError());
+            sb.wordIndex.removeHashReferences(snippet.remaingHashes, urlHash); // removes references only for the snippet's remaining hashes; the URL entry is kept. Reads the field directly instead of getRemainingHashes()
         }
+        return snippet.getError(); // returned in every case, including error codes not handled above
     }
-    */
+    
 }
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -2741,7 +2741,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                        filename = comp.url().getFile();
                        if ((seed == null) || ((address = seed.getAddress()) == null)) {
                            // seed is not known from here
-                            wordIndex.removeReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash());
+                            wordIndex.removeWordReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash());
                            wordIndex.loadedURL.remove(urlentry.hash()); // clean up
                            continue; // next result
                        }
@ -2887,7 +2887,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            
            // delete all word references
            int count = 0;
-            if (words != null) count = wordIndex.removeReferences(words, urlhash);
+            if (words != null) count = wordIndex.removeWordReferences(words, urlhash);
            
            // finally delete the url entry itself
            wordIndex.loadedURL.remove(urlhash);
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -426,16 +426,26 @@ public final class plasmaWordIndex implements indexRI {
        return removed;
    }
    
-    public int removeReferences(Set words, String urlhash) {
+    public int removeWordReferences(Set words, String urlhash) { // 'words' holds plain words (cast to String below); each one is converted with plasmaCondenser.word2hash before its reference is removed
        // sequentially delete all word references
        // returns number of deletions
        Iterator iter = words.iterator();
-        String word;
        int count = 0;
        while (iter.hasNext()) {
-            word = (String) iter.next();
            // delete the URL reference in this word index
-            if (removeEntry(plasmaCondenser.word2hash(word), urlhash)) count++;
+            if (removeEntry(plasmaCondenser.word2hash((String) iter.next()), urlhash)) count++; // counted only when removeEntry returns true
+        }
+        return count;
+    }
+    
+    public int removeHashReferences(Set hashes, String urlhash) { // unlike removeWordReferences, the set elements are passed to removeEntry as they are (cast to String, no word2hash conversion)
+        // sequentially delete the references to urlhash for all given hashes
+        // returns number of deletions
+        Iterator iter = hashes.iterator();
+        int count = 0;
+        while (iter.hasNext()) {
+            // delete the URL reference in this word index
+            if (removeEntry((String) iter.next(), urlhash)) count++; // counted only when removeEntry returns true
        }
        return count;
    }