enhanced snippet-loading with threads

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@322 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 4afcf10158
commit 3addf58046

@ -128,6 +128,10 @@ public class CacheAdmin_p {
info += "<b>MEDIA:</b><br>" + formatAnchor(document.getMedialinks()) + "<br>";
info += "<b>EMAIL:</b><br>" + formatAnchor(document.getEmaillinks()) + "<br>";
info += "<b>TEXT:</b><br><span class=\"small\">" + new String(scraper.getText()) + "</span><br>";
info += "<b>LINES:</b><br><span class=\"small\">";
String[] sentences = document.getSentences();
// BUGFIX: the original appended the array reference ("info += sentences"),
// which renders as "[Ljava.lang.String;@..." instead of the sentence text;
// the element sentences[i] must be appended.
for (int i = 0; i < sentences.length; i++) info += sentences[i] + "<br>";
info += "</span><br>";
}
} catch (Exception e) {
info += e.toString();

@ -314,7 +314,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
}
// string conversions
private static String code_iso8859s(byte c) {
private static String code_iso8859s(int c) {
switch ((int) c & 0xff) {
// german umlaute and ligaturen
@ -361,7 +361,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
String z;
for (int i = 0; i < bb.length(); i++) {
b = bb.byteAt(i);
z = code_iso8859s(b);
z = code_iso8859s(b & 0xff);
if (z == null) t.append(b); else t.append(z);
}
return t;

@ -148,7 +148,7 @@ public class kelondroMSetTools {
}
// now the same for set-set
public static TreeSet joinConstructive(TreeSet set1, TreeSet set2) {
public static TreeSet joinConstructive(TreeSet set1, TreeSet set2) {
// comparators must be equal
if ((set1 == null) || (set2 == null)) return null;
if (set1.comparator() != set2.comparator()) return null;

@ -324,12 +324,12 @@ public final class plasmaSearch {
public class result /*implements Enumeration*/ {
// Fields of the result accumulator. The diff rendering interleaved the old
// (final) and new (non-final) declarations; only the post-change, non-final
// versions are kept, because cloneSmart() reassigns pageAcc/ref/results on
// the clone and therefore requires mutable fields.
TreeMap pageAcc;                 // key = order hash; value = plasmaLURL.entry
kelondroMScoreCluster ref;       // reference score computation for the commonSense heuristic
Set searchhashes;                // hashes that are searched here
Set stopwords;                   // words that are excluded from the commonSense heuristic
char[] order;                    // order of heuristics
ArrayList results;               // buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry objects
public result(Set searchhashes, Set stopwords, char[] order) {
this.pageAcc = new TreeMap();
@ -340,6 +340,15 @@ public final class plasmaSearch {
this.results = new ArrayList();
}
public result cloneSmart() {
    // Produce a shallow copy: only the top-level page accumulator is
    // duplicated; ref and results are shared with the original object.
    final result copy = new result(this.searchhashes, this.stopwords, this.order);
    copy.pageAcc = (TreeMap) this.pageAcc.clone();
    copy.ref = this.ref;
    copy.results = this.results;
    return copy;
}
public int sizeOrdered() {
    // Number of URL entries that have been ranked into the ordered accumulator.
    return this.pageAcc.size();
}

@ -59,6 +59,7 @@ public class plasmaSnippetCache {
// Provenance codes for a snippet; they are compared against result.source
// and must therefore be pairwise distinct.
// NOTE(review): the rendered diff showed all four constants as 0, which would
// make the source field meaningless — distinct values are restored here;
// confirm against the upstream revision.
public static final int SOURCE_CACHE = 0;
public static final int SOURCE_FILE  = 1;
public static final int SOURCE_WEB   = 2;
public static final int SOURCE_ERROR = 3;
private int snippetsScoreCounter;
@ -87,20 +88,26 @@ public class plasmaSnippetCache {
// Value holder for a snippet lookup. The diff rendering kept both the old
// two-argument and the new three-argument constructor; only the post-change
// version (with the error field) is kept here.
public class result {
    public String line;   // the snippet text, or null if none was found
    public String error;  // error description when line == null, else null
    public int source;    // one of the SOURCE_* provenance codes

    public result(String line, int source, String error) {
        this.line = line;
        this.source = source;
        this.error = error;
    }

    public String toString() {
        return line;
    }
}
public result retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
public boolean existsInCache(URL url, Set queryhashes) {
    // A snippet exists iff the cache already holds an entry for this
    // word-hash-set / URL-hash combination.
    final String wordhashes = yacySearch.set2string(queryhashes);
    final String urlhash = plasmaURL.urlHash(url);
    return (retrieveFromCache(wordhashes, urlhash) != null);
}
public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return null;
return new result(null, SOURCE_ERROR, "no query hashes given");
}
String urlhash = plasmaURL.urlHash(url);
@ -109,7 +116,7 @@ public class plasmaSnippetCache {
String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) {
//System.out.println("found snippet for url " + url + " in cache: " + line);
return new result(line, SOURCE_CACHE);
return new result(line, SOURCE_CACHE, null);
}
// if the snippet is not in the cache, we can try to get it from the htcache
@ -123,32 +130,32 @@ public class plasmaSnippetCache {
source = SOURCE_WEB;
}
} catch (IOException e) {
return null;
return new result(null, SOURCE_ERROR, "error loading resource from web: " + e.getMessage());
}
if (resource == null) {
//System.out.println("cannot load document for url " + url);
return null;
return new result(null, SOURCE_ERROR, "error loading resource from web, cacheManager returned NULL");
}
plasmaParserDocument document = parseDocument(url, resource);
if (document == null) return null; // cannot be parsed
if (document == null) return new result(null, SOURCE_ERROR, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for url " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
//System.out.println("found no sentences in url " + url);
return null;
return new result(null, SOURCE_ERROR, "parser returned no sentences");
}
// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120);
//System.out.println("loaded snippet for url " + url + ": " + line);
if (line == null) return null;
if (line == null) return new result(null, SOURCE_ERROR, "no matching snippet found");
if (line.length() > 120) line = line.substring(0, 120);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
return new result(line, source);
return new result(line, source, null);
}
public synchronized void storeToCache(String wordhashes, String urlhash, String snippet) {
@ -184,24 +191,50 @@ public class plasmaSnippetCache {
}
private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
if ((sentences == null) || (sentences.length == 0)) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator i;
Iterator j;
HashSet hs;
for (int j = 0; j < sentences.length; j++) {
if ((sentences[j].length() > minLength) && (sentences[j].length() < maxLength)) {
hs = hashSentence(sentences[j]);
i = queryhashes.iterator();
while (i.hasNext()) {
if (hs.contains((String) i.next())) hitTable.incScore(new Integer(j));
for (int i = 0; i < sentences.length; i++) {
if ((sentences[i].length() > minLength) && (sentences[i].length() < maxLength)) {
hs = hashSentence(sentences[i]);
j = queryhashes.iterator();
while (j.hasNext()) {
if (hs.contains((String) j.next())) hitTable.incScore(new Integer(i));
}
}
}
Integer maxLine = (Integer) hitTable.getMaxObject();
if (maxLine == null) return null;
if (hitTable.getScore(maxLine) == 0) return null;
return sentences[maxLine.intValue()];
int score = hitTable.getMaxScore(); // best number of hits
if (score <= 0) return null;
// we found (a) line(s) that have <score> hits.
// now find the shortest line of these hits
int shortLineIndex = -1;
int shortLineLength = Integer.MAX_VALUE;
for (int i = 0; i < sentences.length; i++) {
if ((hitTable.getScore(new Integer(i)) == score) &&
(sentences[i].length() < shortLineLength)) {
shortLineIndex = i;
shortLineLength = sentences[i].length();
}
}
// find a first result
String result = sentences[shortLineIndex];
if (score == queryhashes.size()) return result;
// the result has not all words in it.
// find another sentence that represents the missing other words
// first remove all words that appear in the result from the queryhashes
hs = hashSentence(result);
j = queryhashes.iterator();
while (j.hasNext()) {
if (hs.contains((String) j.next())) j.remove();
}
if (queryhashes.size() == 0) return result;
// now find recursively more sentences
String nextSnippet = computeSnippet(sentences, queryhashes, minLength, maxLength);
return result + ((nextSnippet == null) ? "" : (" ... " + nextSnippet));
}
private HashSet hashSentence(String sentence) {
HashSet set = new HashSet();
Enumeration words = plasmaCondenser.wordTokenizer(sentence);
@ -264,5 +297,4 @@ public class plasmaSnippetCache {
log);
}
}

@ -1136,12 +1136,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
char[] order;
String urlmask;
long time;
// The diff rendering kept both the old (fetchcount-only) and the new
// (searchcount + fetchcount) declarations and constructor signatures;
// only the post-change version is kept here.
int searchcount, fetchcount; // searchcount: ranking depth; fetchcount: snippets to prefetch
public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int searchcount, int fetchcount) {
    this.queryhashes = queryhashes;
    this.order = order;
    this.urlmask = urlmask;
    this.time = time;
    this.searchcount = searchcount;
    this.fetchcount = fetchcount;
}
public void run() {
@ -1150,26 +1151,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logDebug("presearch: started job");
plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, time);
log.logDebug("presearch: found " + idx.size() + " results");
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, fetchcount);
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, searchcount);
if (acc == null) return;
log.logDebug("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch");
// take some elements and fetch the snippets
int i = 0;
plasmaCrawlLURL.entry urlentry;
String urlstring;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < fetchcount)) {
urlentry = acc.nextElement();
if (urlentry.url().getHost().endsWith(".yacyh")) continue;
urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
if (urlstring.matches(urlmask)) { //.* is default
log.logDebug("presearch: fetching URL " + urlstring);
snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes);
if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet.line + "'");
i++;
}
}
fetchSnippets(acc, queryhashes, urlmask, fetchcount);
} catch (IOException e) {
e.printStackTrace();
}
@ -1177,6 +1164,42 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
public void fetchSnippets(plasmaSearch.result acc, Set queryhashes, String urlmask, int fetchcount) {
    // Kick off background snippet fetchers for up to fetchcount of the
    // ordered results; skips yacy pseudo-hosts, URLs not matching the mask,
    // and URLs whose snippet is already cached.
    int started = 0;
    while ((acc.hasMoreElements()) && (started < fetchcount)) {
        plasmaCrawlLURL.entry entry = acc.nextElement();
        if (entry.url().getHost().endsWith(".yacyh")) continue;
        String normalform = htmlFilterContentScraper.urlNormalform(entry.url());
        if ((normalform.matches(urlmask)) && // .* is default
            (!(snippetCache.existsInCache(entry.url(), queryhashes)))) {
            new snippetFetcher(entry.url(), queryhashes).start();
            started++;
        }
    }
}
public class snippetFetcher extends Thread {
URL url;
Set queryhashes;
public snippetFetcher(URL url, Set queryhashes) {
if (url.getHost().endsWith(".yacyh")) return;
this.url = url;
this.queryhashes = queryhashes;
}
public void run() {
log.logDebug("snippetFetcher: try to get URL " + url);
plasmaSnippetCache.result snippet = snippetCache.retrieve(url, queryhashes, true);
if (snippet.line == null)
log.logDebug("snippetFetcher: cannot get URL " + url + ". error: " + snippet.error);
else
log.logDebug("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source);
}
}
public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) {
serverObjects prop = new serverObjects();
@ -1199,11 +1222,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("INIT WORD SEARCH: " + gs + " - " + count + " links, " + (time / 1000) + " seconds");
long timestamp = System.currentTimeMillis();
// start a presearch, which makes only sense if we idle afterwards.
// this is especially the case if we start a global search and idle until search
if (global) {
// start a presearch, which makes only sense if we idle afterwards.
// this is especially the case if we start a global search and idle until search
// results appear from other peers
Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 5);
Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 10, 3);
preselect.start();
}
@ -1229,6 +1251,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (remainingTime < 500) remainingTime = 500;
if (remainingTime > 3000) remainingTime = 3000;
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, remainingTime, 10);
if (!(global)) fetchSnippets(acc.cloneSmart(), queryhashes, urlmask, 10);
log.logDebug("SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
// result is a List of urlEntry elements: prepare answer
@ -1289,8 +1312,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("results_" + i + "_urlname", urlname);
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
snippet = snippetCache.retrieve(url, false, queryhashes);
if ((snippet == null) || (snippet.line.length() < 10)) {
snippet = snippetCache.retrieve(url, queryhashes, false);
if (snippet.line == null) {
prop.put("results_" + i + "_snippet", 0);
prop.put("results_" + i + "_snippet_text", "");
} else {
@ -1366,8 +1389,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
snippet = snippetCache.retrieve(urlentry.url(), false, hashes);
if ((snippet == null) || (snippet.line.length() < 10)) {
snippet = snippetCache.retrieve(urlentry.url(), hashes, false);
if (snippet.line == null) {
resource = urlentry.toString();
} else {
resource = urlentry.toString(snippet.line);

Loading…
Cancel
Save