enhanced snippet computation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@319 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent d53b2393e5
commit d6c85228a6

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.383
releaseVersion=0.384
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -56,6 +56,10 @@ import de.anomic.yacy.yacySearch;
public class plasmaSnippetCache {
private static final int maxCache = 500;
public static final int SOURCE_CACHE = 0;
public static final int SOURCE_FILE = 0;
public static final int SOURCE_WEB = 0;
private int snippetsScoreCounter;
private kelondroMScoreCluster snippetsScore;
@ -81,8 +85,73 @@ public class plasmaSnippetCache {
this.snippetsCache = new HashMap();
}
public class result {
    // the snippet text that is shown to the user
    public String line;
    // where the snippet came from: SOURCE_CACHE, SOURCE_FILE or SOURCE_WEB
    public int source;

    public result(String snippetLine, int snippetSource) {
        line = snippetLine;
        source = snippetSource;
    }

    // the textual representation of a result is just its snippet line
    public String toString() {
        return line;
    }
}
public synchronized void store(String wordhashes, String urlhash, String snippet) {
public result retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return null;
}
String urlhash = plasmaURL.urlHash(url);
// try to get snippet from snippetCache
String wordhashes = yacySearch.set2string(queryhashes);
String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) {
//System.out.println("found snippet for url " + url + " in cache: " + line);
return new result(line, SOURCE_CACHE);
}
// if the snippet is not in the cache, we can try to get it from the htcache
byte[] resource = null;
int source = SOURCE_CACHE;
try {
resource = cacheManager.loadResource(url);
if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, 5000);
resource = cacheManager.loadResource(url);
source = SOURCE_WEB;
}
} catch (IOException e) {
return null;
}
if (resource == null) {
//System.out.println("cannot load document for url " + url);
return null;
}
plasmaParserDocument document = parseDocument(url, resource);
if (document == null) return null; // cannot be parsed
//System.out.println("loaded document for url " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
//System.out.println("found no sentences in url " + url);
return null;
}
// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120);
//System.out.println("loaded snippet for url " + url + ": " + line);
if (line == null) return null;
if (line.length() > 120) line = line.substring(0, 120);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
return new result(line, source);
}
public synchronized void storeToCache(String wordhashes, String urlhash, String snippet) {
// generate key
String key = urlhash + wordhashes;
@ -108,83 +177,64 @@ public class plasmaSnippetCache {
}
}
private String retrieve(String wordhashes, String urlhash) {
private String retrieveFromCache(String wordhashes, String urlhash) {
// generate key
String key = urlhash + wordhashes;
return (String) snippetsCache.get(key);
}
public String retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return null;
}
String urlhash = plasmaURL.urlHash(url);
// try to get snippet from snippetCache
String wordhashes = yacySearch.set2string(queryhashes);
String snippet = retrieve(wordhashes, urlhash);
if (snippet != null) {
//System.out.println("found snippet for url " + url + " in cache: " + snippet);
return snippet;
}
// if the snippet is not in the cache, we can try to get it from the htcache
plasmaParserDocument document = getDocument(url, fetchOnline);
if (document == null) {
//System.out.println("cannot load document for url " + url);
return null;
}
//System.out.println("loaded document for url " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
//System.out.println("found no sentences in url " + url);
return null;
}
// we have found a parseable non-empty file: use the lines
TreeMap sentencematrix = hashMatrix(sentences);
Iterator i = queryhashes.iterator();
String hash;
private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
Integer sentencenumber;
Map.Entry entry;
while (i.hasNext()) {
hash = (String) i.next();
j = sentencematrix.entrySet().iterator();
while (j.hasNext()) {
entry = (Map.Entry) j.next();
sentencenumber = (Integer) entry.getKey();
if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
Iterator i;
HashSet hs;
for (int j = 0; j < sentences.length; j++) {
if ((sentences[j].length() > minLength) && (sentences[j].length() < maxLength)) {
hs = hashSentence(sentences[j]);
i = queryhashes.iterator();
while (i.hasNext()) {
if (hs.contains((String) i.next())) hitTable.incScore(new Integer(j));
}
}
}
Integer maxLine = (Integer) hitTable.getMaxObject();
if (maxLine == null) return null;
snippet = sentences[maxLine.intValue()];
//System.out.println("loaded snippet for url " + url + ": " + snippet);
if (snippet.length() > 120) snippet = snippet.substring(0, 120);
// finally store this snippet in our own cache
store(wordhashes, urlhash, snippet);
return snippet;
if (hitTable.getScore(maxLine) == 0) return null;
return sentences[maxLine.intValue()];
}
private HashSet hashSentence(String sentence) {
HashSet set = new HashSet();
Enumeration words = plasmaCondenser.wordTokenizer(sentence);
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
return set;
}
public plasmaParserDocument parseDocument(URL url, byte[] resource) {
if (resource == null) return null;
httpHeader header = null;
try {
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
} catch (IOException e) {}
private TreeMap hashMatrix(String[] sentences) {
TreeMap map = new TreeMap();
HashSet set;
Enumeration words;
for (int i = 0; i < sentences.length; i++) {
set = new HashSet();
words = plasmaCondenser.wordTokenizer(sentences[i]);
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
map.put(new Integer(i), set);
if (header == null) {
String filename = url.getFile();
int p = filename.lastIndexOf('.');
if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
return parser.parseSource(url, "text/html", resource);
} else {
return null;
}
} else {
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return parser.parseSource(url, header.mime(), resource);
} else {
return null;
}
}
return map;
}
private byte[] getResource(URL url, boolean fetchOnline) {
public byte[] getResource(URL url, boolean fetchOnline) {
// load the url as resource from the web
try {
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
@ -214,29 +264,5 @@ public class plasmaSnippetCache {
log);
}
public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
byte[] resource = getResource(url, fetchOnline);
if (resource == null) return null;
httpHeader header = null;
try {
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
} catch (IOException e) {}
if (header == null) {
String filename = url.getFile();
int p = filename.lastIndexOf('.');
if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
return parser.parseSource(url, "text/html", resource);
} else {
return null;
}
} else {
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return parser.parseSource(url, header.mime(), resource);
} else {
return null;
}
}
}
}

@ -585,7 +585,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public boolean coreCrawlJob() {
System.gc(); // debug
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
@ -1158,7 +1157,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// take some elements and fetch the snippets
int i = 0;
plasmaCrawlLURL.entry urlentry;
String urlstring, snippet;
String urlstring;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < fetchcount)) {
urlentry = acc.nextElement();
if (urlentry.url().getHost().endsWith(".yacyh")) continue;
@ -1166,7 +1166,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (urlstring.matches(urlmask)) { //.* is default
log.logDebug("presearch: fetching URL " + urlstring);
snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes);
if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet + "'");
if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet.line + "'");
i++;
}
}
@ -1237,8 +1237,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL url;
plasmaCrawlLURL.entry urlentry;
String urlstring, urlname, filename;
String host, hash, address, snippet, descr = "";
String host, hash, address, descr = "";
yacySeed seed;
plasmaSnippetCache.result snippet;
//kelondroMScoreCluster ref = new kelondroMScoreCluster();
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
@ -1284,12 +1285,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
snippet = snippetCache.retrieve(url, false, queryhashes);
if ((snippet == null) || (snippet.length() < 10)) {
if ((snippet == null) || (snippet.line.length() < 10)) {
prop.put("results_" + i + "_snippet", 0);
prop.put("results_" + i + "_snippet_text", "");
} else {
prop.put("results_" + i + "_snippet", 1);
prop.put("results_" + i + "_snippet_text", snippet);
prop.put("results_" + i + "_snippet_text", snippet.line);
}
i++;
}
@ -1357,14 +1358,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURL.entry urlentry;
String snippet;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
snippet = snippetCache.retrieve(urlentry.url(), false, hashes);
if ((snippet == null) || (snippet.length() < 10)) {
if ((snippet == null) || (snippet.line.length() < 10)) {
resource = urlentry.toString();
} else {
resource = urlentry.toString(snippet);
resource = urlentry.toString(snippet.line);
}
if (resource != null) {
links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString);
@ -1433,7 +1434,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (url == null) return 0;
// get set of words
//Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Set words = plasmaCondenser.getWords(snippetCache.getDocument(url, fetchOnline).getText());
Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
// delete all word references
int count = removeReferences(urlhash, words);
// finally delete the url entry itself

@ -209,7 +209,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
addEntry(wordHash, wordEntry, creationTime);
urlCount++;
// protect against memory shortage
while (rt.freeMemory() < 1000000) {flushFromMem(); System.gc();}
while (rt.freeMemory() < 1000000) flushFromMem();
// write a log
if (System.currentTimeMillis() > messageTime) {
System.gc(); // for better statistic

@ -329,7 +329,7 @@ public class yacyClient {
// we don't store the snippets along the url entry, because they are search-specific.
// instead, they are placed in a snipped-search cache.
//System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
snippets.store(wordhashes, link.hash(), link.snippet());
snippets.storeToCache(wordhashes, link.hash(), link.snippet());
}
// add the url entry to the word indexes
for (int m = 0; m < words; m++) {

Loading…
Cancel
Save