From d6c85228a669b816f3c965e269798f4e433aa02c Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Thu, 23 Jun 2005 12:12:12 +0000
Subject: [PATCH] enhanced snippet computation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@319 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 build.properties                              |   2 +-
 .../de/anomic/plasma/plasmaSnippetCache.java  | 202 ++++++++++--------
 .../de/anomic/plasma/plasmaSwitchboard.java   |  21 +-
 .../anomic/plasma/plasmaWordIndexCache.java   |   2 +-
 source/de/anomic/yacy/yacyClient.java         |   2 +-
 5 files changed, 128 insertions(+), 101 deletions(-)

diff --git a/build.properties b/build.properties
index 1d0c85525..f0152f69e 100644
--- a/build.properties
+++ b/build.properties
@@ -3,7 +3,7 @@ javacSource=1.4
 javacTarget=1.4
 
 # Release Configuration
-releaseVersion=0.383
+releaseVersion=0.384
 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 708927b3f..5d824c3bf 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -56,6 +56,10 @@ import de.anomic.yacy.yacySearch;
 public class plasmaSnippetCache {
 
     private static final int maxCache = 500;
+    public static final int SOURCE_CACHE = 0; // snippet or document was already available locally (snippet cache or cacheManager)
+    public static final int SOURCE_FILE = 1;  // NOTE(review): not referenced in the code visible here - intended meaning to be confirmed
+    public static final int SOURCE_WEB = 2;   // document had to be fetched from the web before the snippet could be computed
+    
     
     private int                   snippetsScoreCounter;
     private kelondroMScoreCluster snippetsScore;
@@ -81,8 +85,73 @@ public class plasmaSnippetCache {
         this.snippetsCache = new HashMap();        
     }
     
+    public static class result { // a snippet plus its origin; static because it uses nothing from the enclosing cache instance
+        public String line; // the snippet text
+        public int source;  // one of the SOURCE_* constants of plasmaSnippetCache
+        public result(String line, int source) {
+            this.line = line;
+            this.source = source;
+        }
+        public String toString() {
+            return line; // a result prints as its bare snippet text
+        }
+    }
     
-    public synchronized void store(String wordhashes, String urlhash, String snippet) {
+    public result retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
+        if (queryhashes.size() == 0) {
+            // without query word hashes there is nothing to match sentences against
+            return null;
+        }
+        String urlhash = plasmaURL.urlHash(url);
+        
+        // first look in the snippet cache, which is keyed by url hash plus query word hashes
+        String wordhashes = yacySearch.set2string(queryhashes);
+        String line = retrieveFromCache(wordhashes, urlhash);
+        if (line != null) {
+            // cache hit: no loading or parsing necessary
+            return new result(line, SOURCE_CACHE);
+        }
+        
+        // snippet cache miss: load the document via cacheManager; when absent and fetchOnline is set, fetch it from the web and load again
+        byte[] resource = null;
+        int source = SOURCE_CACHE;
+        try {
+            resource = cacheManager.loadResource(url);
+            if ((fetchOnline) && (resource == null)) {
+                loadResourceFromWeb(url, 5000);
+                resource = cacheManager.loadResource(url);
+                source = SOURCE_WEB;
+            }
+        } catch (IOException e) {
+            return null;
+        }
+        if (resource == null) {
+            // the document is not available from any permitted source
+            return null;
+        }
+        plasmaParserDocument document = parseDocument(url, resource);
+        
+        if (document == null) return null; // cannot be parsed
+        // the sentences of the parsed document are the candidates for the snippet
+        String[] sentences = document.getSentences();
+        // a null or empty sentence array means there is nothing to quote
+        if ((sentences == null) || (sentences.length == 0)) {
+            // give up, exactly as for an unloadable or unparseable document
+            return null;
+        }
+
+        // candidates must be longer than 12 characters per query word and shorter than 120 characters (see computeSnippet)
+        line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120);
+        // computeSnippet yields null when it found no sentence containing a query word
+        if (line == null) return null;
+        if (line.length() > 120) line = line.substring(0, 120);
+
+        // remember the snippet so that the same query for this url is answered from the snippet cache next time
+        storeToCache(wordhashes, urlhash, line);
+        return new result(line, source);
+    }
+    
+    public synchronized void storeToCache(String wordhashes, String urlhash, String snippet) {
         // generate key
         String key = urlhash + wordhashes;
 
@@ -108,83 +177,64 @@ public class plasmaSnippetCache {
         }
     }
     
-    private String retrieve(String wordhashes, String urlhash) {
+    private String retrieveFromCache(String wordhashes, String urlhash) {
         // generate key
         String key = urlhash + wordhashes;
         return (String) snippetsCache.get(key);
     }
     
-    public String retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
-        if (queryhashes.size() == 0) {
-            //System.out.println("found no queryhashes for url retrieve " + url);
-            return null;
-        }
-        String urlhash = plasmaURL.urlHash(url);
-        
-        // try to get snippet from snippetCache
-        String wordhashes = yacySearch.set2string(queryhashes);
-        String snippet = retrieve(wordhashes, urlhash);
-        if (snippet != null) {
-            //System.out.println("found snippet for url " + url + " in cache: " + snippet);
-            return snippet;
-        }
-        
-        // if the snippet is not in the cache, we can try to get it from the htcache
-        plasmaParserDocument document = getDocument(url, fetchOnline);
-        if (document == null) {
-            //System.out.println("cannot load document for url " + url);
-            return null;
-        }
-        //System.out.println("loaded document for url " + url);
-        String[] sentences = document.getSentences();
-        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
-        if ((sentences == null) || (sentences.length == 0)) {
-            //System.out.println("found no sentences in url " + url);
-            return null;
-        }
-
-        // we have found a parseable non-empty file: use the lines
-        TreeMap sentencematrix = hashMatrix(sentences);
-        Iterator i = queryhashes.iterator();
-        String hash;
+    private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) { // chooses the snippet sentence for a query; null if no candidate scores
         kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
-        Iterator j;
-        Integer sentencenumber;
-        Map.Entry entry;
-        while (i.hasNext()) {
-            hash = (String) i.next();
-            j = sentencematrix.entrySet().iterator();
-            while (j.hasNext()) {
-                entry = (Map.Entry) j.next();
-                sentencenumber = (Integer) entry.getKey();
-                if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
+        Iterator i;
+        HashSet hs;
+        for (int j = 0; j < sentences.length; j++) {
+            if ((sentences[j].length() > minLength) && (sentences[j].length() < maxLength)) { // both bounds are exclusive
+                hs = hashSentence(sentences[j]); // word hashes of this sentence
+                i = queryhashes.iterator();
+                while (i.hasNext()) {
+                    if (hs.contains((String) i.next())) hitTable.incScore(new Integer(j)); // one point per query hash found in sentence j
+                }
             }
         }
         Integer maxLine = (Integer) hitTable.getMaxObject();
         if (maxLine == null) return null;
-        snippet = sentences[maxLine.intValue()];
-        //System.out.println("loaded snippet for url " + url + ": " + snippet);
-        if (snippet.length() > 120) snippet = snippet.substring(0, 120);
-
-        // finally store this snippet in our own cache
-        store(wordhashes, urlhash, snippet);
-        return snippet;
+        if (hitTable.getScore(maxLine) == 0) return null; // NOTE(review): getMaxObject presumably returns the top-scored index - confirm in kelondroMScoreCluster
+        return sentences[maxLine.intValue()];
+    }
+   
+    // collects the word hash of every token that plasmaCondenser.wordTokenizer delivers for the sentence
+    private HashSet hashSentence(String sentence) {
+        final HashSet hashes = new HashSet();
+        for (Enumeration tokens = plasmaCondenser.wordTokenizer(sentence); tokens.hasMoreElements();) hashes.add(plasmaWordIndexEntry.word2hash((String) tokens.nextElement()));
+        return hashes;
     }
+     
+    public plasmaParserDocument parseDocument(URL url, byte[] resource) { // parses an already loaded resource; null if it is missing or of an unsupported type
+        if (resource == null) return null;
+        httpHeader header = null;
+        try {
+            header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
+        } catch (IOException e) {} // deliberately ignored: header stays null and the file extension decides below
         
-    private TreeMap hashMatrix(String[] sentences) {
-        TreeMap map = new TreeMap();
-        HashSet set;
-        Enumeration words;
-        for (int i = 0; i < sentences.length; i++) {
-            set = new HashSet();
-            words = plasmaCondenser.wordTokenizer(sentences[i]);
-            while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
-            map.put(new Integer(i), set);
+        if (header == null) { // no cached response header: decide by the extension of the url's file part
+            String filename = url.getFile();
+            int p = filename.lastIndexOf('.');
+            if ((p < 0) ||
+                ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
+                return parser.parseSource(url, "text/html", resource); // no extension or a supported one: parsed as text/html
+            } else {
+                return null;
+            }
+        } else { // header available: its mime type decides whether and how to parse
+            if (plasmaParser.supportedMimeTypesContains(header.mime())) {
+                return parser.parseSource(url, header.mime(), resource);
+            } else {
+                return null;
+            }
         }
-        return map;
     }
     
-    private byte[] getResource(URL url, boolean fetchOnline) {
+    public byte[] getResource(URL url, boolean fetchOnline) {
         // load the url as resource from the web
         try {
             //return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
@@ -214,29 +264,5 @@ public class plasmaSnippetCache {
             log);
     }
     
-    public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
-        byte[] resource = getResource(url, fetchOnline);
-        if (resource == null) return null;
-        httpHeader header = null;
-        try {
-            header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
-        } catch (IOException e) {}
-        
-        if (header == null) {
-            String filename = url.getFile();
-            int p = filename.lastIndexOf('.');
-            if ((p < 0) ||
-                ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
-                return parser.parseSource(url, "text/html", resource);
-            } else {
-                return null;
-            }
-        } else {
-            if (plasmaParser.supportedMimeTypesContains(header.mime())) {
-                return parser.parseSource(url, header.mime(), resource);
-            } else {
-                return null;
-            }
-        }
-    }
+
 }
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 213ba6097..a533d7766 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -585,7 +585,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     }
     
     public boolean coreCrawlJob() {
-        System.gc(); // debug
         if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
             //log.logDebug("CoreCrawl: queue is empty");
             return false;
@@ -1158,7 +1157,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 // take some elements and fetch the snippets
                 int i = 0;
                 plasmaCrawlLURL.entry urlentry;
-                String urlstring, snippet;
+                String urlstring;
+                plasmaSnippetCache.result snippet;
                 while ((acc.hasMoreElements()) && (i < fetchcount)) {
                     urlentry = acc.nextElement();
                     if (urlentry.url().getHost().endsWith(".yacyh")) continue;
@@ -1166,7 +1166,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                     if (urlstring.matches(urlmask)) { //.* is default
                         log.logDebug("presearch: fetching URL " + urlstring);
 			snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes);
-                        if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet + "'");
+                        if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet.line + "'");
                         i++;
                     }
                 }
@@ -1237,8 +1237,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 URL url;
                 plasmaCrawlLURL.entry urlentry;
                 String urlstring, urlname, filename;
-                String host, hash, address, snippet, descr = "";
+                String host, hash, address, descr = "";
                 yacySeed seed;
+                plasmaSnippetCache.result snippet;
                 //kelondroMScoreCluster ref = new kelondroMScoreCluster();
                 while ((acc.hasMoreElements()) && (i < count)) {
                     urlentry = acc.nextElement();
@@ -1284,12 +1285,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 			prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
                         prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
                         snippet = snippetCache.retrieve(url, false, queryhashes);
-                        if ((snippet == null) || (snippet.length() < 10)) {
+                        if ((snippet == null) || (snippet.line.length() < 10)) {
                             prop.put("results_" + i + "_snippet", 0);
                             prop.put("results_" + i + "_snippet_text", "");
                         } else {
                             prop.put("results_" + i + "_snippet", 1);
-                            prop.put("results_" + i + "_snippet_text", snippet);
+                            prop.put("results_" + i + "_snippet_text", snippet.line);
                         }
                         i++;
                     }
@@ -1357,14 +1358,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 String resource = "";
                 //plasmaIndexEntry pie;
                 plasmaCrawlLURL.entry urlentry;
-                String snippet;
+                plasmaSnippetCache.result snippet;
                 while ((acc.hasMoreElements()) && (i < count)) {
                     urlentry = acc.nextElement();
                     snippet = snippetCache.retrieve(urlentry.url(), false, hashes);
-                    if ((snippet == null) || (snippet.length() < 10)) {
+                    if ((snippet == null) || (snippet.line.length() < 10)) {
                         resource = urlentry.toString();
                     } else {
-                        resource = urlentry.toString(snippet);
+                        resource = urlentry.toString(snippet.line);
                     }
                     if (resource != null) {
                         links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString);
@@ -1433,7 +1434,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         if (url == null) return 0;
         // get set of words
         //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
-        Set words = plasmaCondenser.getWords(snippetCache.getDocument(url, fetchOnline).getText());
+        Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
         // delete all word references
         int count = removeReferences(urlhash, words);
         // finally delete the url entry itself
diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java
index e68b01cdf..2a655d241 100644
--- a/source/de/anomic/plasma/plasmaWordIndexCache.java
+++ b/source/de/anomic/plasma/plasmaWordIndexCache.java
@@ -209,7 +209,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
                     addEntry(wordHash, wordEntry, creationTime);
                     urlCount++;
                     // protect against memory shortage
-                    while (rt.freeMemory() < 1000000) {flushFromMem(); System.gc();}
+                    while (rt.freeMemory() < 1000000) flushFromMem();
                     // write a log
                     if (System.currentTimeMillis() > messageTime) {
                         System.gc(); // for better statistic
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 703a2d22c..fe098a4fb 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -329,7 +329,7 @@ public class yacyClient {
                     // we don't store the snippets along the url entry, because they are search-specific.
                     // instead, they are placed in a snipped-search cache.
                     //System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
-                    snippets.store(wordhashes, link.hash(), link.snippet());
+                    snippets.storeToCache(wordhashes, link.hash(), link.snippet());
                 }
                 // add the url entry to the word indexes
                 for (int m = 0; m < words; m++) {