added snippet-routines (not yet finished)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@218 6c8d7289-2bf4-0310-a012-ef5d649a1542
20 years ago · ca3b4ccaf4
parent 9244b6ad6f
commit ca3b4ccaf4
8 changed files with 376 additions and 159 deletions
--- a/htroot/Status.java
+++ b/htroot/Status.java
@ -181,7 +181,7 @@ public class Status {
        try {
            long mins = Long.parseLong(minsAsString);
            
-            StringBuilder uptime = new StringBuilder();
+            StringBuffer uptime = new StringBuffer();
            
            int uptimeDays  = (int) (Math.floor(mins/1440));
            int uptimeHours = (int) (Math.floor(mins/60)%24);
@ -189,7 +189,7 @@ public class Status {
            
            uptime.append(uptimeDays)
                  .append(((uptimeDays == 1)?" day ":" days "))
-                   .append((uptimeHours < 10)?"0":"")
+                  .append((uptimeHours < 10)?"0":"")
                  .append(uptimeHours)
                  .append(":")
                  .append((uptimeMins < 10)?"0":"")
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@ -333,6 +333,7 @@ public class plasmaCrawlLURL extends plasmaURL {
 	private char   doctype;
 	private long   size;
 	private int    wordCount;
+        private String snippet;

 	public entry(URL url, String descr, Date moddate, Date loaddate,
 		     String referrerHash, int copyCount, boolean localNeed,
@ -351,6 +352,7 @@ public class plasmaCrawlLURL extends plasmaURL {
 	    this.doctype = doctype;
 	    this.size = size;
 	    this.wordCount = wordCount;
+            this.snippet = null;
 	    store();
 	}

@ -378,6 +380,7 @@ public class plasmaCrawlLURL extends plasmaURL {
 		    this.doctype = (char) entry[10][0];
 		    this.size = (long) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[11]));
 		    this.wordCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[12]));
+                    this.snippet = null;
 		    return;
 		}
 	    } catch (Exception e) {
@ -409,6 +412,8 @@ public class plasmaCrawlLURL extends plasmaURL {
 		    this.doctype = prop.getProperty("dt", "t").charAt(0);
 		    this.size = Long.parseLong(prop.getProperty("size", "0"));
 		    this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
+                    this.snippet = prop.getProperty("snippet", "");
+                    if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
 		    store();
 		    //}
 	    } catch (Exception e) {
@ -507,6 +512,12 @@ public class plasmaCrawlLURL extends plasmaURL {
 	    return wordCount;
 	}

+        /**
+         * Returns the snippet that belongs to this url entry.
+         *
+         * A snippet may be present if the url was transported in a remote
+         * search. It is not saved anywhere and can only be requested here.
+         *
+         * @return the snippet text, or null if this entry carries no snippet
+         */
+        public String snippet() {
+            return this.snippet;
+        }
+        
        private String corePropList() {
            // generate a parseable string; this is a simple property-list
            try {
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@ -83,7 +83,7 @@ public final class plasmaHTCache {
    public  long currCacheSize;
    public  long maxCacheSize;
    private long lastAcc;
-    private final File cachePath;
+    public  final File cachePath;
    public  static serverLog log;

    public static final int CACHE_UNFILLED          = 0; // default case without assignment
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@ -0,0 +1,234 @@
+// plasmaSnippetCache.java
+// -----------------------
+// part of YaCy
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2005
+// last major change: 07.06.2005
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+
+package de.anomic.plasma;
+
+import java.util.*;
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.kelondro.kelondroMScoreCluster;
+import de.anomic.server.serverFileUtils;
+import de.anomic.server.serverLog;
+import de.anomic.http.httpHeader;
+import de.anomic.yacy.yacySearch;
+
+/**
+ * Cache for the text snippets that are shown together with search results.
+ *
+ * A snippet is stored under the key (url hash + word hashes). Snippets are
+ * either handed in through store() or computed in retrieve() from a document
+ * that is read from the HTCACHE and, if requested, loaded from the web first.
+ * At most maxCache snippets are kept; if that number is exceeded, the entries
+ * with the lowest insertion counter are removed first.
+ *
+ * All accesses to the snippet map are synchronized on this object.
+ */
+public class plasmaSnippetCache {
+
+    // maximum number of snippets that are held in the cache
+    private static final int maxCache = 500;
+    // sentences longer than this number of characters are never used as snippet
+    private static final int maxSnippetLength = 140;
+    // socket timeout that is handed to plasmaCrawlWorker.load when a resource is fetched online
+    private static final int fetchTimeout = 5000;
+
+    private int                   snippetsScoreCounter; // insertion counter; used as score, so the lowest score is the oldest entry
+    private kelondroMScoreCluster snippetsScore;        // key -> insertion counter
+    private HashMap               snippetsCache;        // key -> snippet text
+    private plasmaHTCache         cacheManager;
+    private plasmaParser          parser;
+    private serverLog             log;
+    private String                remoteProxyHost;
+    private int                   remoteProxyPort;
+    private boolean               remoteProxyUse;
+
+    /**
+     * Creates an empty snippet cache.
+     *
+     * @param cacheManager    the HTCACHE manager; its cachePath is read to find cached resources
+     * @param parser          parser used to turn cached resources into documents
+     * @param remoteProxyHost proxy host, passed through to plasmaCrawlWorker.load
+     * @param remoteProxyPort proxy port, passed through to plasmaCrawlWorker.load
+     * @param remoteProxyUse  proxy switch, passed through to plasmaCrawlWorker.load
+     * @param log             logger, passed through to plasmaCrawlWorker.load
+     */
+    public plasmaSnippetCache(plasmaHTCache cacheManager, plasmaParser parser,
+                              String remoteProxyHost, int remoteProxyPort, boolean remoteProxyUse,
+                              serverLog log) {
+        this.cacheManager = cacheManager;
+        this.parser = parser;
+        this.log = log;
+        this.remoteProxyHost = remoteProxyHost;
+        this.remoteProxyPort = remoteProxyPort;
+        this.remoteProxyUse = remoteProxyUse;
+        this.snippetsScoreCounter = 0;
+        this.snippetsScore = new kelondroMScoreCluster();
+        this.snippetsCache = new HashMap();
+    }
+
+    /**
+     * Stores a snippet for the given combination of word hashes and url hash.
+     * A snippet that is already known for this combination is not replaced.
+     *
+     * @param wordhashes the query word hashes as one string
+     * @param urlhash    the hash of the url the snippet was taken from
+     * @param snippet    the snippet text; null is ignored
+     */
+    public synchronized void store(String wordhashes, String urlhash, String snippet) {
+        // a null snippet carries no information; caching it would only
+        // prevent that a real snippet is stored for the same key later
+        if (snippet == null) return;
+
+        // generate key
+        String key = urlhash + wordhashes;
+
+        // do nothing if snippet is known
+        if (snippetsCache.containsKey(key)) return;
+
+        // care for counter: start with an empty cache before the counter overflows.
+        // this is done before the insert, so the new snippet survives the reset
+        if (snippetsScoreCounter == java.lang.Integer.MAX_VALUE) {
+            snippetsScoreCounter = 0;
+            snippetsScore = new kelondroMScoreCluster();
+            snippetsCache = new HashMap();
+        }
+
+        // learn new snippet
+        snippetsScore.addScore(key, snippetsScoreCounter++);
+        snippetsCache.put(key, snippet);
+
+        // flush cache if cache is full: remove the entries with the lowest counter first
+        String oldest;
+        while (snippetsCache.size() > maxCache) {
+            oldest = (String) snippetsScore.getMinObject();
+            if (oldest == null) break; // nothing left to evict; do not loop forever
+            snippetsScore.deleteScore(oldest);
+            snippetsCache.remove(oldest);
+        }
+    }
+
+    /**
+     * Looks up a snippet in the cache only.
+     * Synchronized because store() modifies the same map under the same lock.
+     *
+     * @return the cached snippet or null if none is cached for this key
+     */
+    private synchronized String retrieve(String wordhashes, String urlhash) {
+        // generate key
+        String key = urlhash + wordhashes;
+        return (String) snippetsCache.get(key);
+    }
+
+    /**
+     * Returns a snippet for the given url and query.
+     * The snippet cache is asked first. If nothing is cached, the document is
+     * taken from the HTCACHE (and loaded from the web before, if fetchOnline
+     * is set), and the best matching sentence that is not longer than
+     * maxSnippetLength characters is selected, stored in the cache and returned.
+     *
+     * @param url            the url of the document
+     * @param fetchOnline    if true, a document that is not in the HTCACHE is loaded from the web
+     * @param query          set of query words, or of word hashes if queryAreHashes is true
+     * @param queryAreHashes true if query already contains word hashes
+     * @return the snippet, or null if url or query is missing, the document is
+     *         not available or not parseable, or no suitable sentence contains a query word
+     */
+    public String retrieve(java.net.URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
+        if ((url == null) || (query == null) || (query.size() == 0)) return null;
+        if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
+        String urlhash = plasmaURL.urlHash(url);
+
+        // try to get snippet from snippetCache
+        String wordhashes = yacySearch.set2string(query);
+        String snippet = retrieve(wordhashes, urlhash);
+        if (snippet != null) return snippet;
+
+        // if the snippet is not in the cache, we can try to get it from the htcache
+        plasmaParserDocument document = getDocument(url, fetchOnline);
+        if (document == null) return null;
+        String[] sentences = document.getSentences();
+        if ((sentences == null) || (sentences.length == 0)) return null;
+
+        // we have found a parseable non-empty file: use the lines.
+        // every sentence gets its length added to its score once for each query hash it contains
+        TreeMap sentencematrix = hashMatrix(sentences);
+        kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
+        Iterator hashes = query.iterator();
+        Iterator rows;
+        String hash;
+        Integer sentencenumber;
+        Map.Entry entry;
+        while (hashes.hasNext()) {
+            hash = (String) hashes.next();
+            rows = sentencematrix.entrySet().iterator();
+            while (rows.hasNext()) {
+                entry = (Map.Entry) rows.next();
+                if (!(((HashSet) entry.getValue()).contains(hash))) continue;
+                sentencenumber = (Integer) entry.getKey();
+                hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
+            }
+        }
+        Integer maxLine = (Integer) hitTable.getMaxObject();
+        if (maxLine == null) return null;
+        // no length check needed here: hashMatrix did not admit sentences that are too long
+        snippet = sentences[maxLine.intValue()];
+
+        // finally store this snippet in our own cache
+        store(wordhashes, urlhash, snippet);
+        return snippet;
+    }
+
+    /**
+     * Computes the word hashes of every sentence that may be used as snippet.
+     * Sentences that are null or longer than maxSnippetLength get no entry;
+     * so a sentence that is too long cannot win against a shorter matching one.
+     *
+     * @return map from sentence index (Integer) to the HashSet of word hashes of that sentence
+     */
+    private TreeMap hashMatrix(String[] sentences) {
+        TreeMap map = new TreeMap();
+        HashSet set;
+        Enumeration words;
+        for (int i = 0; i < sentences.length; i++) {
+            if ((sentences[i] == null) || (sentences[i].length() > maxSnippetLength)) continue;
+            set = new HashSet();
+            words = plasmaCondenser.wordTokenizer(sentences[i]);
+            while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
+            map.put(new Integer(i), set);
+        }
+        return map;
+    }
+
+    /**
+     * Reads the resource for the url from the cache. If it is not there and
+     * fetchOnline is set, the resource is loaded from the web and the cache is read again.
+     *
+     * @return the content of the cache file, or null if it is not available or loading failed
+     */
+    private byte[] getResource(URL url, boolean fetchOnline) {
+        try {
+            byte[] resource = getResourceFromCache(url);
+            if ((fetchOnline) && (resource == null)) {
+                loadResourceFromWeb(url, fetchTimeout);
+                resource = getResourceFromCache(url);
+            }
+            return resource;
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Reads the cache file that belongs to the url.
+     *
+     * @return the file content, or null if the file does not exist or cannot be read
+     */
+    private byte[] getResourceFromCache(URL url) {
+        // the file path below the cache path is the normal form of the url without its first 6 characters
+        String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
+        File f = new File(cacheManager.cachePath, path);
+        if (!(f.exists())) return null;
+        try {
+            return serverFileUtils.read(f);
+        } catch (IOException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Loads the url through plasmaCrawlWorker.load, using the proxy settings,
+     * cache manager and log of this object.
+     */
+    private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
+        plasmaCrawlWorker.load(
+            url,
+            null,
+            null,
+            0,
+            null,
+            socketTimeout,
+            remoteProxyHost,
+            remoteProxyPort,
+            remoteProxyUse,
+            cacheManager,
+            log);
+    }
+
+    /**
+     * Gets the resource for the url and parses it.
+     *
+     * @param url         the url of the document
+     * @param fetchOnline if true, a resource that is not in the cache is loaded from the web
+     * @return the parsed document, or null if the resource or its cached response
+     *         header is not available or the mime type is not supported by the parser
+     */
+    public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
+        byte[] resource = getResource(url, fetchOnline);
+        if (resource == null) return null;
+        httpHeader header = null;
+        try {
+            header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
+        } catch (IOException e) {
+            return null;
+        }
+        if (header == null) return null;
+        if (!(plasmaParser.supportedMimeTypesContains(header.mime()))) return null;
+        return parser.parseSource(url, header.mime(), resource);
+    }
+}
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -137,6 +137,7 @@ import de.anomic.server.serverCodings;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverDate;
 import de.anomic.server.serverFileUtils;
+import de.anomic.server.serverThread;
 import de.anomic.server.serverInstantThread;
 import de.anomic.server.serverLog;
 import de.anomic.server.serverObjects;
@ -170,6 +171,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
    public  plasmaWordIndex        wordIndex;
    public  plasmaSearch           searchManager;
    public  plasmaHTCache          cacheManager;
+    public  plasmaSnippetCache     snippetCache;
    public  plasmaCrawlLoader      cacheLoader;
    public  LinkedList             processStack = new LinkedList();
    public  messageBoard           messageDB;
@ -216,6 +218,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            remoteProxyPort = 0;
        }
        
+        
        if (!(listsPath.exists())) listsPath.mkdirs();
        
 	// load coloured lists
@ -317,6 +320,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        long[] testresult = facilityDB.selectLong("statistik", "yyyyMMddHHm");
        testresult = facilityDB.selectLong("statistik", (new serverDate()).toShortString(false).substring(0, 11));
        
+        
+        // generate snippets cache
+        log.logSystem("Initializing Snippet Cache");
+        snippetCache = new plasmaSnippetCache(cacheManager, parser,
+                                              remoteProxyHost, remoteProxyPort, remoteProxyUse,
+                                              log);
+        
        // start yacy core
        log.logSystem("Starting YaCy Protocol Core");
        yacyCore yc = new yacyCore(this);
@ -328,6 +338,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                     new serverInstantThread(this, "cleanupJob", "cleanupJobSize"), 10000); // all 5 Minutes
        deployThread("80_dequeue", "Indexing Dequeue", "thread that creates database entries from scraped web content and performes indexing" ,
                     new serverInstantThread(this, "deQueue", "queueSize"), 10000);
+        setConfig("81_dequeue_idlesleep" , getConfig("80_dequeue_idlesleep", ""));
+        setConfig("81_dequeue_busysleep" , getConfig("80_dequeue_busysleep", ""));
+        deployThread("81_dequeue", "Indexing Dequeue (second job, test run)", "thread that creates database entries from scraped web content and performes indexing" ,
+                     new serverInstantThread(this, "deQueue", "queueSize"), 11000);
        deployThread("70_cachemanager", "Proxy Cache Enqueue", "job takes new proxy files from RAM stack, stores them, and hands over to the Indexing Stack",
                     new serverInstantThread(cacheManager, "job", "size"), 10000);
        deployThread("62_remotetriggeredcrawl", "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer",
@ -986,36 +1000,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        if (u == null) return plasmaURL.dummyHash; else return u.toString();
    }
    
-    /*
-    private void processCrawlingX(plasmaCrawlNURL.entry urlEntry, String initiator) {
-        if (urlEntry.url() == null) return;
-        String profileHandle = urlEntry.profileHandle();
-        //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
-        plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
-        if (profile == null) {
-            log.logError("CRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
-            return;
-        }
-        log.logDebug("plasmaSwitchboard.processCrawling: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + 
-		     ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
-		     ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
-
-        boolean tryRemote = 
-            (profile.remoteIndexing()) &&
-            (urlEntry.depth() == profile.generalDepth()) && 
-            (urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash)))  &&
-            ((yacyCore.seedDB.mySeed.isSenior()) ||
-             (yacyCore.seedDB.mySeed.isPrincipal())) ;
-                
-        if (tryRemote) {
-            boolean success = processRemoteCrawlTrigger(urlEntry);
-            if (!(success)) processLocalCrawling(urlEntry, profile);
-        } else {
-            processLocalCrawling(urlEntry, profile);
-        }
-    }
-    */
-    
    private boolean processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile) {
        // work off one Crawl stack entry
        if ((urlEntry == null) && (urlEntry.url() == null)) {
@ -1118,6 +1102,42 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 	if (date == null) return ""; else return DateFormatter.format(date);
    }
    
+    /**
+     * Thread that does a local search for the given query hashes and then
+     * calls snippetCache.retrieve with fetchOnline = true for the first results
+     * (at most 3) whose normalized url matches the url mask.
+     * Results on hosts that end with ".yacyh" are skipped.
+     */
+    public class presearch extends Thread {
+        Set queryhashes; // hashes of the query words
+        char[] order;    // ordering, handed to searchManager.order
+        String urlmask;  // regular expression that a result url must match
+        long time;       // time for search and ordering, in milliseconds
+
+        public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask) {
+            this.time = time;
+            this.urlmask = urlmask;
+            this.order = order;
+            this.queryhashes = queryhashes;
+        }
+
+        public void run() {
+            try {
+                // search the database locally and bring the hits into order
+                plasmaSearch.result acc = searchManager.order(
+                        searchManager.searchHashes(queryhashes, time),
+                        queryhashes, stopwords, order, time, 3);
+                if (acc == null) return;
+
+                // take some elements and fetch the snippets
+                int fetched = 0;
+                plasmaCrawlLURL.entry hit;
+                while ((acc.hasMoreElements()) && (fetched < 3)) {
+                    hit = acc.nextElement();
+                    if (hit.url().getHost().endsWith(".yacyh")) continue;
+                    // .* is the default url mask
+                    if (!(htmlFilterContentScraper.urlNormalform(hit.url()).matches(urlmask))) continue;
+                    snippetCache.retrieve(hit.url(), true, queryhashes, true);
+                    fetched++;
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+    }
    
    public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) {
        
@ -1141,6 +1161,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            log.logInfo("INIT WORD SEARCH: " + gs + " - " + count + " links, " + (time / 1000) + " seconds");
            long timestamp = System.currentTimeMillis();
            
+            //Thread preselect = new presearch(querywords, order, time / 10, urlmask);
+            //preselect.start();
+            
            // do global fetching
            int globalresults = 0;
            if (global) {
@ -1148,7 +1171,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                int fetchpeers = ((int) time / 1000) * 3; // number of target peers; means 30 peers in 10 seconds
                long fetchtime = time * 7 / 10;           // time to waste
                if (fetchcount > count) fetchcount = count;
-                globalresults = yacySearch.search(querywords, loadedURL, searchManager, fetchcount, fetchpeers, fetchtime);
+                globalresults = yacySearch.searchHashes(queryhashes, loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime);
                log.logDebug("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
            }
            prop.put("globalresults", globalresults); // the result are written to the local DB
@ -1156,7 +1179,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            
            // now search locally (the global results should be now in the local db)
            long remainingTime = time - (System.currentTimeMillis() - timestamp);
-            plasmaWordIndexEntity idx = searchManager.searchWords(querywords, remainingTime * 8 / 10); // the search
+            plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, remainingTime * 8 / 10); // the search
            log.logDebug("SEARCH TIME AFTER FINDING " + idx.size() + " ELEMENTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
            
            remainingTime = time - (System.currentTimeMillis() - timestamp);
@ -1176,10 +1199,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                URL url;
                plasmaCrawlLURL.entry urlentry;
                String urlstring, urlname, filename;
-                String host, hash;
-                String descr = "";
+                String host, hash, address, snippet, descr = "";
                yacySeed seed;
-                String address;
                //kelondroMScoreCluster ref = new kelondroMScoreCluster();
                while ((acc.hasMoreElements()) && (i < count)) {
                    urlentry = acc.nextElement();
@ -1218,14 +1239,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                         */
                    //addScoreForked(ref, gs, descr.split(" "));
                    //addScoreForked(ref, gs, urlstring.split("/"));
-                    String snippet;
                    if (urlstring.matches(urlmask)) { //.* is default
 			prop.put("results_" + i + "_description", descr);
 			prop.put("results_" + i + "_url", urlstring); 
 			prop.put("results_" + i + "_urlname", urlname); 
 			prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
                        prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
-                        snippet = getSnippet(url, false, querywords, false);
+                        snippet = snippetCache.retrieve(url, false, querywords, false);
                        if ((snippet == null) || (snippet.length() < 10)) {
                            prop.put("results_" + i + "_snippet", 0);
                            prop.put("results_" + i + "_snippet_text", "");
@ -1302,7 +1322,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                String snippet;
                while ((acc.hasMoreElements()) && (i < count)) {
                    urlentry = acc.nextElement();
-                    snippet = getSnippet(urlentry.url(), false, hashes, true);
+                    snippet = snippetCache.retrieve(urlentry.url(), false, hashes, true);
                    if ((snippet == null) || (snippet.length() < 10)) {
                        resource = urlentry.toString();
                    } else {
@ -1375,7 +1395,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        if (url == null) return 0;
        // get set of words
        //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
-        Set words = plasmaCondenser.getWords(getDocument(url, fetchOnline).getText());
+        Set words = plasmaCondenser.getWords(snippetCache.getDocument(url, fetchOnline).getText());
        // delete all word references
        int count = removeReferences(urlhash, words);
        // finally delete the url entry itself
@ -1401,112 +1421,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        }
        return count;
    }
-    
-    private byte[] getResource(URL url, boolean fetchOnline) {
-        // load the url as resource from the web
-        try {
-            //return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
-            byte[] resource = getResourceFromCache(url);
-            if ((fetchOnline) && (resource == null)) {
-                loadResourceFromWeb(url, 5000);
-                resource = getResourceFromCache(url);
-            }
-            return resource;
-        } catch (IOException e) {
-            return null;
-        }
-    }
-    
-    private byte[] getResourceFromCache(URL url) {
-        // load the url as resource from the cache
-        String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
-        File cache = new File(getRootPath(), getConfig("proxyCache", "DATA/HTCACHE"));
-        File f = new File(cache, path);
-        if (f.exists()) try {
-            return serverFileUtils.read(f);
-        } catch (IOException e) {
-            return null;
-        } else {
-            return null;
-        }
-    }
-    
-    private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
-        plasmaCrawlWorker.load(
-            url, 
-            null, 
-            null, 
-            0, 
-            null,
-            socketTimeout,
-            remoteProxyHost,
-            remoteProxyPort,
-            remoteProxyUse,
-            cacheManager,
-            log);
-    }
-    
-    private plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
-        byte[] resource = getResource(url, fetchOnline);
-        if (resource == null) return null;
-        httpHeader header = null;
-        try {
-            header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
-        } catch (IOException e) {
-            return null;
-        }
-        if (header == null) return null;
-        if (plasmaParser.supportedMimeTypesContains(header.mime())) {
-            return parser.parseSource(url, header.mime(), resource);
-        } else {
-            return null;
-        }
-    }
-    
-    private String getSnippet(URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
-        if (query.size() == 0) return null;
-        plasmaParserDocument document = getDocument(url, fetchOnline);
-        if (document == null) return null;
-        String[] sentences = document.getSentences();
-        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
-        if ((sentences == null) || (sentences.length == 0)) return null;
-        TreeMap sentencematrix = hashMatrix(sentences);
-        if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
-        Iterator i = query.iterator();
-        String hash;
-        kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
-        Iterator j;
-        Integer sentencenumber;
-        Map.Entry entry;
-        while (i.hasNext()) {
-            hash = (String) i.next();
-            j = sentencematrix.entrySet().iterator();
-            while (j.hasNext()) {
-                entry = (Map.Entry) j.next();
-                sentencenumber = (Integer) entry.getKey();
-                if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
-            }
-        }
-        Integer maxLine = (Integer) hitTable.getMaxObject();
-        if (maxLine == null) return null;
-        String snippet = sentences[maxLine.intValue()];
-        if (snippet.length() > 140) return null;
-        return snippet;
-    }
-        
-    private TreeMap hashMatrix(String[] sentences) {
-        TreeMap map = new TreeMap();
-        HashSet set;
-        Enumeration words;
-        for (int i = 0; i < sentences.length; i++) {
-            set = new HashSet();
-            words = plasmaCondenser.wordTokenizer(sentences[i]);
-            while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
-            map.put(new Integer(i), set);
-        }
-        return map;
-    }
-    
+
    public class distributeIndex {
        // distributes parts of the index to other peers
        // stops as soon as an error occurrs
--- a/source/de/anomic/server/serverInstantThread.java
+++ b/source/de/anomic/server/serverInstantThread.java
@ -1,3 +1,42 @@
+// serverInstantThread.java
+// -----------------------
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2005
+// last major change: 14.03.2005
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.

 package de.anomic.server;

--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@ -56,6 +56,7 @@ import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaWordIndexEntity;
 import de.anomic.plasma.plasmaWordIndexEntry;
 import de.anomic.plasma.plasmaWordIndexEntryContainer;
+import de.anomic.plasma.plasmaSnippetCache;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
 import de.anomic.tools.crypt;
@ -215,8 +216,9 @@ public class yacyClient {
    }
    
    public static int search(String wordhashes, int count, boolean global,
-    yacySeed targetPeer, plasmaCrawlLURL urlManager, plasmaSearch searchManager,
-    long duetime) {
+                             yacySeed targetPeer, plasmaCrawlLURL urlManager,
+                             plasmaSearch searchManager, plasmaSnippetCache snippets,
+                             long duetime) {
        // send a search request to peer with remote Hash
        // this mainly converts the words into word hashes
        
@ -294,10 +296,19 @@ public class yacyClient {
            
            // insert results to containers
            for (int n = 0; n < results; n++) {
+                // get one single search result
                link = urlManager.newEntry((String) result.get("resource" + n), true, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
+                // save the url entry
                plasmaWordIndexEntry entry = new plasmaWordIndexEntry(link.hash(), link.wordCount(), 0, 0, 0,
                                                                      plasmaSearch.calcVirtualAge(link.moddate()), link.quality(),
                                                                      link.language(), link.doctype(), false);
+                if (link.snippet() != null) {
+                    // we don't store the snippets along with the url entry, because they are search-specific.
+                    // instead, they are placed in a snippet-search cache.
+                    //System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
+                    snippets.store(wordhashes, link.hash(), link.snippet());
+                }
+                // add the url entry to the word indexes
                for (int m = 0; m < words; m++) {
                    container[m].add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis());
                }
--- a/source/de/anomic/yacy/yacySearch.java
+++ b/source/de/anomic/yacy/yacySearch.java
@ -45,6 +45,7 @@ import java.util.Iterator;
 import java.util.Set;

 import de.anomic.kelondro.kelondroMScoreCluster;
+import de.anomic.plasma.plasmaSnippetCache;
 import de.anomic.plasma.plasmaCrawlLURL;
 import de.anomic.plasma.plasmaSearch;

@ -55,28 +56,27 @@ public class yacySearch extends Thread {
    private boolean global;
    private plasmaCrawlLURL urlManager;
    private plasmaSearch searchManager;
+    private plasmaSnippetCache snippetCache;
    private yacySeed targetPeer;
    private int links;
    private long duetime;

    public yacySearch(Set wordhashes, int count, boolean global, yacySeed targetPeer,
-		      plasmaCrawlLURL urlManager, plasmaSearch searchManager, long duetime) {
+		      plasmaCrawlLURL urlManager, plasmaSearch searchManager, plasmaSnippetCache snippetCache, long duetime) { // snippetCache is the new argument, inserted before duetime
        super("yacySearch_" + targetPeer.getName());
        this.wordhashes = wordhashes;
        this.count = count;
        this.global = global;
        this.urlManager = urlManager;
        this.searchManager = searchManager;
+        this.snippetCache = snippetCache; // kept so that run() can hand it on to yacyClient.search()
        this.targetPeer = targetPeer;
        this.links = -1;
        this.duetime = duetime;
    }

    public void run() {
-        String wh = "";
-        Iterator i = wordhashes.iterator();
-        while (i.hasNext()) wh = wh + (String) i.next();
-        this.links = yacyClient.search(wh, count, global, targetPeer, urlManager, searchManager, duetime);
+        this.links = yacyClient.search(set2string(wordhashes), count, global, targetPeer, urlManager, searchManager, snippetCache, duetime);
        if (links != 0) {
            //yacyCore.log.logInfo("REMOTE SEARCH - remote peer '" + targetPeer.get("Name", "anonymous") + "' contributed " + links + " links for word hash " + wordhashes);
            yacyCore.seedDB.mySeed.incRI(links);
@ -84,6 +84,13 @@ public class yacySearch extends Thread {
        }
    }
    
+    public static String set2string(Set hashes) { // joins all elements, in the set's iteration order, without any separator
+        StringBuffer wh = new StringBuffer(); // one growing buffer instead of a fresh String per element
+        Iterator i = hashes.iterator();
+        while (i.hasNext()) wh.append((String) i.next()); // elements must be Strings, the cast is unchanged
+        return wh.toString(); // "" for an empty set
+    }
+    
    public int links() { // value returned by yacyClient.search() once run() has stored it; -1 before that
        return this.links;
    }
@ -119,8 +126,8 @@ public class yacySearch extends Thread {
 	return result;
    }
    
-    public static int search(Set querywords, plasmaCrawlLURL urlManager, plasmaSearch searchManager,
-			     int count, int targets, long waitingtime) {
+    public static int searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaSearch searchManager,
+			     int count, int targets, plasmaSnippetCache snippetCache, long waitingtime) {
        // check own peer status
        if ((yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.getAddress() == null)) return 0;
        
@ -132,7 +139,7 @@ public class yacySearch extends Thread {
        if (duetime < 1000) duetime = 1000;
        
        // prepare seed targets and threads
-        Set wordhashes = plasmaSearch.words2hashes(querywords);
+        //Set wordhashes = plasmaSearch.words2hashes(querywords);
        yacySeed[] targetPeers = selectPeers(wordhashes, targets);
        if (targetPeers == null) return 0;
        targets = targetPeers.length;
@ -140,7 +147,7 @@ public class yacySearch extends Thread {
        yacySearch[] searchThreads = new yacySearch[targets];
        for (int i = 0; i < targets; i++) {
            searchThreads[i]= new yacySearch(wordhashes, count, true, targetPeers[i],
-                    urlManager, searchManager, duetime);
+                    urlManager, searchManager, snippetCache, duetime);
            searchThreads[i].start();
            try {Thread.currentThread().sleep(20);} catch (InterruptedException e) {}
            if ((System.currentTimeMillis() - start) > waitingtime) {