- added correct flagging of word properties

- added self-healing to the database in case wrong free-pointers exist
- added presentation of media links in snippets (does not yet work correctly)
- code cleanup

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3055 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 10d888e70c
commit bf0d820659

@@ -300,7 +300,7 @@ public class IndexControl_p {
                     "true".equalsIgnoreCase(gzipBody),
                     timeout);
             result = (String) resultObj.get("result");
-            prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
+            prop.put("result", (result == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries + " URL not found") : result);
             index = null;
         }

@@ -203,7 +203,7 @@ public final class transferRWI {
         }
         if (unknownURLs.length() > 0) { unknownURLs.delete(0, 1); }
         if ((wordhashes.length == 0) || (received == 0)) {
-            sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs");
+            sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs, blocked " + blocked + " RWIs");
         } else {
             final double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[received - 1])) / 2.0;
             sb.getLog().logInfo("Received " + received + " Entries " + wordc + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + "/" + receivedURL + " URLs, blocked " + blocked + " RWIs");

@@ -46,10 +46,12 @@
 // javac -classpath .:../classes transferRWI.java

 import java.io.IOException;
+import java.text.ParseException;

 import de.anomic.http.httpHeader;
 import de.anomic.index.indexURLEntry;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
@@ -59,10 +61,13 @@ import de.anomic.yacy.yacySeed;
 public final class transferURL {

     public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException {
         if (post == null || ss == null) { return null; }

         long start = System.currentTimeMillis();
+        long freshdate = 0;
+        try {freshdate = plasmaURL.shortDayFormatter.parse("20061101").getTime();} catch (ParseException e1) {}

         // return variable that accumulates replacements
         final plasmaSwitchboard sb = (plasmaSwitchboard) ss;
@@ -93,26 +98,39 @@ public final class transferURL {
             indexURLEntry lEntry;
             for (int i = 0; i < urlc; i++) {
                 serverCore.checkInterruption();

+                // read new lurl-entry
                 urls = (String) post.get("url" + i);
                 if (urls == null) {
                     yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
-                } else {
+                    continue;
+                }

+                // parse new lurl-entry
                 lEntry = sb.wordIndex.loadedURL.newEntry(urls);
                 if (lEntry == null) {
                     yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
-                    // TODO: should we send back an error message???
-                } else {
+                    continue;
+                }

+                // check if entry is well-formed
                 indexURLEntry.Components comp = lEntry.comp();
-                if (comp.url() == null) {
-                    yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
-                    // TODO: should we send back an error message???
-                } else {
+                if ((comp.url() == null) || (lEntry.freshdate().getTime() <= freshdate)) {
+                    yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
+                    continue;
+                }

+                // check if the entry is blacklisted
                 if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
                     int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
                     yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
                     lEntry = null;
                     blocked++;
-                } else try {
+                    continue;
+                }

+                // write entry to database
+                try {
                     sb.wordIndex.loadedURL.store(lEntry);
                     sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
                     yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
@@ -121,9 +139,6 @@ public final class transferURL {
                     e.printStackTrace();
                 }
             }
-                }
-            }
-        }
             yacyCore.seedDB.mySeed.incRU(received);
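Note on the new freshdate guard above: a minimal standalone sketch of the cutoff check, using plain JDK classes only; the "yyyyMMdd" pattern is an assumption about plasmaURL.shortDayFormatter, which is not shown in this diff.

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;

    public class FreshdateCutoffDemo {
        public static void main(String[] args) throws ParseException {
            // assumed to match the pattern of plasmaURL.shortDayFormatter
            SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
            long cutoff = shortDayFormatter.parse("20061101").getTime();
            Date freshdate = new Date(); // stands in for lEntry.freshdate()
            boolean rejected = freshdate.getTime() <= cutoff;
            System.out.println("entry rejected: " + rejected); // false for a current date
        }
    }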

@@ -117,7 +117,6 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
             int sizeOfPage,    // # of bytes of the page TODO: not needed any more
             long lastmodified, // last-modified time of the document where word appears
             long updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
-            int quality,       // the entropy value
             String language,   // (guessed) language of document
             char doctype,      // type of document
             int outlinksSame,  // outlinks to same domain

@@ -33,7 +33,6 @@ import de.anomic.kelondro.kelondroColumn;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.kelondro.kelondroRow.Entry;
 import de.anomic.plasma.plasmaSearchQuery;
-import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.plasmaWordIndex;
 import de.anomic.yacy.yacySeedDB;
@@ -66,7 +65,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
     private static final int col_hitcount = 3;
     private static final int col_language = 4;
     private static final int col_doctype = 5;
-    private static final int col_localflag = 6;
+    //private static final int col_localflag = 6;
     private static final int col_posintext = 7;
     private static final int col_posinphrase = 8;
     private static final int col_posofphrase = 9;
@@ -77,6 +76,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
     private kelondroRow.Entry entry;

+    /*
     public indexRWIEntryOld(String urlHash,
             int urlLength,     // byte-length of complete URL
             int urlComps,      // number of path components
@@ -91,7 +91,6 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
             int sizeOfPage,    // # of bytes of the page
             long lastmodified, //*last-modified time of the document where word appears
             long updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
-            int quality,       //*the entropy value
             String language,   //*(guessed) language of document
             char doctype,      //*type of document
             int outlinksSame,  // outlinks to same domain
@@ -107,7 +106,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
         if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
         this.entry = urlEntryRow.newEntry();
         this.entry.setCol(col_urlhash, urlHash, null);
-        this.entry.setCol(col_quality, quality);
+        this.entry.setCol(col_quality, 0);
         this.entry.setCol(col_lastModified, lastmodified);
         this.entry.setCol(col_hitcount, hitcount);
         this.entry.setCol(col_language, language, null);
@@ -121,7 +120,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
         this.entry.setCol(col_phrasecount, phrasecount);
         //System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
     }
+    */

     public indexRWIEntryOld(String urlHash, String code) {
         // the code is the external form of the row minus the leading urlHash entry
         this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());

@@ -24,7 +24,7 @@
 package de.anomic.kelondro;

-public class kelondroBitfield {
+public class kelondroBitfield implements Cloneable {

     // the bitfield implements a binary array. Such arrays may be exported in a base64-String
@@ -55,6 +55,12 @@ public class kelondroBitfield {
         }
     }

+    public Object clone() {
+        kelondroBitfield theClone = new kelondroBitfield(new byte[this.bb.length]);
+        System.arraycopy(this.bb, 0, theClone.bb, 0, this.bb.length);
+        return theClone;
+    }
+
     public void set(int pos, boolean value) {
         assert (pos >= 0);
         int slot = pos / 8;
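The new clone() exists so that flag templates can be copied per word without sharing the underlying byte buffer. A standalone sketch of that deep-copy semantics, using plain arrays rather than the kelondro API:

    public class BitfieldCloneDemo {
        public static void main(String[] args) {
            byte[] template = new byte[4];           // stands in for kelondroBitfield.bb
            byte[] copy = new byte[template.length]; // clone(): fresh buffer ...
            System.arraycopy(template, 0, copy, 0, template.length); // ... plus arraycopy
            copy[0] |= 1;                            // setting a bit in the copy
            System.out.println(template[0] + " / " + copy[0]); // "0 / 1": template untouched
        }
    }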

@@ -1392,7 +1392,7 @@ public class kelondroRecords {
                 USAGE.FREEC--;
                 // take link
                 if (USAGE.FREEH.index == NUL) {
-                    System.out.println("INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records. Affected file: " + filename);
+                    serverLog.logSevere("kelondroRecords/" + filename, "INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records.");
                     // try to heal..
                     USAGE.USEDC = USAGE.allCount() + 1;
                     USAGE.FREEC = 0;
@@ -1402,11 +1402,18 @@ public class kelondroRecords {
                     //System.out.println("*DEBUG* ALLOCATED DELETED INDEX " + index);
                     // check for valid seek position
                     long seekp = seekpos(USAGE.FREEH);
-                    if (seekp > entryFile.length()) throw new kelondroException("new Handle: seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
+                    if (seekp > entryFile.length()) {
+                        // this is a severe inconsistency. try to heal..
+                        serverLog.logSevere("kelondroRecords/" + filename, "new Handle: lost " + USAGE.FREEC + " marked nodes; seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
+                        index = USAGE.allCount();   // a place at the end of the file
+                        USAGE.USEDC += USAGE.FREEC; // to avoid that non-empty records at the end are overwritten
+                        USAGE.FREEC = 0;            // discard all possible empty nodes
+                        USAGE.FREEH.index = NUL;
+                    } else {
                         // read link to next element of FREEH chain
                         USAGE.FREEH.index = entryFile.readInt(seekp);
+                    }
                 }
                 USAGE.write();
             }
         }
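The healing strategy above: when the head of the free-record chain points past the end of the file, the whole chain is discarded and allocation falls back to appending, so no live record can be overwritten. A simplified sketch of the same idea with hypothetical names, an int array standing in for the record file:

    public class FreeListHealDemo {
        int[] next = new int[10]; // next[i]: following slot in the free chain; -1 ends it
        int freeHead = 42;        // corrupted head: points outside the "file"
        int used = 10;

        int allocate() {
            if (freeHead < 0 || freeHead >= next.length) {
                freeHead = -1;    // heal: discard the (possibly corrupt) free chain
                return used++;    // allocate at the end; nothing gets overwritten
            }
            int slot = freeHead;  // normal case: re-use a freed slot
            freeHead = next[slot];
            return slot;
        }

        public static void main(String[] args) {
            System.out.println(new FreeListHealDemo().allocate()); // 10: healed, not crashed
        }
    }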

@@ -49,7 +49,6 @@ import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -66,6 +65,8 @@ import java.util.TreeMap;
 import java.util.TreeSet;

 import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterImageEntry;
+import de.anomic.index.indexRWIEntryNew;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroMSetTools;
@@ -114,18 +115,121 @@ public final class plasmaCondenser {
     public int RESULT_NUMB_WORDS = -1;
     public int RESULT_DIFF_WORDS = -1;
     public int RESULT_SIMI_WORDS = -1;
-    public int RESULT_WORD_ENTROPHY = -1;
     public int RESULT_NUMB_SENTENCES = -1;
     public int RESULT_DIFF_SENTENCES = -1;
     public int RESULT_SIMI_SENTENCES = -1;
     public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);

-    public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException {
+    public plasmaCondenser(plasmaParserDocument document, boolean addMedia) throws UnsupportedEncodingException {
+        // if addMedia == true, then all the media links are also parsed and added to the words
+        // added media words are flagged with the appropriate media flag
         this(document.getText(), document.getCharset());

+        kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
+
+        // construct flag set for document
         if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
         if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
         if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
         if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
+
+        // the phrase counter:
+        // phrase 0 are words taken from the URL
+        // phrase 1 is the MainLongTitle
+        // phrase 2 is the MainShortTitle
+        // phrase 3 is the Document Abstract
+        // phrase 4 is the Document Author
+        // phrase 5 are the tags specified in document
+        // phrase 10 and above are the section headlines/titles (88 possible)
+        // phrase 98 is taken from the embedded anchor/hyperlinks description
+        // phrase 99 is taken from the media Link url and anchor description
+        // phrase 100 and above are lines from the text
+        insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
+        insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
+        insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
+        // missing: author!
+        // missing: tags!
+        String[] titles = document.getSectionTitles();
+        for (int i = 0; i < titles.length; i++) {
+            insertTextToWords(titles[i], i + 10, indexRWIEntryNew.flag_app_emphasized, wflags);
+        }
+
+        // anchors
+        Iterator i = document.getAnchors().entrySet().iterator();
+        Map.Entry entry;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 98, indexRWIEntryNew.flag_app_url, wflags);
+            insertTextToWords((String) entry.getValue(), 98, indexRWIEntryNew.flag_app_url, wflags);
+        }
+
+        // audio
+        i = document.getAudiolinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
+        }
+
+        // video
+        i = document.getVideolinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
+        }
+
+        // applications
+        i = document.getApplinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
+        }
+
+        // images
+        i = document.getImages().iterator();
+        htmlFilterImageEntry ientry;
+        while (i.hasNext()) {
+            ientry = (htmlFilterImageEntry) i.next();
+            insertTextToWords((String) ientry.url().toNormalform(), 99, flag_cat_hasimage, wflags);
+            insertTextToWords((String) ientry.alt(), 99, flag_cat_hasimage, wflags);
+        }
+
+        // finally check all words for missing flag entry
+        i = words.entrySet().iterator();
+        wordStatProp wprop;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            wprop = (wordStatProp) entry.getValue();
+            if (wprop.flags == null) {
+                wprop.flags = (kelondroBitfield) wflags.clone();
+                words.put(entry.getKey(), wprop);
+            }
+        }
+    }
+
+    private void insertTextToWords(String text, int phrase, int flagpos, kelondroBitfield flagstemplate) {
+        String word;
+        wordStatProp wprop;
+        sievedWordsEnum wordenum;
+        try {
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3);
+        } catch (UnsupportedEncodingException e) {
+            return;
+        }
+        int pip = 0;
+        while (wordenum.hasMoreElements()) {
+            word = ((String) wordenum.nextElement()).toLowerCase();
+            wprop = (wordStatProp) words.get(word);
+            if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
+            if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
+            wprop.numOfPhrase = 1;
+            wprop.posInPhrase = pip;
+            wprop.flags.set(flagpos, true);
+            words.put(word, wprop);
+            pip++;
+        }
     }

     public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
@@ -174,7 +278,7 @@ public final class plasmaCondenser {
     }

     public Map words() {
-        // returns the words as wod/wordStatProp relation map
+        // returns the words as word/wordStatProp relation map
         return words;
     }

@@ -183,9 +287,10 @@ public final class plasmaCondenser {
         public int count;       // number of occurrences
         public int posInText;   // unique handle, is initialized with word position (excluding double occurring words)
-        public int posInPhrase; //
-        public int numOfPhrase;
-        public HashSet hash;    //
+        public int posInPhrase; // position of word in phrase
+        public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
+        public HashSet hash;    // a set of handles to all sentences where this word appears
+        public kelondroBitfield flags; // the flag bits for each word

         public wordStatProp(int handle, int pip, int nop) {
             this.count = 1;
@@ -193,6 +298,7 @@ public final class plasmaCondenser {
             this.posInPhrase = pip;
             this.numOfPhrase = nop;
             this.hash = new HashSet();
+            this.flags = null;
         }

         public void inc() {
@@ -314,7 +420,7 @@ public final class plasmaCondenser {
         } else {
             // word does not yet exist, create new word entry
             wordHandle = wordHandleCount++;
-            wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
+            wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 100);
         }
         words.put(word, wsp);
         // we now have the unique handle of the word, put it into the sentence:
@@ -429,7 +535,6 @@ public final class plasmaCondenser {
         this.RESULT_NUMB_WORDS = allwordcounter;
         this.RESULT_DIFF_WORDS = wordHandleCount;
         this.RESULT_SIMI_WORDS = words.size();
-        this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter);
         this.RESULT_NUMB_SENTENCES = allsentencecounter;
         this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
         this.RESULT_SIMI_SENTENCES = sentences.size();
@@ -508,6 +613,7 @@ public final class plasmaCondenser {
         return orderedSentences;
     }

+    /*
     public void writeMapToFile(File out) throws IOException {
         Map.Entry entry;
         String k;
@@ -520,7 +626,7 @@ public final class plasmaCondenser {
         // we reconstruct the word hashtable
         // and sort the entries by the number of occurrences
         // this structure is needed to print out a sorted list of words
-        TreeMap sortedWords = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
+        TreeMap sortedWords = new TreeMap(); //kelondroNaturalOrder.naturalOrder
         it = words.entrySet().iterator(); // enumerates the keys in ascending order
         while (it.hasNext()) {
             entry = (Map.Entry) it.next();
@@ -549,7 +655,7 @@ public final class plasmaCondenser {
         }
         writer.close();
     }
+    */

     public final static boolean invisible(char c) {
         // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
         if ((c < ' ') || (c > 'z')) return true;
@@ -771,16 +877,22 @@ public final class plasmaCondenser {
     }

-    public static Map getWords(InputStream input, String charset) throws UnsupportedEncodingException {
-        if (input == null) return null;
-        plasmaCondenser condenser = new plasmaCondenser(input, charset);
-        return condenser.words;
-    }
-
     public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {
+        // returns a word/wordStatProp relation map
         if (text == null) return null;
         ByteArrayInputStream buffer = new ByteArrayInputStream(text);
-        return getWords(buffer, charset);
+        return new plasmaCondenser(buffer, charset, 2, 1).words();
+    }
+
+    public static Map getWords(String text) {
+        // returns a word/wordStatProp relation map
+        if (text == null) return null;
+        ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes());
+        try {
+            return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
+        } catch (UnsupportedEncodingException e) {
+            return null;
+        }
     }

     public static void main(String[] args) {
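How the word flagging fits together: every word gets a bitfield cloned from a document-level template, and words harvested from media links additionally get their category bit set. A standalone sketch with java.util.BitSet in place of kelondroBitfield; the bit position is hypothetical:

    import java.util.BitSet;
    import java.util.HashMap;
    import java.util.Map;

    public class WordFlagsDemo {
        static final int FLAG_CAT_HASAUDIO = 5; // hypothetical bit position

        public static void main(String[] args) {
            BitSet template = new BitSet(32); // document-level template, like wflags
            Map<String, BitSet> words = new HashMap<String, BitSet>();
            for (String w : "free music download".split(" ")) {
                BitSet flags = (BitSet) template.clone(); // clone per word, never share
                flags.set(FLAG_CAT_HASAUDIO);             // word came from an audio link
                words.put(w, flags);
            }
            System.out.println(words.get("music").get(FLAG_CAT_HASAUDIO)); // true
        }
    }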

@@ -220,6 +220,7 @@ public class plasmaParserDocument {

     public Map getAnchors() {
         // returns all links embedded as anchors (clickeable entities)
+        // this is a url(String)/text(String) map
         return anchors;
     }

@@ -52,6 +52,7 @@ import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Set;

 import de.anomic.http.httpHeader;
@@ -256,26 +257,37 @@ public class plasmaSnippetCache {
         }
         if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
-        //System.out.println("loaded document for URL " + url);
-        final Enumeration sentences = document.getSentences(pre);
-        document.close();
-        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
-        if (sentences == null) {
-            //System.out.println("found no sentences in url " + url);
-            return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
-        }

         /* ===========================================================================
          * COMPUTE SNIPPET
          * =========================================================================== */
         // we have found a parseable non-empty file: use the lines
-        line = computeSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
-        //System.out.println("loaded snippet for URL " + url + ": " + line);
+
+        // compute snippet from text
+        final Enumeration sentences = document.getSentences(pre);
+        if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
+        String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
+
+        // compute snippet from media
+        String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+        String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
+        String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
+        //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
+        //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+
+        line = "";
+        if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
+        if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
+        if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
+        //if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
+        if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
+
         if (line == null) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
         if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);

         // finally store this snippet in our own cache
         storeToCache(wordhashes, urlhash, line);
+
+        document.close();
         return new Snippet(line, source, null);
     }

@@ -366,7 +378,32 @@ public class plasmaSnippetCache {
         return (String) snippetsCache.get(key);
     }

-    private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
+    private String computeMediaSnippet(Map media, Set queryhashes) {
+        Iterator i = media.entrySet().iterator();
+        Map.Entry entry;
+        String url, desc;
+        Set s;
+        String result = "";
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            url = (String) entry.getKey();
+            desc = (String) entry.getValue();
+            s = removeAppearanceHashes(url, queryhashes);
+            if (s.size() == 0) {
+                result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
+                continue;
+            }
+            s = removeAppearanceHashes(desc, s);
+            if (s.size() == 0) {
+                result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
+                continue;
+            }
+        }
+        if (result.length() == 0) return null;
+        return result.substring(6);
+    }
+
+    private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
         try {
             if (sentences == null) return null;
             if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@@ -404,20 +441,43 @@ public class plasmaSnippetCache {
                     shortLineLength = ((String) sb.get(i)).length();
                 }
             }

             // find a first result
-            String result = (String) sb.get(shortLineIndex);
-            // remove all hashes that appear in the result
-            hs = hashSentence(result);
+            String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength);
+            Set remaininghashes = removeAppearanceHashes(result, queryhashes);
+            if (remaininghashes.size() == 0) return result;
+            // the result has not all words in it.
+            // find another sentence that represents the missing other words
+            // and find recursively more sentences
+            maxLength = maxLength - result.length();
+            if (maxLength < 20) maxLength = 20;
+            String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength);
+            if (nextSnippet == null) return null;
+            return result + (" / " + nextSnippet);
+        } catch (IndexOutOfBoundsException e) {
+            log.logSevere("computeSnippet: error with string generation", e);
+            return "";
+        }
+    }
+
+    private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
+        try {
+            if (sentence == null) return null;
+            if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
+            Iterator j;
+            HashMap hs;
+            String hash;
+
+            // find all hashes that appear in the sentence
+            hs = hashSentence(sentence);
             j = queryhashes.iterator();
             Integer pos;
-            Set remaininghashes = new HashSet();
-            int p, minpos = result.length(), maxpos = -1;
+            int p, minpos = sentence.length(), maxpos = -1;
             while (j.hasNext()) {
                 hash = (String) j.next();
                 pos = (Integer) hs.get(hash);
-                if (pos == null) {
-                    remaininghashes.add(new String(hash));
-                } else {
+                if (pos != null) {
                     p = pos.intValue();
                     if (p > maxpos) maxpos = p;
                     if (p < minpos) minpos = p;
@@ -425,51 +485,62 @@ public class plasmaSnippetCache {
                 }
             }

             // check result size
             maxpos = maxpos + 10;
-            if (maxpos > result.length()) maxpos = result.length();
+            if (maxpos > sentence.length()) maxpos = sentence.length();
             if (minpos < 0) minpos = 0;
             // we have a result, but is it short enough?
             if (maxpos - minpos + 10 > maxLength) {
                 // the string is too long, even if we cut at both ends
                 // so cut here in the middle of the string
-                int lenb = result.length();
-                result = result.substring(0, (minpos + 20 > result.length()) ? result.length() : minpos + 20).trim() +
+                int lenb = sentence.length();
+                sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
                 " [..] " +
-                result.substring((maxpos + 26 > result.length()) ? result.length() : maxpos + 26).trim();
-                maxpos = maxpos + lenb - result.length() + 6;
+                sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
+                maxpos = maxpos + lenb - sentence.length() + 6;
             }
             if (maxpos > maxLength) {
                 // the string is too long, even if we cut it at the end
                 // so cut it here at both ends at once
                 int newlen = maxpos - minpos + 10;
                 int around = (maxLength - newlen) / 2;
-                result = "[..] " + result.substring(minpos - around, ((maxpos + around) > result.length()) ? result.length() : (maxpos + around)).trim() + " [..]";
+                sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
                 minpos = around;
-                maxpos = result.length() - around - 5;
+                maxpos = sentence.length() - around - 5;
             }
-            if (result.length() > maxLength) {
-                // trim result, 1st step (cut at right side)
-                result = result.substring(0, maxpos).trim() + " [..]";
+            if (sentence.length() > maxLength) {
+                // trim sentence, 1st step (cut at right side)
+                sentence = sentence.substring(0, maxpos).trim() + " [..]";
             }
-            if (result.length() > maxLength) {
-                // trim result, 2nd step (cut at left side)
-                result = "[..] " + result.substring(minpos).trim();
+            if (sentence.length() > maxLength) {
+                // trim sentence, 2nd step (cut at left side)
+                sentence = "[..] " + sentence.substring(minpos).trim();
             }
-            if (result.length() > maxLength) {
-                // trim result, 3rd step (cut in the middle)
-                result = result.substring(6, 20).trim() + " [..] " + result.substring(result.length() - 26, result.length() - 6).trim();
+            if (sentence.length() > maxLength) {
+                // trim sentence, 3rd step (cut in the middle)
+                sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
             }
-            if (queryhashes.size() == 0) return result;
-            // the result has not all words in it.
-            // find another sentence that represents the missing other words
-            // and find recursively more sentences
-            maxLength = maxLength - result.length();
-            if (maxLength < 20) maxLength = 20;
-            String nextSnippet = computeSnippet(sentences, remaininghashes, minLength, maxLength);
-            return result + ((nextSnippet == null) ? "" : (" / " + nextSnippet));
+            return sentence;
         } catch (IndexOutOfBoundsException e) {
             log.logSevere("computeSnippet: error with string generation", e);
-            return "";
+            return null;
+        }
+    }
+
+    private Set removeAppearanceHashes(String sentence, Set queryhashes) {
+        // remove all hashes that appear in the sentence
+        if (sentence == null) return queryhashes;
+        HashMap hs = hashSentence(sentence);
+        Iterator j = queryhashes.iterator();
+        String hash;
+        Integer pos;
+        Set remaininghashes = new HashSet();
+        while (j.hasNext()) {
+            hash = (String) j.next();
+            pos = (Integer) hs.get(hash);
+            if (pos == null) {
+                remaininghashes.add(new String(hash));
+            }
         }
+        return remaininghashes;
     }

     private HashMap hashSentence(String sentence) {
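The snippet assembly above joins the media and text parts with "<br />" separators, skipping null parts and adding no separator before the first one. A self-contained sketch of that joining rule:

    public class SnippetJoinDemo {
        public static void main(String[] args) {
            String[] parts = { "<a href=\"http://example.net/a.mp3\">a.mp3</a>", null, "[..] text match [..]" };
            String line = "";
            for (String p : parts) {
                if (p != null) line += (line.length() == 0) ? p : "<br />" + p;
            }
            System.out.println(line); // audio link, then "<br />", then the text snippet
        }
    }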

@@ -1576,7 +1576,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                     checkInterruption();
                     log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
-                    plasmaCondenser condenser = new plasmaCondenser(document);
+                    plasmaCondenser condenser = new plasmaCondenser(document, true);

                     // generate citation reference
                     Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
@@ -1586,6 +1586,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                     checkInterruption();

                     // create a new loaded URL db entry
+                    long ldate = System.currentTimeMillis();
                     indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
                             entry.url(),        // URL
                             docDescription,     // document description
@@ -1594,7 +1595,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                             "",                 // ETag
                             docDate,            // modification date
                             new Date(),         // loaded date
-                            new Date(),         // freshdate
+                            new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
                             referrerUrlHash,    // referer hash
                             new byte[0],        // md5
                             (int) entry.size(), // size
@@ -1704,7 +1705,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                             newEntry.size(),
                             docDate.getTime(),
                             System.currentTimeMillis(),
-                            condenser.RESULT_WORD_ENTROPHY,
                             language,
                             doctype,
                             ioLinks[0].intValue(),
@@ -1749,7 +1749,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                         }
                         tmpContainers = null;
-                    }
+                    } //end: SEND PAGE INDEX TO STORAGE PEER
+
                     storageEndTime = System.currentTimeMillis();

                     //increment number of indexed urls
@@ -2253,7 +2254,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // get the word set
         Set words = null;
         try {
-            words = new plasmaCondenser(document).words().keySet();
+            words = new plasmaCondenser(document, true).words().keySet();
         } catch (UnsupportedEncodingException e) {
             e.printStackTrace();
         }
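Worked example of the freshdate computation above, the same heuristic a proxy uses for TTLs: a document that was last modified N days before loading is assumed to stay fresh for another N/2 days.

    public class FreshdateFormulaDemo {
        public static void main(String[] args) {
            long day = 24L * 60 * 60 * 1000;
            long ldate = System.currentTimeMillis();       // load time
            long docDate = ldate - 10 * day;               // last modified 10 days ago
            long freshdate = ldate + Math.max(0, ldate - docDate) / 2;
            System.out.println((freshdate - ldate) / day); // 5: fresh for five more days
        }
    }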

@@ -251,22 +251,21 @@ public final class plasmaWordIndex implements indexRI {
         // this is called by the switchboard to put in a new page into the index
         // use all the words in one condenser object to simultaneously create index entries

-        // iterate over all words
+        int wordCount = 0;
+        int urlLength = url.toString().length();
+        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
+
+        // iterate over all words of context text
         Iterator i = condenser.words().entrySet().iterator();
         Map.Entry wentry;
         String word;
         indexRWIEntry ientry;
         plasmaCondenser.wordStatProp wprop;
-        String wordHash;
-        int urlLength = url.toString().length();
-        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
         while (i.hasNext()) {
             wentry = (Map.Entry) i.next();
             word = (String) wentry.getKey();
             wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
-            // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
-            wordHash = plasmaCondenser.word2hash(word);
+            assert (wprop.flags != null);
             ientry = new indexRWIEntryNew(urlHash,
                     urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
                     wprop.count,
@@ -279,16 +278,15 @@ public final class plasmaWordIndex implements indexRI {
                     size,
                     urlModified.getTime(),
                     System.currentTimeMillis(),
-                    condenser.RESULT_WORD_ENTROPHY,
                     language,
                     doctype,
                     outlinksSame, outlinksOther,
-                    condenser.RESULT_FLAGS);
-            addEntry(wordHash, ientry, System.currentTimeMillis(), false);
+                    wprop.flags);
+            addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false);
+            wordCount++;
         }
-        // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
-        // condenser.getWords().size() + " words, flushed " + c + " entries");
-        return condenser.RESULT_SIMI_WORDS;
+
+        return wordCount;
     }

     public indexContainer getContainer(String wordHash, Set urlselection, long maxTime) {
