- added correct flagging of word properties

- added self-healing to database in case that wrong free-pointers exist - added presentation of media links in snippets (does not yet work correctly) - code cleanup git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3055 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · bf0d820659
parent 10d888e70c
commit bf0d820659
12 changed files with 342 additions and 133 deletions
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@ -300,7 +300,7 @@ public class IndexControl_p {
                         "true".equalsIgnoreCase(gzipBody),
                         timeout);
            result = (String) resultObj.get("result");
-            prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
+            prop.put("result", (result == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries + " URL not found") : result);
            index = null;
        }

--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@ -203,7 +203,7 @@ public final class transferRWI {
            }
            if (unknownURLs.length() > 0) { unknownURLs.delete(0, 1); }
            if ((wordhashes.length == 0) || (received == 0)) {
-                sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs");
+                sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs, blocked " + blocked + " RWIs");
            } else {
                final double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[received - 1])) / 2.0;
                sb.getLog().logInfo("Received " + received + " Entries " + wordc + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + "/" + receivedURL + " URLs, blocked " + blocked + " RWIs");
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@ -46,10 +46,12 @@
 // javac -classpath .:../classes transferRWI.java

 import java.io.IOException;
+import java.text.ParseException;

 import de.anomic.http.httpHeader;
 import de.anomic.index.indexURLEntry;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
@ -59,11 +61,14 @@ import de.anomic.yacy.yacySeed;

 public final class transferURL {

+    
    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException {
        if (post == null || ss == null) { return null; }

        long start = System.currentTimeMillis();
-
+        long freshdate = 0;
+        try {freshdate = plasmaURL.shortDayFormatter.parse("20061101").getTime();} catch (ParseException e1) {}
+        
        // return variable that accumulates replacements
        final plasmaSwitchboard sb = (plasmaSwitchboard) ss;
        final serverObjects prop = new serverObjects();
@ -93,35 +98,45 @@ public final class transferURL {
            indexURLEntry lEntry;
            for (int i = 0; i < urlc; i++) {
                serverCore.checkInterruption();
+                
+                // read new lurl-entry
                urls = (String) post.get("url" + i);
                if (urls == null) {
                    yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
-                } else {
-                    lEntry = sb.wordIndex.loadedURL.newEntry(urls);
-                    if (lEntry == null) {
-                        yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
-                        // TODO: should we send back an error message???
-                    } else {
-                        indexURLEntry.Components comp = lEntry.comp();
-                        if (comp.url() == null) {
-                            yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
-                            // TODO: should we send back an error message???
-                        } else {
-                            if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
-                                int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
-                                yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
-                                lEntry = null;
-                                blocked++;
-                            } else try {
-                                sb.wordIndex.loadedURL.store(lEntry);
-                                sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
-                                yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
-                                received++;
-                            } catch (IOException e) {
-                                    e.printStackTrace();
-                            }
-                        }
-                    }
+                    continue;
+                }
+
+                // parse new lurl-entry
+                lEntry = sb.wordIndex.loadedURL.newEntry(urls);
+                if (lEntry == null) {
+                    yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
+                    continue;
+                }
+                
+                // check if entry is well-formed
+                indexURLEntry.Components comp = lEntry.comp();
+                if ((comp.url() == null) || (lEntry.freshdate().getTime() <= freshdate)) {
+                    yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
+                    continue;
+                }
+                
+                // check if the entry is blacklisted
+                if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
+                    int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
+                    yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
+                    lEntry = null;
+                    blocked++;
+                    continue;
+                }
+                
+                // write entry to database
+                try {
+                    sb.wordIndex.loadedURL.store(lEntry);
+                    sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
+                    yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
+                    received++;
+                } catch (IOException e) {
+                    e.printStackTrace();
                }
            }

--- a/source/de/anomic/index/indexRWIEntryNew.java
+++ b/source/de/anomic/index/indexRWIEntryNew.java
@ -117,7 +117,6 @@ public class indexRWIEntryNew  implements Cloneable, indexRWIEntry {
            int      sizeOfPage,    // # of bytes of the page TODO: not needed any more
            long     lastmodified,  // last-modified time of the document where word appears
            long     updatetime,    // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
-            int      quality,       // the entropy value
            String   language,      // (guessed) language of document
            char     doctype,       // type of document
            int      outlinksSame,  // outlinks to same domain
--- a/source/de/anomic/index/indexRWIEntryOld.java
+++ b/source/de/anomic/index/indexRWIEntryOld.java
@ -33,7 +33,6 @@ import de.anomic.kelondro.kelondroColumn;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.kelondro.kelondroRow.Entry;
 import de.anomic.plasma.plasmaSearchQuery;
-import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.plasmaWordIndex;
 import de.anomic.yacy.yacySeedDB;

@ -66,7 +65,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
    private static final int col_hitcount     =  3;
    private static final int col_language     =  4;
    private static final int col_doctype      =  5;
-    private static final int col_localflag    =  6;
+    //private static final int col_localflag    =  6;
    private static final int col_posintext    =  7;
    private static final int col_posinphrase  =  8;
    private static final int col_posofphrase  =  9;
@ -77,6 +76,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
    
    private kelondroRow.Entry entry;
    
+    /*
    public indexRWIEntryOld(String  urlHash,
            int     urlLength,    // byte-length of complete URL
            int     urlComps,     // number of path components
@ -91,7 +91,6 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
            int     sizeOfPage,   // # of bytes of the page
            long    lastmodified, //*last-modified time of the document where word appears
            long    updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
-            int     quality,      //*the entropy value
            String  language,     //*(guessed) language of document
            char    doctype,      //*type of document
            int     outlinksSame, // outlinks to same domain
@ -107,7 +106,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
        if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
        this.entry = urlEntryRow.newEntry();
        this.entry.setCol(col_urlhash, urlHash, null);
-        this.entry.setCol(col_quality, quality);
+        this.entry.setCol(col_quality, 0);
        this.entry.setCol(col_lastModified, lastmodified);
        this.entry.setCol(col_hitcount, hitcount);
        this.entry.setCol(col_language, language, null);
@ -121,7 +120,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
        this.entry.setCol(col_phrasecount, phrasecount);
        //System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
    }
-
+*/
    public indexRWIEntryOld(String urlHash, String code) {
        // the code is the external form of the row minus the leading urlHash entry
        this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
--- a/source/de/anomic/kelondro/kelondroBitfield.java
+++ b/source/de/anomic/kelondro/kelondroBitfield.java
@ -24,7 +24,7 @@

 package de.anomic.kelondro;

-public class kelondroBitfield {
+public class kelondroBitfield implements Cloneable {

    // the bitfield implements a binary array. Such arrays may be exported in a base64-String
    
@ -55,6 +55,12 @@ public class kelondroBitfield {
        }
    }
    
+    public Object clone() { // returns a new kelondroBitfield that carries a copy of the bytes in this.bb
+        kelondroBitfield theClone = new kelondroBitfield(new byte[this.bb.length]); // NOTE(review): assumes the byte[] constructor yields a bb of at least this.bb.length bytes - confirm
+        System.arraycopy(this.bb, 0, theClone.bb, 0, this.bb.length); // copy the content, so the clone does not share the array of this object
+        return theClone;
+    }
+    
    public void set(int pos, boolean value) {
        assert (pos >= 0);
        int slot = pos / 8;
--- a/source/de/anomic/kelondro/kelondroRecords.java
+++ b/source/de/anomic/kelondro/kelondroRecords.java
@ -1392,7 +1392,7 @@ public class kelondroRecords {
                    USAGE.FREEC--;
                    // take link
                    if (USAGE.FREEH.index == NUL) {
-                        System.out.println("INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records. Affected file: " + filename);
+                        serverLog.logSevere("kelondroRecords/" + filename, "INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records.");
                        // try to heal..
                        USAGE.USEDC = USAGE.allCount() + 1;
                        USAGE.FREEC = 0;
@ -1402,10 +1402,17 @@ public class kelondroRecords {
                        //System.out.println("*DEBUG* ALLOCATED DELETED INDEX " + index);
                        // check for valid seek position
                        long seekp = seekpos(USAGE.FREEH);
-                        if (seekp > entryFile.length()) throw new kelondroException("new Handle: seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
-                        
-                        // read link to next element of FREEH chain
-                        USAGE.FREEH.index = entryFile.readInt(seekp);
+                        if (seekp > entryFile.length()) {
+                            // this is a severe inconsistency. try to heal..
+                            serverLog.logSevere("kelondroRecords/" + filename, "new Handle: lost " + USAGE.FREEC + " marked nodes; seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
+                            index = USAGE.allCount(); // a place at the end of the file
+                            USAGE.USEDC += USAGE.FREEC; // to avoid that non-empty records at the end are overwritten
+                            USAGE.FREEC = 0; // discard all possible empty nodes
+                            USAGE.FREEH.index = NUL;
+                        } else {
+                            // read link to next element of FREEH chain
+                            USAGE.FREEH.index = entryFile.readInt(seekp);
+                        }
                    }
                    USAGE.write();
                }
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@ -49,7 +49,6 @@ import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@ -66,6 +65,8 @@ import java.util.TreeMap;
 import java.util.TreeSet;

 import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterImageEntry;
+import de.anomic.index.indexRWIEntryNew;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroMSetTools;
@ -114,20 +115,123 @@ public final class plasmaCondenser {
    public int RESULT_NUMB_WORDS = -1;
    public int RESULT_DIFF_WORDS = -1;
    public int RESULT_SIMI_WORDS = -1;
-    public int RESULT_WORD_ENTROPHY = -1;
    public int RESULT_NUMB_SENTENCES = -1;
    public int RESULT_DIFF_SENTENCES = -1;
    public int RESULT_SIMI_SENTENCES = -1;
    public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
    
-    public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException {
+    public plasmaCondenser(plasmaParserDocument document, boolean addMedia) throws UnsupportedEncodingException {
+        // delegates to the text constructor and then adds the words from titles, abstract, section titles,
+        // anchors and media links; added media words are flagged with the appropriate media flag
+        // NOTE(review): addMedia is not evaluated in this constructor; media links are always added
        this(document.getText(), document.getCharset());
+        
+        kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
+        // construct flag set for document
        if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
        if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
        if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
        if (document.getApplinks().size()   > 0) RESULT_FLAGS.set(flag_cat_hasapp,   true);
+        
+        // the phrase counter:
+        // phrase   0 are words taken from the URL
+        // phrase   1 is the MainLongTitle
+        // phrase   2 is the MainShortTitle
+        // phrase   3 is the Document Abstract
+        // phrase   4 is the Document Author
+        // phrase   5 are the tags specified in document
+        // phrase  10 and above are the section headlines/titles (88 possible)
+        // phrase  98 is taken from the embedded anchor/hyperlinks description
+        // phrase  99 is taken from the media Link url and anchor description
+        // phrase 100 and above are lines from the text
+      
+        insertTextToWords(document.getMainLongTitle(),  1, indexRWIEntryNew.flag_app_descr, wflags);
+        insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
+        insertTextToWords(document.getAbstract(),       3, indexRWIEntryNew.flag_app_descr, wflags);
+        // missing: author!
+        // missing: tags!
+        String[] titles = document.getSectionTitles();
+        for (int i = 0; i < titles.length; i++) { // the loop must be bounded by the index i, not by a constant
+            insertTextToWords(titles[i], i + 10, indexRWIEntryNew.flag_app_emphasized, wflags);
+        }
+        
+        // anchors
+        Iterator i = document.getAnchors().entrySet().iterator();
+        Map.Entry entry;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 98, indexRWIEntryNew.flag_app_url, wflags);
+            insertTextToWords((String) entry.getValue(), 98, indexRWIEntryNew.flag_app_url, wflags);
+        }
+        
+        // audio
+        i = document.getAudiolinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
+        }
+
+        // video
+        i = document.getVideolinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
+        }
+
+        // applications
+        i = document.getApplinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
+        }
+
+        // images
+        i = document.getImages().iterator();
+        htmlFilterImageEntry ientry;
+        while (i.hasNext()) {
+            ientry = (htmlFilterImageEntry) i.next();
+            insertTextToWords((String) ientry.url().toNormalform(), 99, flag_cat_hasimage, wflags);
+            insertTextToWords((String) ientry.alt(), 99, flag_cat_hasimage, wflags);
+        }
+        
+        // finally check all words for missing flag entry
+        i = words.entrySet().iterator();
+        wordStatProp wprop;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            wprop = (wordStatProp) entry.getValue();
+            if (wprop.flags == null) {
+                wprop.flags = (kelondroBitfield) wflags.clone();
+                words.put(entry.getKey(), wprop);
+            }
+        }
    }
    
+    private void insertTextToWords(String text, int phrase, int flagpos, kelondroBitfield flagstemplate) {
+        // enumerates the words of text with a sievedWordsEnum and sets the bit flagpos in the flags of each word;
+        // words that are not yet in the words map are created with the phrase number 'phrase',
+        // words without flags get a clone of flagstemplate first
+        if (text == null) return; // nothing to insert; avoids a NullPointerException on text.getBytes
+        sievedWordsEnum wordenum;
+        try {
+            // encode with the same charset that the word enumeration is told to decode with
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")), "UTF-8", 3);
+        } catch (UnsupportedEncodingException e) {
+            return;
+        }
+        int pip = 0; // position of the word within this text
+        while (wordenum.hasMoreElements()) {
+            String word = ((String) wordenum.nextElement()).toLowerCase();
+            wordStatProp wprop = (wordStatProp) words.get(word);
+            if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
+            if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
+            wprop.numOfPhrase = 1; // NOTE(review): this overrides the phrase number given to the constructor above - confirm that 'phrase' is not intended here
+            wprop.posInPhrase = pip;
+            wprop.flags.set(flagpos, true);
+            words.put(word, wprop);
+            pip++;
+        }
+    }
+
    public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
        this(text, charset, 3, 2);
    }
@ -174,18 +278,19 @@ public final class plasmaCondenser {
    }

    public Map words() {
-        // returns the words as wod/wordStatProp relation map
+        // returns the words as word/wordStatProp relation map
        return words;
    }
    
    public static class wordStatProp {
        // object carries statistics for words and sentences
        
-        public int count;       // number of occurrences
-        public int posInText;   // unique handle, is initialized with word position (excluding double occurring words)
-        public int posInPhrase; //
-        public int numOfPhrase;
-        public HashSet hash;    //
+        public int              count;       // number of occurrences
+        public int              posInText;   // unique handle, is initialized with word position (excluding double occurring words)
+        public int              posInPhrase; // position of word in phrase
+        public int              numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
+        public HashSet          hash;        // a set of handles to all sentences where this word appears
+        public kelondroBitfield flags;       // the flag bits for each word

        public wordStatProp(int handle, int pip, int nop) {
            this.count = 1;
@ -193,6 +298,7 @@ public final class plasmaCondenser {
            this.posInPhrase = pip;
            this.numOfPhrase = nop;
            this.hash = new HashSet();
+            this.flags = null;
        }

        public void inc() {
@ -314,7 +420,7 @@ public final class plasmaCondenser {
                } else {
                    // word does not yet exist, create new word entry
                    wordHandle = wordHandleCount++;
-                    wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
+                    wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 100);
                }
                words.put(word, wsp);
                // we now have the unique handle of the word, put it into the sentence:
@ -429,7 +535,6 @@ public final class plasmaCondenser {
        this.RESULT_NUMB_WORDS = allwordcounter;
        this.RESULT_DIFF_WORDS = wordHandleCount;
        this.RESULT_SIMI_WORDS = words.size();
-        this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter);
        this.RESULT_NUMB_SENTENCES = allsentencecounter;
        this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
        this.RESULT_SIMI_SENTENCES = sentences.size();
@ -508,6 +613,7 @@ public final class plasmaCondenser {
        return orderedSentences;
    }

+    /*
    public void writeMapToFile(File out) throws IOException {
        Map.Entry entry;
        String k;
@ -520,7 +626,7 @@ public final class plasmaCondenser {
        // we reconstruct the word hashtable
        // and sort the entries by the number of occurrences
        // this structure is needed to print out a sorted list of words
-        TreeMap sortedWords = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
+        TreeMap sortedWords = new TreeMap(); //kelondroNaturalOrder.naturalOrder
        it = words.entrySet().iterator(); // enumerates the keys in ascending order
        while (it.hasNext()) {
            entry = (Map.Entry) it.next();
@ -549,7 +655,7 @@ public final class plasmaCondenser {
        }
        writer.close();
    }
-
+*/
    public final static boolean invisible(char c) {
        // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
        if ((c < ' ') || (c > 'z')) return true;
@ -771,16 +877,22 @@ public final class plasmaCondenser {
        
    }

-    public static Map getWords(InputStream input, String charset) throws UnsupportedEncodingException {
-        if (input == null) return null;
-        plasmaCondenser condenser = new plasmaCondenser(input, charset);
-        return condenser.words;        
-    }
-    
    public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {
+        // returns the word/wordStatProp relation map of the given text, or null if text is null
        if (text == null) return null;
        ByteArrayInputStream buffer = new ByteArrayInputStream(text);
-        return getWords(buffer, charset);
+        return new plasmaCondenser(buffer, charset, 2, 1).words();
+    }
+    
+    public static Map getWords(String text) {
+        // returns a word/wordStatProp relation map, or null if text is null or UTF-8 is not supported
+        if (text == null) return null;
+        try {
+            // encode as UTF-8, because the condenser is told to decode the bytes as UTF-8
+            ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes("UTF-8"));
+            return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
+        } catch (UnsupportedEncodingException e) {
+            return null;
+        }
    }
    
    public static void main(String[] args) {
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -220,6 +220,7 @@ public class plasmaParserDocument {
    
    public Map getAnchors() {
        // returns all links embedded as anchors (clickeable entities)
+        // this is a url(String)/text(String) map
        return anchors;
    }
    
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@ -52,6 +52,7 @@ import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Set;

 import de.anomic.http.httpHeader;
@ -255,27 +256,38 @@ public class plasmaSnippetCache {
            try { resContent.close(); } catch (Exception e) {/* ignore this */}
        }
        if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
-                
-        //System.out.println("loaded document for URL " + url);
-        final Enumeration sentences = document.getSentences(pre);
-        document.close();
-        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
-        if (sentences == null) {
-            //System.out.println("found no sentences in url " + url);
-            return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
-        }
-
+        
+        
        /* ===========================================================================
         * COMPUTE SNIPPET
         * =========================================================================== */        
        // we have found a parseable non-empty file: use the lines
-        line = computeSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
-        //System.out.println("loaded snippet for URL " + url + ": " + line);
+
+        // compute snippet from text
+        final Enumeration sentences = document.getSentences(pre);
+        if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
+        String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
+        
+        // compute snippet from media
+        String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+        String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
+        String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
+        //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
+        //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+        
+        line = "";
+        if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
+        if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
+        if (appline   != null) line += (line.length() == 0) ? appline   : "<br />" + appline;
+        //if (hrefline  != null) line += (line.length() == 0) ? hrefline  : "<br />" + hrefline;
+        if (textline  != null) line += (line.length() == 0) ? textline  : "<br />" + textline;
+        
        if (line == null) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
        if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);

        // finally store this snippet in our own cache
        storeToCache(wordhashes, urlhash, line);
+        document.close();
        return new Snippet(line, source, null);
    }

@ -366,7 +378,32 @@ public class plasmaSnippetCache {
        return (String) snippetsCache.get(key);
    }
    
-    private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
+    private String computeMediaSnippet(Map media, Set queryhashes) {
+        // builds a snippet of html links from a url(String)/description(String) map.
+        // an entry is listed if removeAppearanceHashes leaves no hash of queryhashes
+        // after it was applied to the url and, if needed, to the description.
+        // returns the links separated by "<br />", or null if no entry was listed
+        // NOTE(review): url and description are written into the html unescaped - confirm that this is handled elsewhere
+        Iterator i = media.entrySet().iterator();
+        Map.Entry entry;
+        String url, desc;
+        Set s;
+        StringBuffer result = new StringBuffer();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            url = (String) entry.getKey();
+            desc = (String) entry.getValue();
+            s = removeAppearanceHashes(url, queryhashes);
+            // the description is only consulted if the url alone leaves hashes over
+            if ((s.size() != 0) && (desc != null)) s = removeAppearanceHashes(desc, s);
+            if (s.size() != 0) continue;
+            if (result.length() > 0) result.append("<br />");
+            result.append("<a href=\"").append(url).append("\">");
+            // fall back to the url as link text if there is no description
+            result.append(((desc == null) || (desc.length() == 0)) ? url : desc).append("</a>");
+        }
+        if (result.length() == 0) return null;
+        return result.toString();
+    }
+    
+    private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
        try {
            if (sentences == null) return null;
            if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@ -404,20 +441,43 @@ public class plasmaSnippetCache {
                    shortLineLength = ((String) sb.get(i)).length();
                }
            }
+            
            // find a first result
-            String result = (String) sb.get(shortLineIndex);
-            // remove all hashes that appear in the result
-            hs = hashSentence(result);
+            String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength);
+            Set remaininghashes = removeAppearanceHashes(result, queryhashes);
+
+            if (remaininghashes.size() == 0) return result;
+            // the result has not all words in it.
+            // find another sentence that represents the missing other words
+            // and find recursively more sentences
+            maxLength = maxLength - result.length();
+            if (maxLength < 20) maxLength = 20;
+            String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength);
+            if (nextSnippet == null) return null;
+            return result + (" / " + nextSnippet);
+        } catch (IndexOutOfBoundsException e) {
+            log.logSevere("computeSnippet: error with string generation", e);
+            return "";
+        }
+    }
+    
+    private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
+        try {
+            if (sentence == null) return null;
+            if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
+            Iterator j;
+            HashMap hs;
+            String hash;
+            
+            // find all hashes that appear in the sentence
+            hs = hashSentence(sentence);
            j = queryhashes.iterator();
            Integer pos;
-            Set remaininghashes = new HashSet();
-            int p, minpos = result.length(), maxpos = -1;
+            int p, minpos = sentence.length(), maxpos = -1;
            while (j.hasNext()) {
                hash = (String) j.next();
                pos = (Integer) hs.get(hash);
-                if (pos == null) {
-                    remaininghashes.add(new String(hash));
-                } else {
+                if (pos != null) {
                    p = pos.intValue();
                    if (p > maxpos) maxpos = p;
                    if (p < minpos) minpos = p;
@ -425,51 +485,62 @@ public class plasmaSnippetCache {
            }
            // check result size
            maxpos = maxpos + 10;
-            if (maxpos > result.length()) maxpos = result.length();
+            if (maxpos > sentence.length()) maxpos = sentence.length();
            if (minpos < 0) minpos = 0;
            // we have a result, but is it short enough?
            if (maxpos - minpos + 10 > maxLength) {
                // the string is too long, even if we cut at both ends
                // so cut here in the middle of the string
-                int lenb = result.length();
-                result = result.substring(0, (minpos + 20 > result.length()) ? result.length() : minpos + 20).trim() +
+                int lenb = sentence.length();
+                sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
                " [..] " +
-                result.substring((maxpos + 26 > result.length()) ? result.length() : maxpos + 26).trim();
-                maxpos = maxpos + lenb - result.length() + 6;
+                sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
+                maxpos = maxpos + lenb - sentence.length() + 6;
            }
            if (maxpos > maxLength) {
                // the string is too long, even if we cut it at the end
                // so cut it here at both ends at once
                int newlen = maxpos - minpos + 10;
                int around = (maxLength - newlen) / 2;
-                result = "[..] " + result.substring(minpos - around, ((maxpos + around) > result.length()) ? result.length() : (maxpos + around)).trim() + " [..]";
+                sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
                minpos = around;
-                maxpos = result.length() - around - 5;
+                maxpos = sentence.length() - around - 5;
            }
-            if (result.length() > maxLength) {
-                // trim result, 1st step (cut at right side)
-                result = result.substring(0, maxpos).trim() + " [..]";
+            if (sentence.length() > maxLength) {
+                // trim sentence, 1st step (cut at right side)
+                sentence = sentence.substring(0, maxpos).trim() + " [..]";
            }
-            if (result.length() > maxLength) {
-                // trim result, 2nd step (cut at left side)
-                result = "[..] " + result.substring(minpos).trim();
+            if (sentence.length() > maxLength) {
+                // trim sentence, 2nd step (cut at left side)
+                sentence = "[..] " + sentence.substring(minpos).trim();
            }
-            if (result.length() > maxLength) {
-                // trim result, 3rd step (cut in the middle)
-                result = result.substring(6, 20).trim() + " [..] " + result.substring(result.length() - 26, result.length() - 6).trim();
+            if (sentence.length() > maxLength) {
+                // trim sentence, 3rd step (cut in the middle)
+                sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
            }
-            if (queryhashes.size() == 0) return result;
-            // the result has not all words in it.
-            // find another sentence that represents the missing other words
-            // and find recursively more sentences
-            maxLength = maxLength - result.length();
-            if (maxLength < 20) maxLength = 20;
-            String nextSnippet = computeSnippet(sentences, remaininghashes, minLength, maxLength);
-            return result + ((nextSnippet == null) ? "" : (" / " + nextSnippet));
+            return sentence;
        } catch (IndexOutOfBoundsException e) {
            log.logSevere("computeSnippet: error with string generation", e);
-            return "";
+            return null;
+        }
+    }
+    
+    private Set removeAppearanceHashes(String sentence, Set queryhashes) {
+        // collects the query hashes that do NOT occur in the sentence;
+        // without a sentence nothing can be ruled out, so the given set itself is returned
+        if (sentence == null) return queryhashes;
+        final HashMap positions = hashSentence(sentence);
+        final Set missing = new HashSet();
+        String candidate;
+        Integer position;
+        for (Iterator it = queryhashes.iterator(); it.hasNext();) {
+            candidate = (String) it.next();
+            position = (Integer) positions.get(candidate);
+            if (position != null) continue; // the hash appears in the sentence
+            // absent hashes are stored as fresh String copies
+            missing.add(new String(candidate));
         }
+        return missing;
     }
    
    private HashMap hashSentence(String sentence) {
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -1576,7 +1576,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                
                checkInterruption();
                log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
-                plasmaCondenser condenser = new plasmaCondenser(document);
+                plasmaCondenser condenser = new plasmaCondenser(document, true);
                
                // generate citation reference
                Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
@ -1586,6 +1586,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                    checkInterruption();
                    
                    // create a new loaded URL db entry
+                    long ldate = System.currentTimeMillis();
                    indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
                            entry.url(),                               // URL
                            docDescription,                            // document description
@ -1594,7 +1595,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                            "",                                        // ETag
                            docDate,                                   // modification date
                            new Date(),                                // loaded date
-                            new Date(),                                // freshdate 
+                            new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula 
                            referrerUrlHash,                           // referer hash
                            new byte[0],                               // md5
                            (int) entry.size(),                        // size
@ -1655,16 +1656,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                             * STORE PAGE INDEX INTO WORD INDEX DB
                             * ======================================================================== */
                            words = wordIndex.addPageIndex(
-                                    entry.url(),                                            // document url
-                                    urlHash,                                                // document url hash
-                                    docDate,                                                // document mod date
-                                    (int) entry.size(),                                     // document size
-                                    document,                                               // document content
-                                    condenser,                                              // document condenser
+                                    entry.url(),                                  // document url
+                                    urlHash,                                      // document url hash
+                                    docDate,                                      // document mod date
+                                    (int) entry.size(),                           // document size
+                                    document,                                     // document content
+                                    condenser,                                    // document condenser
                                    plasmaURL.language(entry.url()),              // document language
                                    plasmaURL.docType(document.getMimeType()),    // document type
-                                    ioLinks[0].intValue(),                                  // outlinkSame
-                                    ioLinks[1].intValue()                                   // outlinkOthers
+                                    ioLinks[0].intValue(),                        // outlinkSame
+                                    ioLinks[1].intValue()                         // outlinkOthers
                            );
                        } else {
                            /* ========================================================================
@ -1704,7 +1705,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                                            newEntry.size(),
                                            docDate.getTime(),
                                            System.currentTimeMillis(),
-                                            condenser.RESULT_WORD_ENTROPHY,
                                            language,
                                            doctype,
                                            ioLinks[0].intValue(),
@ -1749,7 +1749,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                            }
                            
                            tmpContainers = null;
-                        }
+                        } //end: SEND PAGE INDEX TO STORAGE PEER
+                        
                        storageEndTime = System.currentTimeMillis();
                        
                        //increment number of indexed urls
@ -2253,7 +2254,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            // get the word set
            Set words = null;
            try {
-                words = new plasmaCondenser(document).words().keySet();
+                words = new plasmaCondenser(document, true).words().keySet();
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -251,22 +251,21 @@ public final class plasmaWordIndex implements indexRI {
        // this is called by the switchboard to put in a new page into the index
        // use all the words in one condenser object to simultanous create index entries
        
-        // iterate over all words
+        int wordCount = 0;
+        int urlLength = url.toString().length();
+        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
+        
+        // iterate over all words of context text
        Iterator i = condenser.words().entrySet().iterator();
        Map.Entry wentry;
        String word;
        indexRWIEntry ientry;
        plasmaCondenser.wordStatProp wprop;
-        String wordHash;
-        int urlLength = url.toString().length();
-        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
-        
        while (i.hasNext()) {
            wentry = (Map.Entry) i.next();
            word = (String) wentry.getKey();
            wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
-            // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
-            wordHash = plasmaCondenser.word2hash(word);
+            assert (wprop.flags != null);
            ientry = new indexRWIEntryNew(urlHash,
                        urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
                        wprop.count,
@ -279,16 +278,15 @@ public final class plasmaWordIndex implements indexRI {
                        size,
                        urlModified.getTime(),
                        System.currentTimeMillis(),
-                        condenser.RESULT_WORD_ENTROPHY,
                        language,
                        doctype,
                        outlinksSame, outlinksOther,
-                        condenser.RESULT_FLAGS);
-            addEntry(wordHash, ientry, System.currentTimeMillis(), false);
+                        wprop.flags);
+            addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false);
+            wordCount++;
        }
-        // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
-        // condenser.getWords().size() + " words, flushed " + c + " entries");
-        return condenser.RESULT_SIMI_WORDS;
+        
+        return wordCount;
    }

    public indexContainer getContainer(String wordHash, Set urlselection, long maxTime) {