tried to add word position to index

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1377 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · 0371494010
parent f1cfee7703
commit 0371494010
10 changed files with 247 additions and 159 deletions
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@ -442,17 +442,20 @@ public class IndexControl_p {
                final Iterator en = index.elements(true);
                result.append("URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span><br><br>");
                result.append("<form action=\"IndexControl_p.html\" method=\"post\" enctype=\"multipart/form-data\">");
-                String us, uh;
+                String us;
+                String uh[] = new String[2];
                int i = 0;

                final TreeMap tm = new TreeMap();
+                plasmaWordIndexEntry xi;
                while (en.hasNext()) {
-                    uh = ((plasmaWordIndexEntry)en.next()).getUrlHash();
+                    xi = (plasmaWordIndexEntry) en.next();
+                    uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())};
                    try {
-                        us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString();
+                        us = switchboard.urlPool.loadedURL.getEntry(uh[0]).url().toString();
                        tm.put(us, uh);
                    } catch (IOException e) {
-                        tm.put(uh, uh);
+                        tm.put(uh[0], uh);
                    }
                }

@ -460,15 +463,15 @@ public class IndexControl_p {
                result.ensureCapacity((tm.size() + 2) * 384);
                while (iter.hasNext()) {
                    us = iter.next().toString();
-                    uh = (String)tm.get(us);
-                    result.append("<input type=\"checkbox\" name=\"urlhx").append(i++).append("\" value=\"").append(uh).append("\" align=\"top\">");
-                    if (us.equals(uh)) {
-                        result.append("<span class=\"tt\">").append(uh).append("&nbsp;&lt;unresolved URL Hash&gt;</span><br>");
+                    uh = (String[]) tm.get(us);
+                    result.append("<input type=\"checkbox\" name=\"urlhx").append(i++).append("\" value=\"").append(uh[0]).append("\" align=\"top\">");
+                    if (us.equals(uh[0])) {
+                        result.append("<span class=\"tt\">").append(uh[0]).append("&nbsp;&lt;unresolved URL Hash&gt;</span><br>");
                    } else {
                        result.append("<a href=\"/IndexControl_p.html?").append("keystring=").append(keystring)
-                              .append("&keyhash=").append(keyhash).append("&urlhash=").append(uh)
+                              .append("&keyhash=").append(keyhash).append("&urlhash=").append(uh[0])
                              .append("&urlstringsearch=").append("&urlstring=").append(us).append("\" class=\"tt\">")
-                              .append(uh).append("</a><span class=\"tt\">&nbsp;").append(us).append("</span><br>");
+                              .append(uh[0]).append("</a><span class=\"tt\">&nbsp;").append(us).append(", pos=").append(uh[1]).append("</span><br>");
                    }
                }
                result.append("<input type=\"hidden\" name=\"keystring\" value=\"").append(keystring).append("\">")
--- a/source/de/anomic/kelondro/kelondroNaturalOrder.java
+++ b/source/de/anomic/kelondro/kelondroNaturalOrder.java
@ -51,7 +51,7 @@ public class kelondroNaturalOrder extends kelondroAbstractOrder implements kelon
    
    boolean asc;
    
-    public static final Comparator naturalOrder = new kelondroNaturalOrder(true);
+    public static final kelondroOrder naturalOrder = new kelondroNaturalOrder(true);
    
    public kelondroNaturalOrder(boolean ascending) {
        this.asc = ascending;
@ -79,6 +79,23 @@ public class kelondroNaturalOrder extends kelondroAbstractOrder implements kelon
        return c;
    }

+    public static byte[] encodeLong(long c, int length) {
+        byte[] b = new byte[length];
+        while (length > 0) {
+            b[--length] = (byte) (c & 0xFF);
+            c >>= 8;
+        }
+        return b;
+    }
+
+    public static long decodeLong(byte[] s) {
+        long c = 0;
+        int p = 0;
+        while ((p < 8) && (p < s.length)) c = (c << 8) | ((long) s[p++] & 0xFF);
+        return c;
+    }
+
+
    // Compares its two arguments for order.
    // Returns -1, 0, or 1 as the first argument
    // is less than, equal to, or greater than the second.
--- a/source/de/anomic/kelondro/kelondroSplittedTree.java
+++ b/source/de/anomic/kelondro/kelondroSplittedTree.java
@ -58,11 +58,11 @@ public class kelondroSplittedTree implements kelondroIndex {
    private static File dbFile(File path, String filenameStub, int forkfactor, int columns, int number) {
        String ns = Integer.toHexString(number).toUpperCase();
        while (ns.length() < 2) ns = "0" + ns;
-        String fs = Integer.toHexString(forkfactor).toUpperCase();
-        while (fs.length() < 2) fs = "0" + fs;
-        String cs = Integer.toHexString(columns).toUpperCase();
-        while (cs.length() < 2) cs = "0" + cs;
-        return new File(path, filenameStub + "_" + ns + "." + fs + cs + ".ktf");
+        String ff = Integer.toHexString(forkfactor).toUpperCase();
+        while (ff.length() < 2) ff = "0" + ff;
+        String co = Integer.toHexString(columns).toUpperCase();
+        while (co.length() < 2) co = "0" + co;
+        return new File(path, filenameStub + "." + ff + "." + co + "." + ns + ".ktc");
    }

    private static boolean existsAll(File pathToFiles, String filenameStub, int forkfactor, int columns){
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@ -116,20 +116,70 @@ public final class plasmaCondenser {
    public int wordCount(String word) {
        // number of occurrences of one word
        // if the word did not occur, this simply returns 0
-        statProp sp = (statProp) words.get(word);
-        if (sp == null)
-            return 0;
+        wordStatProp sp = (wordStatProp) words.get(word);
+        if (sp == null) return 0;
        return sp.count;
    }

-    public static class statProp {
-        public int count;
+    public int wordPositionInText(String word) {
+        // position of word in text
+        // if unknown and word does not exist, the method returns 0
+        wordStatProp sp = (wordStatProp) words.get(word);
+        if (sp == null) return 0;
+        return sp.posInText;
+    }
+
+    public int wordPositionInPhrase(String word) {
+        // position of the word within its phrase (sentence)
+        // if unknown and word does not exist, the method returns 0
+        wordStatProp sp = (wordStatProp) words.get(word);
+        if (sp == null) return 0;
+        return sp.posInPhrase;
+    }
+
+    public int wordNumberOfPhrase(String word) {
+        // position of word in text
+        // if unknown and word does not exist, the method returns 0
+        wordStatProp sp = (wordStatProp) words.get(word);
+        if (sp == null) return 0;
+        return sp.numOfPhrase;
+    }
+
+    public static class wordStatProp {
+        // object carries statistics for a single word
+        
+        public int count;       // number of occurrences
+        public int posInText;   // unique handle, is initialized with word position (excluding double occurring words)
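+        public int posInPhrase; // position of the word within its phrase at first occurrence (counting starts at 1)
+        public int numOfPhrase; // number of the phrase in which the word first occurred
+        public HashSet hash;    // handles (as decimal strings) of the sentences that contain this word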
+
+        public wordStatProp(int handle, int pip, int nop) {
+            this.count = 1;
+            this.posInText = handle;
+            this.posInPhrase = pip;
+            this.numOfPhrase = nop;
+            this.hash = new HashSet();
+        }

-        public int handle;
+        public void inc() {
+            count++;
+        }

-        public HashSet hash;
+        public void check(int i) {
+            hash.add(Integer.toString(i));
+        }

-        public statProp(int handle) {
+    }
+    
+    public static class phraseStatProp {
+        // object carries statistics for a single phrase (sentence)
+        
+        public int count;       // number of occurrences
+        public int handle;      // unique handle, is initialized with sentence counter
+        public HashSet hash;    //
+
+        public phraseStatProp(int handle) {
            this.count = 1;
            this.handle = handle;
            this.hash = new HashSet();
@ -145,6 +195,7 @@ public final class plasmaCondenser {

    }

+
    public String intString(int number, int length) {
        String s = Integer.toString(number);
        while (s.length() < length) s = "0" + s;
@ -160,13 +211,15 @@ public final class plasmaCondenser {
        String word = "";
        String k;
        int wordlen;
-        statProp sp, sp1;
+        wordStatProp wsp, wsp1;
+        phraseStatProp psp;
        int wordHandle;
        int wordHandleCount = 0;
        int sentenceHandleCount = 0;
        int allwordcounter = 0;
        int allsentencecounter = 0;
        int idx;
+        int wordInSentenceCounter = 1;
        Iterator it, it1;

        // read source
@ -183,43 +236,45 @@ public final class plasmaCondenser {
                    sentence.insert(0, word); // append at beginning
                    if (sentences.containsKey(sentence)) {
                        // sentence already exists
-                        sp = (statProp) sentences.get(sentence);
-                        sp.inc();
-                        idx = sp.handle;
-                        sentences.put(sentence, sp);
+                        psp = (phraseStatProp) sentences.get(sentence);
+                        psp.inc();
+                        idx = psp.handle;
+                        sentences.put(sentence, psp);
                    } else {
                        // create new sentence
                        idx = sentenceHandleCount++;
-                        sentences.put(sentence, new statProp(idx));
+                        sentences.put(sentence, new phraseStatProp(idx));
                    }
                    // store to the words a link to this sentence
                    it = currsentwords.iterator();
                    while (it.hasNext()) {
                        k = (String) it.next();
-                        sp = (statProp) words.get(k);
-                        sp.check(idx);
-                        words.put(k, sp);
+                        wsp = (wordStatProp) words.get(k);
+                        wsp.check(idx);
+                        words.put(k, wsp);
                    }
                }
                sentence = new StringBuffer(100);
                currsentwords.clear();
+                wordInSentenceCounter = 1;
            } else {
                // store word
                allwordcounter++;
                currsentwords.add(word);
                if (words.containsKey(word)) {
                    // word already exists
-                    sp = (statProp) words.get(word);
-                    wordHandle = sp.handle;
-                    sp.inc();
+                    wsp = (wordStatProp) words.get(word);
+                    wordHandle = wsp.posInText;
+                    wsp.inc();
                } else {
                    // word does not yet exist, create new word entry
                    wordHandle = wordHandleCount++;
-                    sp = new statProp(wordHandle);
+                    wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
                }
-                words.put(word, sp);
+                words.put(word, wsp);
                // we now have the unique handle of the word, put it into the sentence:
                sentence.append(intString(wordHandle, numlength));
+                wordInSentenceCounter++;
            }
        }
        // finnish last sentence
@ -227,11 +282,11 @@ public final class plasmaCondenser {
            allsentencecounter++;
            sentence.insert(0, "."); // append at beginning
            if (sentences.containsKey(sentence)) {
-                sp = (statProp) sentences.get(sentence);
-                sp.inc();
-                sentences.put(sentence, sp);
+                psp = (phraseStatProp) sentences.get(sentence);
+                psp.inc();
+                sentences.put(sentence, psp);
            } else {
-                sentences.put(sentence, new statProp(sentenceHandleCount++));
+                sentences.put(sentence, new phraseStatProp(sentenceHandleCount++));
            }
        }

@ -251,14 +306,14 @@ public final class plasmaCondenser {
                sentence = (StringBuffer) o;
                wc = (sentence.length() - 1) / numlength;
                s = new String[wc + 2];
-                sp = (statProp) sentences.get(sentence);
-                s[0] = intString(sp.count, numlength); // number of occurrences of this sentence
+                psp = (phraseStatProp) sentences.get(sentence);
+                s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
                s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
                for (int i = 0; i < wc; i++) {
                    k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
                    s[i + 2] = k;
                }
-                orderedSentences[sp.handle] = s;
+                orderedSentences[psp.handle] = s;
            }
        }

@ -270,7 +325,7 @@ public final class plasmaCondenser {
            entry = (Map.Entry) it.next();
            word = (String) entry.getKey();
            wordlen = word.length();
-            sp = (statProp) entry.getValue();
+            wsp = (wordStatProp) entry.getValue();
            for (int i = wordcut; i > 0; i--) {
                if (wordlen > i) {
                    k = word.substring(0, wordlen - i);
@ -278,20 +333,20 @@ public final class plasmaCondenser {
                        // we will delete the word 'word' and repoint the
                        // corresponding links
                        // in sentences that use this word
-                        sp1 = (statProp) words.get(k);
-                        it1 = sp.hash.iterator(); // we iterate over all sentences that refer to this word
+                        wsp1 = (wordStatProp) words.get(k);
+                        it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
                        while (it1.hasNext()) {
                            idx = Integer.parseInt((String) it1.next()); // number of a sentence
                            s = (String[]) orderedSentences[idx];
                            for (int j = 2; j < s.length; j++) {
-                                if (s[j].equals(intString(sp.handle, numlength)))
-                                    s[j] = intString(sp1.handle, numlength);
+                                if (s[j].equals(intString(wsp.posInText, numlength)))
+                                    s[j] = intString(wsp1.posInText, numlength);
                            }
                            orderedSentences[idx] = s;
                        }
                        // update word counter
-                        sp1.count = sp1.count + sp.count;
-                        words.put(k, sp1);
+                        wsp1.count = wsp1.count + wsp.count;
+                        words.put(k, wsp1);
                        // remove current word
                        it.remove();
                        continue wordsearch;
@ -311,16 +366,16 @@ public final class plasmaCondenser {
                sentence.append(((String[]) orderedSentences[i])[j]);
            if (sentences.containsKey(sentence)) {
                // add sentence counter to counter of found sentence
-                sp = (statProp) sentences.get(sentence);
-                sp.count = sp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
-                sentences.put(sentence, sp);
+                psp = (phraseStatProp) sentences.get(sentence);
+                psp.count = psp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
+                sentences.put(sentence, psp);
                // System.out.println("Found double occurring sentence " + i + "
                // = " + sp.handle);
            } else {
                // create new sentence entry
-                sp = new statProp(i);
-                sp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]);
-                sentences.put(sentence, sp);
+                psp = new phraseStatProp(i);
+                psp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]);
+                sentences.put(sentence, psp);
            }
        }

@ -351,7 +406,7 @@ public final class plasmaCondenser {
        // and order the entries by the number of the sentence
        // this structure is only needed to reconstruct the text
        String word;
-        statProp sp;
+        wordStatProp wsp;
        Map.Entry entry;
        Iterator it;
        String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack...
@ -359,8 +414,8 @@ public final class plasmaCondenser {
        while (it.hasNext()) {
            entry = (Map.Entry) it.next();
            word = (String) entry.getKey();
-            sp = (statProp) entry.getValue();
-            orderedWords[sp.handle] = word;
+            wsp = (wordStatProp) entry.getValue();
+            orderedWords[wsp.posInText] = word;
        }

        Object[] orderedSentences = makeOrderedSentences();
@ -388,7 +443,7 @@ public final class plasmaCondenser {
        // this structure is needed to present the strings in the right order in a printout
        int wc;
        Iterator it;
-        statProp sp;
+        phraseStatProp psp;
        String[] s;
        StringBuffer sentence;
        Object[] orderedSentences = new Object[sentences.size()];
@ -399,12 +454,12 @@ public final class plasmaCondenser {
            sentence = (StringBuffer) it.next();
            wc = (sentence.length() - 1) / numlength;
            s = new String[wc + 2];
-            sp = (statProp) sentences.get(sentence);
-            s[0] = intString(sp.count, numlength); // number of occurrences of this sentence
+            psp = (phraseStatProp) sentences.get(sentence);
+            s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
            s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
            for (int i = 0; i < wc; i++)
                s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
-            orderedSentences[sp.handle] = s;
+            orderedSentences[psp.handle] = s;
        }
        return orderedSentences;
    }
@ -414,7 +469,7 @@ public final class plasmaCondenser {
        String k;
        String word;
        Iterator it;
-        statProp sp;
+        wordStatProp wsp;

        Object[] orderedSentences = makeOrderedSentences();

@ -426,8 +481,8 @@ public final class plasmaCondenser {
        while (it.hasNext()) {
            entry = (Map.Entry) it.next();
            word = (String) entry.getKey();
-            sp = (statProp) entry.getValue();
-            sortedWords.put(intString(sp.count, numlength) + intString(sp.handle, numlength), word);
+            wsp = (wordStatProp) entry.getValue();
+            sortedWords.put(intString(wsp.count, numlength) + intString(wsp.posInText, numlength), word);
        }

        // start writing of words and sentences
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -1316,14 +1316,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                            
                            // iterate over all words
                            Iterator i = condenser.getWords().iterator();
-                            int p = 0;
                            while (i.hasNext()) {
                                String word = (String) i.next();
-                                int count = condenser.wordCount(word);
                                String wordHash = plasmaWordIndexEntry.word2hash(word);
                                plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
-                                plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0,
-                                        plasmaWordIndex.microDateDays(docDate), quality, language, doctype, true);
+                                plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
+                                                                                             condenser.wordCount(word),
+                                                                                             condenser.wordPositionInText(word),
+                                                                                             condenser.wordPositionInPhrase(word),
+                                                                                             condenser.wordNumberOfPhrase(word),
+                                                                                             docDate.getTime(),
+                                                                                             quality, language, doctype, true);
                                wordIdxEntity.addEntry(wordIdxEntry);
                                tmpEntities.add(wordIdxEntity);
                                // wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -109,55 +109,65 @@ public final class plasmaWordIndex {
    private static final int day  = 86400000;
    
    public static int microDateDays(Date modified) {
-	// this calculates a virtual age from a given date
-	// the purpose is to have an age in days of a given modified date
-	// from a fixed standpoint in the past
-	// one day has 60*60*24 seconds = 86400 seconds
-	// we take mod 64**3 = 262144, this is the mask of the storage
-	return (int) ((modified.getTime() / day) % 262144);
+        return microDateDays(modified.getTime());
    }
    
+    public static int microDateDays(long modified) {
+        // this calculates a virtual age from a given date
+        // the purpose is to have an age in days of a given modified date
+        // from a fixed standpoint in the past
+        // one day has 60*60*24 seconds = 86400 seconds
+        // we take mod 64**3 = 262144, this is the mask of the storage
+        return (int) ((modified / day) % 262144);
+    }
+        
    public static String microDateHoursStr(long time) {
-	return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
+        return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
    }
    
    public static int microDateHoursInt(long time) {
-	return (int) ((time / hour) % 262144);
+        return (int) ((time / hour) % 262144);
    }
    
    public static int microDateHoursAge(String mdhs) {
        return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs);
    }
    
-    public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser,
-                                   String language, char doctype) {
+    public static long reverseMicroDateDays(int microDateDays) {
+        return ((long) microDateDays) * ((long) day);
+    }
+    
+    public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser, String language, char doctype) {
        // this is called by the switchboard to put in a new page into the index
-	// use all the words in one condenser object to simultanous create index entries
-	int age = microDateDays(urlModified);
-	int quality = 0;
-	try {
-	    quality = condenser.RESULT_INFORMATION_VALUE;
-	} catch (NumberFormatException e) {
-	    System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
-	}
+        // use all the words in one condenser object to simultaneously create index
+        // entries
+        // int age = microDateDays(urlModified);
+        int quality = 0;
+        try {
+            quality = condenser.RESULT_INFORMATION_VALUE;
+        } catch (NumberFormatException e) {
+            System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
+        }

        // iterate over all words
-	Iterator i = condenser.getWords().iterator();
-	String word;
-	int count;
-	plasmaWordIndexEntry entry;
-	String wordHash;
-	int p = 0;
-	while (i.hasNext()) {
-	    word = (String) i.next();
-	    count = condenser.wordCount(word);
-	    //if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ": " + c);
-	    wordHash = plasmaWordIndexEntry.word2hash(word);
-	    entry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0,
-                                         age, quality, language, doctype, true);
-	    addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false);
-	}
-	//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
+        Iterator i = condenser.getWords().iterator();
+        String word;
+        plasmaWordIndexEntry entry;
+        String wordHash;
+        while (i.hasNext()) {
+            word = (String) i.next();
+            // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
+            wordHash = plasmaWordIndexEntry.word2hash(word);
+            entry = new plasmaWordIndexEntry(urlHash,
+                                             condenser.wordCount(word),
+                                             condenser.wordPositionInText(word),
+                                             condenser.wordPositionInPhrase(word),
+                                             condenser.wordNumberOfPhrase(word),
+                                             urlModified.getTime(), quality, language, doctype, true);
+            addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false);
+        }
+        // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
+        // condenser.getWords().size() + " words, flushed " + c + " entries");
        return condenser.getWords().size();
    }
    
@ -409,12 +419,15 @@ public final class plasmaWordIndex {
    }

    public static void main(String[] args) {
-//      System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
+        // System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
+        // System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));
+        
        plasmaWordIndex index = new plasmaWordIndex(new File("D:\\dev\\proxy\\DATA\\PLASMADB"), 555, new serverLog("TESTAPP"));
        Iterator iter = index.wordHashes("5A8yhZMh_Kmv", true, true);
        while (iter.hasNext()) {
            System.out.println("File: " + (String) iter.next());
        }
+        
    }

 }
--- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java
+++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java
@ -138,7 +138,7 @@ public final class plasmaWordIndexAssortment {
        for (int i = 0; i < assortmentLength; i++) {
            entry = (plasmaWordIndexEntry) entries.next();
            row[3 + 2 * i] = entry.getUrlHash().getBytes();
-	        row[4 + 2 * i] = entry.toEncodedForm(true).getBytes();
+	        row[4 + 2 * i] = entry.toEncodedForm(1).getBytes();
        }
        byte[][] oldrow = null;
        try {
--- a/source/de/anomic/plasma/plasmaWordIndexCache.java
+++ b/source/de/anomic/plasma/plasmaWordIndexCache.java
@ -153,7 +153,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
                            row[1] = kelondroRecords.long2bytes(container.size(), 4);
                            row[2] = kelondroRecords.long2bytes(updateTime, 8);
                            row[3] = wordEntry.getUrlHash().getBytes();
-                            row[4] = wordEntry.toEncodedForm(true).getBytes();
+                            row[4] = wordEntry.toEncodedForm(1).getBytes();
                            dumpArray.set((int) urlcount++, row);
                        }
                    }
--- a/source/de/anomic/plasma/plasmaWordIndexEntity.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java
@ -168,7 +168,7 @@ public final class plasmaWordIndexEntity {
    public boolean addEntry(plasmaWordIndexEntry entry) throws IOException {
        if (entry == null) return false;
    if (theTmpMap == null) {
-        return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm(false).getBytes()) == null);
+        return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm(0).getBytes()) == null);
    } else {
        return (theTmpMap.put(entry.getUrlHash(), entry) == null);
    }
--- a/source/de/anomic/plasma/plasmaWordIndexEntry.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java
@ -78,7 +78,7 @@ public final class plasmaWordIndexEntry {
    private int    posintext;   // first position of the word in text as number of word; 0=unknown or irrelevant position
    private int    posinphrase; // position within a phrase of the word
    private int    posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text
-    private int    age;         // calculated by using last-modified
+    private long   lastModified;// calculated by using last-modified
    private int    quality;     // result of a heuristic on the source file
    private byte[] language;    // essentially the country code (the TLD as heuristic), two letters lowercase only
    private char   doctype;     // type of source
@ -186,17 +186,36 @@ public final class plasmaWordIndexEntry {

    // the class instantiation can only be done by a plasmaStore method
    // therefore they are all public
-    public plasmaWordIndexEntry(String urlHash, int count, int posintext, int posinphrase, int posofphrase, int virtualage, int quality, String language, char doctype, boolean local) {
-
-    // ** hier fehlt noch als Attribut: <Wortposition im Text>, damit 'nearby' getrackt werden kann **
+    public plasmaWordIndexEntry(String urlHash,
+                                int count,        // how often this word appears in the text
+                                int posintext,
+                                int posinphrase,
+                                int posofphrase,
+                                long time,
+                                int quality,
+                                String language,
+                                char doctype, 
+                                boolean local) {

+        // more needed attributes:
+        // - int: length of text / total number of words
+        // - int: length of text / total number of sentences
+        // - long: update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
+        // - int: word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search
+        // - char: category of appearance (header, title, section, text, anchor-descr, image-tag etc)
+        // - boolean: appears in title, appears in header, appears in ....
+        // - int: url-length (shorter are better)
+        // - int: url-number of components / length of path
+        // - int: length of description tag / title tag (longer are better)
+        // - int: number of chapters
+        
    if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";
        this.urlHash = urlHash;
        this.count = count;
        this.posintext = posintext;
        this.posinphrase = posinphrase;
        this.posofphrase = posofphrase;
-        this.age = virtualage;
+        this.lastModified = time;
        this.quality = quality;
        this.language = language.getBytes();
        this.doctype = doctype;
@ -210,7 +229,7 @@ public final class plasmaWordIndexEntry {
        this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0;
        this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0;
        this.posofphrase = (code.length() >= 16) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0;
-        this.age = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6));
+        this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6)));
        this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3));
        this.language = code.substring(8, 10).getBytes();
        this.doctype = code.charAt(10);
@ -231,57 +250,31 @@ public final class plasmaWordIndexEntry {
       this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__"));
       this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__"));
       this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__"));
-       this.age = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A"));
+       this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A")));
       this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__"));
       this.language = pr.getProperty("l", "uk").getBytes();
       this.doctype = pr.getProperty("d", "u").charAt(0);
       this.localflag = pr.getProperty("f", ""+LT_LOCAL).charAt(0);
    }
-
-    private String b64save(long x, int l) {
-        try {
-            return kelondroBase64Order.enhancedCoder.encodeLong(x, l);
-        } catch (Exception e) {
-            // if x does not fit into l
-            return "________".substring(0, l);
-        }
-    }
    
-    public String toEncodedForm(boolean longAttr) {
+    public String toEncodedForm(int outputFormat) {
       // attention: this integrates NOT the URL into the encoding
       // if you need a complete dump, use toExternalForm()
-       StringBuffer buf = new StringBuffer(longAttr?18:12);
+       StringBuffer buf = new StringBuffer((outputFormat >= 1) ? 18 : 12);
       
-       buf.append(b64save(this.quality, plasmaURL.urlQualityLength))
-          .append(b64save(this.age, 3))
-          .append(b64save(this.count, 2))
+       buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength))
+          .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
+          .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.count, 2))
          .append(new String(this.language))
          .append(this.doctype)
          .append(this.localflag); // 3 + 3 + 2 + 2 + 1 + 1 = 12 bytes
           
-       if (longAttr)
-           buf.append(b64save(this.posintext, 2))
-              .append(b64save(this.posinphrase, 2))
-              .append(b64save(this.posofphrase, 2));
+       if (outputFormat >= 1)
+           buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
+              .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
+              .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2));
       
       return buf.toString();
-       
-//       String shortAttr =
-//               b64save(quality, plasmaCrawlLURL.urlQualityLength) +
-//               b64save(age, 3) +
-//               b64save(count, 2) +
-//               new String(language) +
-//               doctype +
-//               localflag; // 3 + 3 + 2 + 2 + 1 + 1 = 12 bytes
-//       if (longAttr) 
-//           return
-//               shortAttr +
-//                   b64save(posintext, 2) +
-//                   b64save(posinphrase, 2) +
-//                   b64save(posofphrase, 2);
-//       // 12 + 3 + 2 + 2 + 1 + 1 = 12 bytes
-//       else
-//           return shortAttr;
   }
    
   public String toExternalForm() {
@ -289,15 +282,15 @@ public final class plasmaWordIndexEntry {
       
       str.append("{")
           .append("h=").append(this.urlHash)
-           .append(",q=").append(b64save(this.quality, plasmaURL.urlQualityLength))
-           .append(",a=").append(b64save(this.age, 3))
-           .append(",c=").append(b64save(this.count, 2))
+           .append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength))
+           .append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
+           .append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.count, 2))
           .append(",l=").append(new String(this.language))
           .append(",d=").append(this.doctype)
           .append(",f=").append(this.localflag)
-           .append(",t=").append(b64save(this.posintext, 2))
-           .append(",r=").append(b64save(this.posinphrase, 2))
-           .append(",o=").append(b64save(this.posofphrase, 2))
+           .append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
+           .append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
+           .append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
       .append("}");
       
       return str.toString();
@ -312,7 +305,11 @@ public final class plasmaWordIndexEntry {
    }

    public int getVirtualAge() {
-        return age;
+        return plasmaWordIndex.microDateDays(lastModified);
+    }
+    
+    public long getLastModified() {
+        return lastModified;
    }
    
    public int getCount() {