enhanced indexing-caching

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@107 6c8d7289-2bf4-0310-a012-ef5d649a1542
20 years ago · ea478f3975
parent 0cfe94bb66
commit ea478f3975
7 changed files with 100 additions and 33 deletions
--- a/htroot/Performance_p.html
+++ b/htroot/Performance_p.html
@ -81,6 +81,13 @@ Changes take effect immediately</td>
    If this is a big number, it shows that the caching works efficiently.
    </td>
  </tr>
+  <tr valign="top" class="TableCellDark">
+    <td class=small>Singletons Cache Size:</td>
+    <td class=small>#[singletonsSize]#</td>
+    <td class=small>
+    The Singletons Cache is a database that holds words that occurred only once.
+    </td>
+  </tr>
  <tr valign="top" class="TableCellDark">
    <td class=small>Maximum number of Word Caches:</td>
    <td class=small><input name="wordCacheMax" type="text" size="20" maxlength="100" value="#[wordCacheMax]#"></td>
@ -90,6 +97,7 @@ Changes take effect immediately</td>
    flushed to disc; this may last some minutes.
    </td>
  </tr>
+  <!--
  <tr valign="top" class="TableCellDark">
    <td class=small>Maximum waitingtime to flush word cache<br>during shut-down (seconds):</td>
    <td class=small><input name="maxWaitingWordFlush" type="text" size="6" maxlength="6" value="#[maxWaitingWordFlush]#"></td>
@ -99,11 +107,13 @@ Changes take effect immediately</td>
    time are lost.
    </td>
  </tr>
+  -->
  <tr valign="top" class="TableCellLight">
    <td class=small colspan="3"><input type="submit" name="cacheSizeSubmit" value="Enter New Cache Size">
    Changes take effect immediately</td>
    </td>
  </tr>
+
 </form>
 </table>
 </p>
--- a/htroot/Performance_p.java
+++ b/htroot/Performance_p.java
@ -146,6 +146,7 @@ public class Performance_p {
        prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache());
        prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180"));
        prop.put("wordCacheMax", switchboard.getConfig("wordCacheMax", "10000"));
+        prop.put("singletonsSize", switchboard.wordIndex.singletonsSize());
        
        // return rewrite values for templates
        return prop;
--- a/source/de/anomic/kelondro/kelondroMScoreCluster.java
+++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java
@ -44,6 +44,7 @@ import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Iterator;
 import java.util.TreeMap;
+import java.util.Map;

 public class kelondroMScoreCluster {
    
@ -243,21 +244,22 @@ public class kelondroMScoreCluster {
    }
    
    public Iterator scores(boolean up) {
-        return scores(up, Integer.MIN_VALUE, Integer.MAX_VALUE);
+        if (up) return new simpleScoreIterator();
+        else return scores(false, Integer.MIN_VALUE, Integer.MAX_VALUE);
    }
    
    public Iterator scores(boolean up, int minScore, int maxScore) {
-        return new scoreIterator(up, minScore, maxScore);
+        return new komplexScoreIterator(up, minScore, maxScore);
    }
    
-    private class scoreIterator implements Iterator {
+    private class komplexScoreIterator implements Iterator {

        boolean up;
        TreeMap keyrefDBcopy;
        Object n;
        int min, max;
        
-        public scoreIterator(boolean up, int minScore, int maxScore) {
+        public komplexScoreIterator(boolean up, int minScore, int maxScore) {
            this.up = up;
            this.min = minScore;
            this.max = maxScore;
@ -299,6 +301,30 @@ public class kelondroMScoreCluster {
        
    }
    
+    private class simpleScoreIterator implements Iterator {
+
+        Iterator ii;
+        Map.Entry entry;
+        
+        public simpleScoreIterator() {
+            ii = keyrefDB.entrySet().iterator();
+        }
+       
+        public boolean hasNext() {
+            return ii.hasNext();
+        }
+        
+        public Object next() {
+            entry = (Map.Entry) ii.next();
+            return entry.getValue();
+        }
+        
+        public void remove() {
+            ii.remove();
+        }
+        
+    }
+        
    public static void main(String[] args) {
        System.out.println("Test for Score: start");
        kelondroMScoreCluster s = new kelondroMScoreCluster();
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -65,7 +65,7 @@ public class plasmaWordIndex {
    public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException {
        this.databaseRoot = databaseRoot;
        plasmaWordIndexClassicDB fileDB = new plasmaWordIndexClassicDB(databaseRoot, log);
-        this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, 1000000, log);
+        this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, bufferkb, log);
    }
    
    public int maxURLinWordCache() {
@ -76,6 +76,10 @@ public class plasmaWordIndex {
        return ramCache.wordCacheRAMSize();
    }
    
+    public int singletonsSize() {
+        return ramCache.singletonsSize();
+    }
+        
    public void setMaxWords(int maxWords) {
        ramCache.setMaxWords(maxWords);
    }
--- a/source/de/anomic/plasma/plasmaWordIndexCache.java
+++ b/source/de/anomic/plasma/plasmaWordIndexCache.java
@ -82,11 +82,11 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
 	for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-';
    }

-    public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, long singletonBufferSize, serverLog log) {
+    public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, int singletonbufferkb, serverLog log) {
        // creates a new index cache
        // the cache has a back-end where indexes that do not fit in the cache are flushed
        this.databaseRoot = databaseRoot;
-        this.singletonBufferSize = singletonBufferSize;
+        this.singletonBufferSize = singletonbufferkb * 1024;
        this.cache = new TreeMap();
 	this.hashScore = new kelondroMScoreCluster();
        this.hashDate  = new HashMap();
@ -132,7 +132,7 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
        long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
        synchronized (cache) {
            //Iterator i = cache.entrySet().iterator();
-            Iterator i = hashScore.scores(false);
+            Iterator i = hashScore.scores(true);
            //Map.Entry entry;
            String wordHash;
            plasmaWordIndexEntryContainer container;
@ -318,6 +318,10 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
        return cache.size();
    }
    
+    public int singletonsSize() {
+        return singletons.size();
+    }
+    
    public void setMaxWords(int maxWords) {
        this.maxWords = maxWords;
    }
@ -341,7 +345,14 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
                        true);
    }
    
-    private int flushFromMem(String key) {
+    private int flushFromMem(String key, boolean reintegrate) {
+        // this method flushes indexes out from the ram to the disc.
+        // at first we check the singleton database and act accordingly
+        // if we are to flush an index, but also see an entry in the singletons, we
+        // decide upon the 'reintegrate' flag:
+        // true: do not flush to disc, but re-Integrate the singleton to the RAM
+        // false: flush the singleton together with container to disc
+        
        plasmaWordIndexEntryContainer container = null;
        long time;
 	synchronized (cache) {
@ -358,12 +369,13 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
        // now decide where to flush that container
        Object[] singleton = readSingleton(key);
        if (singleton == null) {
+            // not found in singletons
            if (container.size() == 1) {
-                // store to singleton
+                // it is a singleton: store to singleton
                storeSingleton(key, container.getOne(), time);
                return 1;
            } else {
-                // store to back-end
+                // store to back-end; this should be a rare case
                return backend.addEntries(container, time);
            }
        } else {
@ -376,20 +388,31 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
                    // it is superfluous to flush this, simple do nothing
                    return 0;
                } else {
-                    // we flush to the backend, but remove the entry from the singletons
+                    // we flush to the backend, and the entry from the singletons
                    removeSingleton(key);
                    return backend.addEntries(container, java.lang.Math.max(time, oldTime));
                }
            } else {
-                // now we have more than one entry,
+                // now we have more than one entry
                // we must remove the key from the singleton database
                removeSingleton(key);
-                // add this to the backend
+                // .. and put it to the container
                container.add(oldEntry);
+                if (reintegrate) {
+                    // put singleton together with container back to ram
+                    synchronized (cache) {
+                        cache.put(key, container);
+                        hashScore.setScore(key, container.size());
+                        hashDate.put(key, new Long(time));
+                    }
+                    return -1;
+                } else {
+                    // add this to the backend
                    return backend.addEntries(container, java.lang.Math.max(time, oldTime));
                }
            }
        }	
+    }
    
    private boolean flushFromSingleton(String key) {
        Object[] singleton = readSingleton(key);
@ -441,31 +464,35 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
                    break;
                }
                //log.logDebug("flushing high-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size());
-                total += flushFromMem(key);
+                total += flushFromMem(key, false);
            }
            
            // flush singletons
-            while ((total < 200) && (hashScore.size() >= maxWords)) {
-                key = (String) hashScore.getMinObject();
+            Iterator i = hashScore.scores(true);
+            ArrayList al = new ArrayList();
+            while ((i.hasNext()) && (total < 200)) {
+                key = (String) i.next();
                createTime = (Long) hashDate.get(key);
                count = hashScore.getScore(key);
                if (count > 1) {
                    //log.logDebug("flush of singleton-key " + key + ": count too high (count=" + count + ")");
                    break;
                }
-                if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) {
-                    //log.logDebug("singleton-key " + key + " is too fresh, interruptiong flush (count=" + count + ", cachesize=" + cache.size()  + ", singleton-size=" + singletons.size() + ")");
-                    break;
+                if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 90000)) {
+                    //log.logDebug("singleton-key " + key + " is too fresh, interrupting flush (count=" + count + ", cachesize=" + cache.size()  + ", singleton-size=" + singletons.size() + ")");
+                    continue;
                }
                //log.logDebug("flushing singleton-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size());
-                total += flushFromMem(key);
+                al.add(key);
+                total++;
            }
+            for (int k = 0; k < al.size(); k++) flushFromMem((String) al.get(k), true);
        }
        return total;
    }
    
    public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) {
-        flushFromMem(wordHash);
+        flushFromMem(wordHash, false);
        flushFromSingleton(wordHash);
 	return backend.getIndex(wordHash, deleteIfEmpty);
    }
@ -486,13 +513,13 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
 	backend.deleteIndex(wordHash);
    }

-    public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
-        flushFromMem(wordHash);
+    public synchronized int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
+        flushFromMem(wordHash, false);
        flushFromSingleton(wordHash);
        return backend.removeEntries(wordHash, urlHashes, deleteComplete);
    }
    
-    public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
+    public synchronized int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
 	//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
        flushFromMemToLimit();
 	//if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries");
--- a/source/de/anomic/yacy/yacySearch.java
+++ b/source/de/anomic/yacy/yacySearch.java
@ -101,16 +101,15 @@ public class yacySearch extends Thread {
        int c;
        while (i.hasNext()) {
            dhtEnum = yacyCore.dhtAgent.getDHTSeeds(true, (String) i.next());
-            c = seedcount;
-            while ((dhtEnum.hasMoreElements()) && (c > 0)) {
+            c = 0;
+            while ((dhtEnum.hasMoreElements()) && (c < seedcount)) {
                seed = (yacySeed) dhtEnum.nextElement();
-                ranking.addScore(seed.hash, c);
-                c--;
+                ranking.addScore(seed.hash, c++);
            }
        }
        if (ranking.size() < seedcount) seedcount = ranking.size();
        yacySeed[] result = new yacySeed[seedcount];
-        Iterator e = ranking.scores(false);
+        Iterator e = ranking.scores(true);
        c = 0;
        while ((e.hasNext()) && (c < result.length))
            result[c++] = yacyCore.seedDB.getConnected((String) e.next());
--- a/yacy.parser
+++ b/yacy.parser
@ -1,2 +1,2 @@
 #plasmaParser configuration file
-#Wed May 11 17:48:25 CEST 2005
+#Thu May 12 01:40:28 CEST 2005