enhanced indexing-caching

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@107 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 0cfe94bb66
commit ea478f3975

@ -81,6 +81,13 @@ Changes take effect immediately</td>
If this is a big number, it shows that the caching works efficiently. If this is a big number, it shows that the caching works efficiently.
</td> </td>
</tr> </tr>
<tr valign="top" class="TableCellDark">
<td class=small>Singletons Cache Size:</td>
<td class=small>#[singletonsSize]#</td>
<td class=small>
The Singletons Cache is a database that holds words that occurred only once.
</td>
</tr>
<tr valign="top" class="TableCellDark"> <tr valign="top" class="TableCellDark">
<td class=small>Maximum number of Word Caches:</td> <td class=small>Maximum number of Word Caches:</td>
<td class=small><input name="wordCacheMax" type="text" size="20" maxlength="100" value="#[wordCacheMax]#"></td> <td class=small><input name="wordCacheMax" type="text" size="20" maxlength="100" value="#[wordCacheMax]#"></td>
@ -90,6 +97,7 @@ Changes take effect immediately</td>
flushed to disc; this may last some minutes. flushed to disc; this may last some minutes.
</td> </td>
</tr> </tr>
<!--
<tr valign="top" class="TableCellDark"> <tr valign="top" class="TableCellDark">
<td class=small>Maximum waitingtime to flush word cache<br>during shut-down (seconds):</td> <td class=small>Maximum waitingtime to flush word cache<br>during shut-down (seconds):</td>
<td class=small><input name="maxWaitingWordFlush" type="text" size="6" maxlength="6" value="#[maxWaitingWordFlush]#"></td> <td class=small><input name="maxWaitingWordFlush" type="text" size="6" maxlength="6" value="#[maxWaitingWordFlush]#"></td>
@ -99,11 +107,13 @@ Changes take effect immediately</td>
time are lost. time are lost.
</td> </td>
</tr> </tr>
-->
<tr valign="top" class="TableCellLight"> <tr valign="top" class="TableCellLight">
<td class=small colspan="3"><input type="submit" name="cacheSizeSubmit" value="Enter New Cache Size"> <td class=small colspan="3"><input type="submit" name="cacheSizeSubmit" value="Enter New Cache Size">
Changes take effect immediately</td> Changes take effect immediately</td>
</td> </td>
</tr> </tr>
</form> </form>
</table> </table>
</p> </p>

@ -146,6 +146,7 @@ public class Performance_p {
prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache()); prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache());
prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180")); prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180"));
prop.put("wordCacheMax", switchboard.getConfig("wordCacheMax", "10000")); prop.put("wordCacheMax", switchboard.getConfig("wordCacheMax", "10000"));
prop.put("singletonsSize", switchboard.wordIndex.singletonsSize());
// return rewrite values for templates // return rewrite values for templates
return prop; return prop;

@ -44,6 +44,7 @@ import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.Iterator; import java.util.Iterator;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.Map;
public class kelondroMScoreCluster { public class kelondroMScoreCluster {
@ -243,21 +244,22 @@ public class kelondroMScoreCluster {
} }
public Iterator scores(boolean up) { public Iterator scores(boolean up) {
return scores(up, Integer.MIN_VALUE, Integer.MAX_VALUE); if (up) return new simpleScoreIterator();
else return scores(false, Integer.MIN_VALUE, Integer.MAX_VALUE);
} }
public Iterator scores(boolean up, int minScore, int maxScore) { public Iterator scores(boolean up, int minScore, int maxScore) {
return new scoreIterator(up, minScore, maxScore); return new komplexScoreIterator(up, minScore, maxScore);
} }
private class scoreIterator implements Iterator { private class komplexScoreIterator implements Iterator {
boolean up; boolean up;
TreeMap keyrefDBcopy; TreeMap keyrefDBcopy;
Object n; Object n;
int min, max; int min, max;
public scoreIterator(boolean up, int minScore, int maxScore) { public komplexScoreIterator(boolean up, int minScore, int maxScore) {
this.up = up; this.up = up;
this.min = minScore; this.min = minScore;
this.max = maxScore; this.max = maxScore;
@ -299,6 +301,30 @@ public class kelondroMScoreCluster {
} }
/**
 * Lightweight iterator that walks the entry set of keyrefDB directly and
 * hands out the value of each entry; it applies no score filtering.
 * Returned by scores(boolean) when 'up' is true.
 */
private class simpleScoreIterator implements Iterator {

    // iterator over the entry set of keyrefDB (obtained directly, no copy is made here)
    private final Iterator entries;

    public simpleScoreIterator() {
        this.entries = keyrefDB.entrySet().iterator();
    }

    public boolean hasNext() {
        return this.entries.hasNext();
    }

    // delivers the value part of the next map entry
    public Object next() {
        final Map.Entry current = (Map.Entry) this.entries.next();
        return current.getValue();
    }

    // removal is delegated to the underlying entry-set iterator
    public void remove() {
        this.entries.remove();
    }
}
public static void main(String[] args) { public static void main(String[] args) {
System.out.println("Test for Score: start"); System.out.println("Test for Score: start");
kelondroMScoreCluster s = new kelondroMScoreCluster(); kelondroMScoreCluster s = new kelondroMScoreCluster();

@ -65,7 +65,7 @@ public class plasmaWordIndex {
public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException { public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException {
this.databaseRoot = databaseRoot; this.databaseRoot = databaseRoot;
plasmaWordIndexClassicDB fileDB = new plasmaWordIndexClassicDB(databaseRoot, log); plasmaWordIndexClassicDB fileDB = new plasmaWordIndexClassicDB(databaseRoot, log);
this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, 1000000, log); this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, bufferkb, log);
} }
public int maxURLinWordCache() { public int maxURLinWordCache() {
@ -76,6 +76,10 @@ public class plasmaWordIndex {
return ramCache.wordCacheRAMSize(); return ramCache.wordCacheRAMSize();
} }
/**
 * Reports the size of the singletons store as seen by the RAM cache.
 *
 * @return the value delivered by ramCache.singletonsSize()
 */
public int singletonsSize() {
    return this.ramCache.singletonsSize();
}
public void setMaxWords(int maxWords) { public void setMaxWords(int maxWords) {
ramCache.setMaxWords(maxWords); ramCache.setMaxWords(maxWords);
} }

@ -82,11 +82,11 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-';
} }
public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, long singletonBufferSize, serverLog log) { public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, int singletonbufferkb, serverLog log) {
// creates a new index cache // creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed // the cache has a back-end where indexes that do not fit in the cache are flushed
this.databaseRoot = databaseRoot; this.databaseRoot = databaseRoot;
this.singletonBufferSize = singletonBufferSize; this.singletonBufferSize = singletonbufferkb * 1024;
this.cache = new TreeMap(); this.cache = new TreeMap();
this.hashScore = new kelondroMScoreCluster(); this.hashScore = new kelondroMScoreCluster();
this.hashDate = new HashMap(); this.hashDate = new HashMap();
@ -132,7 +132,7 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
long wordsPerSecond = 0, wordcount = 0, urlcount = 0; long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
synchronized (cache) { synchronized (cache) {
//Iterator i = cache.entrySet().iterator(); //Iterator i = cache.entrySet().iterator();
Iterator i = hashScore.scores(false); Iterator i = hashScore.scores(true);
//Map.Entry entry; //Map.Entry entry;
String wordHash; String wordHash;
plasmaWordIndexEntryContainer container; plasmaWordIndexEntryContainer container;
@ -318,6 +318,10 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
return cache.size(); return cache.size();
} }
/**
 * Reports the current size of the singletons store.
 *
 * @return the value delivered by singletons.size()
 */
public int singletonsSize() {
    final int singletonCount = singletons.size();
    return singletonCount;
}
public void setMaxWords(int maxWords) { public void setMaxWords(int maxWords) {
this.maxWords = maxWords; this.maxWords = maxWords;
} }
@ -341,7 +345,14 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
true); true);
} }
private int flushFromMem(String key) { private int flushFromMem(String key, boolean reintegrate) {
// this method flushes indexes out from the ram to the disc.
// at first we check the singleton database and act accordingly
// if we are to flush an index, but also find an entry in the singletons, we
// decide based on the 'reintegrate' flag:
// true: do not flush to disc, but re-Integrate the singleton to the RAM
// false: flush the singleton together with container to disc
plasmaWordIndexEntryContainer container = null; plasmaWordIndexEntryContainer container = null;
long time; long time;
synchronized (cache) { synchronized (cache) {
@ -358,12 +369,13 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
// now decide where to flush that container // now decide where to flush that container
Object[] singleton = readSingleton(key); Object[] singleton = readSingleton(key);
if (singleton == null) { if (singleton == null) {
// not found in singletons
if (container.size() == 1) { if (container.size() == 1) {
// store to singleton // it is a singleton: store to singleton
storeSingleton(key, container.getOne(), time); storeSingleton(key, container.getOne(), time);
return 1; return 1;
} else { } else {
// store to back-end // store to back-end; this should be a rare case
return backend.addEntries(container, time); return backend.addEntries(container, time);
} }
} else { } else {
@ -376,20 +388,31 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
// it is superfluous to flush this, simple do nothing // it is superfluous to flush this, simple do nothing
return 0; return 0;
} else { } else {
// we flush to the backend, but remove the entry from the singletons // we flush to the backend, and remove the entry from the singletons
removeSingleton(key); removeSingleton(key);
return backend.addEntries(container, java.lang.Math.max(time, oldTime)); return backend.addEntries(container, java.lang.Math.max(time, oldTime));
} }
} else { } else {
// now we have more than one entry, // now we have more than one entry
// we must remove the key from the singleton database // we must remove the key from the singleton database
removeSingleton(key); removeSingleton(key);
// add this to the backend // .. and put it to the container
container.add(oldEntry); container.add(oldEntry);
if (reintegrate) {
// put singleton together with container back to ram
synchronized (cache) {
cache.put(key, container);
hashScore.setScore(key, container.size());
hashDate.put(key, new Long(time));
}
return -1;
} else {
// add this to the backend
return backend.addEntries(container, java.lang.Math.max(time, oldTime)); return backend.addEntries(container, java.lang.Math.max(time, oldTime));
} }
} }
} }
}
private boolean flushFromSingleton(String key) { private boolean flushFromSingleton(String key) {
Object[] singleton = readSingleton(key); Object[] singleton = readSingleton(key);
@ -441,31 +464,35 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
break; break;
} }
//log.logDebug("flushing high-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size()); //log.logDebug("flushing high-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size());
total += flushFromMem(key); total += flushFromMem(key, false);
} }
// flush singletons // flush singletons
while ((total < 200) && (hashScore.size() >= maxWords)) { Iterator i = hashScore.scores(true);
key = (String) hashScore.getMinObject(); ArrayList al = new ArrayList();
while ((i.hasNext()) && (total < 200)) {
key = (String) i.next();
createTime = (Long) hashDate.get(key); createTime = (Long) hashDate.get(key);
count = hashScore.getScore(key); count = hashScore.getScore(key);
if (count > 1) { if (count > 1) {
//log.logDebug("flush of singleton-key " + key + ": count too high (count=" + count + ")"); //log.logDebug("flush of singleton-key " + key + ": count too high (count=" + count + ")");
break; break;
} }
if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) { if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 90000)) {
//log.logDebug("singleton-key " + key + " is too fresh, interruptiong flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")"); //log.logDebug("singleton-key " + key + " is too fresh, interrupting flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")");
break; continue;
} }
//log.logDebug("flushing singleton-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size()); //log.logDebug("flushing singleton-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size());
total += flushFromMem(key); al.add(key);
total++;
} }
for (int k = 0; k < al.size(); k++) flushFromMem((String) al.get(k), true);
} }
return total; return total;
} }
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) { public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) {
flushFromMem(wordHash); flushFromMem(wordHash, false);
flushFromSingleton(wordHash); flushFromSingleton(wordHash);
return backend.getIndex(wordHash, deleteIfEmpty); return backend.getIndex(wordHash, deleteIfEmpty);
} }
@ -486,13 +513,13 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
backend.deleteIndex(wordHash); backend.deleteIndex(wordHash);
} }
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { public synchronized int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
flushFromMem(wordHash); flushFromMem(wordHash, false);
flushFromSingleton(wordHash); flushFromSingleton(wordHash);
return backend.removeEntries(wordHash, urlHashes, deleteComplete); return backend.removeEntries(wordHash, urlHashes, deleteComplete);
} }
public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { public synchronized int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
flushFromMemToLimit(); flushFromMemToLimit();
//if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries"); //if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries");

@ -101,16 +101,15 @@ public class yacySearch extends Thread {
int c; int c;
while (i.hasNext()) { while (i.hasNext()) {
dhtEnum = yacyCore.dhtAgent.getDHTSeeds(true, (String) i.next()); dhtEnum = yacyCore.dhtAgent.getDHTSeeds(true, (String) i.next());
c = seedcount; c = 0;
while ((dhtEnum.hasMoreElements()) && (c > 0)) { while ((dhtEnum.hasMoreElements()) && (c < seedcount)) {
seed = (yacySeed) dhtEnum.nextElement(); seed = (yacySeed) dhtEnum.nextElement();
ranking.addScore(seed.hash, c); ranking.addScore(seed.hash, c++);
c--;
} }
} }
if (ranking.size() < seedcount) seedcount = ranking.size(); if (ranking.size() < seedcount) seedcount = ranking.size();
yacySeed[] result = new yacySeed[seedcount]; yacySeed[] result = new yacySeed[seedcount];
Iterator e = ranking.scores(false); Iterator e = ranking.scores(true);
c = 0; c = 0;
while ((e.hasNext()) && (c < result.length)) while ((e.hasNext()) && (c < result.length))
result[c++] = yacyCore.seedDB.getConnected((String) e.next()); result[c++] = yacyCore.seedDB.getConnected((String) e.next());

@ -1,2 +1,2 @@
#plasmaParser configuration file #plasmaParser configuration file
#Wed May 11 17:48:25 CEST 2005 #Thu May 12 01:40:28 CEST 2005

Loading…
Cancel
Save