enhanced indexing-caching

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@107 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 0cfe94bb66
commit ea478f3975

@ -81,6 +81,13 @@ Changes take effect immediately</td>
If this is a big number, it shows that the caching works efficiently.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Singletons Cache Size:</td>
<td class=small>#[singletonsSize]#</td>
<td class=small>
The Singletons Cache is a database that holds words that occurred only once.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Maximum number of Word Caches:</td>
<td class=small><input name="wordCacheMax" type="text" size="20" maxlength="100" value="#[wordCacheMax]#"></td>
@ -90,6 +97,7 @@ Changes take effect immediately</td>
flushed to disc; this may last some minutes.
</td>
</tr>
<!--
<tr valign="top" class="TableCellDark">
<td class=small>Maximum waitingtime to flush word cache<br>during shut-down (seconds):</td>
<td class=small><input name="maxWaitingWordFlush" type="text" size="6" maxlength="6" value="#[maxWaitingWordFlush]#"></td>
@ -99,11 +107,13 @@ Changes take effect immediately</td>
time are lost.
</td>
</tr>
-->
<tr valign="top" class="TableCellLight">
<td class=small colspan="3"><input type="submit" name="cacheSizeSubmit" value="Enter New Cache Size">
Changes take effect immediately</td>
</tr>
</form>
</table>
</p>

@ -146,6 +146,7 @@ public class Performance_p {
prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache());
prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180"));
prop.put("wordCacheMax", switchboard.getConfig("wordCacheMax", "10000"));
prop.put("singletonsSize", switchboard.wordIndex.singletonsSize());
// return rewrite values for templates
return prop;

@ -44,6 +44,7 @@ import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Map;
public class kelondroMScoreCluster {
@ -243,21 +244,22 @@ public class kelondroMScoreCluster {
}
public Iterator scores(boolean up) {
return scores(up, Integer.MIN_VALUE, Integer.MAX_VALUE);
if (up) return new simpleScoreIterator();
else return scores(false, Integer.MIN_VALUE, Integer.MAX_VALUE);
}
public Iterator scores(boolean up, int minScore, int maxScore) {
return new scoreIterator(up, minScore, maxScore);
return new komplexScoreIterator(up, minScore, maxScore);
}
private class scoreIterator implements Iterator {
private class komplexScoreIterator implements Iterator {
boolean up;
TreeMap keyrefDBcopy;
Object n;
int min, max;
public scoreIterator(boolean up, int minScore, int maxScore) {
public komplexScoreIterator(boolean up, int minScore, int maxScore) {
this.up = up;
this.min = minScore;
this.max = maxScore;
@ -299,6 +301,30 @@ public class kelondroMScoreCluster {
}
private class simpleScoreIterator implements Iterator {

    // iterator over the entry set of the enclosing cluster's keyrefDB map
    Iterator entries;
    // the entry most recently handed out by next()
    Map.Entry current;

    // starts a fresh pass over keyrefDB.entrySet()
    public simpleScoreIterator() {
        this.entries = keyrefDB.entrySet().iterator();
    }

    // true as long as the underlying entry iterator has more entries
    public boolean hasNext() {
        return this.entries.hasNext();
    }

    // advances to the next map entry and returns its value (not its key)
    public Object next() {
        this.current = (Map.Entry) this.entries.next();
        return this.current.getValue();
    }

    // delegates removal of the last returned entry to the underlying iterator
    public void remove() {
        this.entries.remove();
    }
}
public static void main(String[] args) {
System.out.println("Test for Score: start");
kelondroMScoreCluster s = new kelondroMScoreCluster();

@ -65,7 +65,7 @@ public class plasmaWordIndex {
public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException {
this.databaseRoot = databaseRoot;
plasmaWordIndexClassicDB fileDB = new plasmaWordIndexClassicDB(databaseRoot, log);
this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, 1000000, log);
this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, bufferkb, log);
}
public int maxURLinWordCache() {
@ -76,6 +76,10 @@ public class plasmaWordIndex {
return ramCache.wordCacheRAMSize();
}
// returns the number of singleton entries as reported by the RAM cache
// (pure delegate to ramCache.singletonsSize())
public int singletonsSize() {
return ramCache.singletonsSize();
}
// forwards the new word limit to the RAM cache (ramCache.setMaxWords)
public void setMaxWords(int maxWords) {
ramCache.setMaxWords(maxWords);
}

@ -82,11 +82,11 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-';
}
public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, long singletonBufferSize, serverLog log) {
public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, int singletonbufferkb, serverLog log) {
// creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed
this.databaseRoot = databaseRoot;
this.singletonBufferSize = singletonBufferSize;
this.singletonBufferSize = singletonbufferkb * 1024;
this.cache = new TreeMap();
this.hashScore = new kelondroMScoreCluster();
this.hashDate = new HashMap();
@ -132,7 +132,7 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
synchronized (cache) {
//Iterator i = cache.entrySet().iterator();
Iterator i = hashScore.scores(false);
Iterator i = hashScore.scores(true);
//Map.Entry entry;
String wordHash;
plasmaWordIndexEntryContainer container;
@ -318,6 +318,10 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
return cache.size();
}
// returns the current size of the singletons store (singletons.size())
public int singletonsSize() {
return singletons.size();
}
// stores the given limit in this.maxWords; nothing is flushed here
public void setMaxWords(int maxWords) {
this.maxWords = maxWords;
}
@ -341,7 +345,14 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
true);
}
private int flushFromMem(String key) {
private int flushFromMem(String key, boolean reintegrate) {
// this method flushes indexes out from the ram to the disc.
// at first we check the singleton database and act accordingly
// if we are to flush an index, but also see an entry in the singletons, we
// decide upon the 'reintegrate'-Flag:
// true: do not flush to disc, but re-Integrate the singleton to the RAM
// false: flush the singleton together with container to disc
plasmaWordIndexEntryContainer container = null;
long time;
synchronized (cache) {
@ -358,12 +369,13 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
// now decide where to flush that container
Object[] singleton = readSingleton(key);
if (singleton == null) {
// not found in singletons
if (container.size() == 1) {
// store to singleton
// it is a singleton: store to singleton
storeSingleton(key, container.getOne(), time);
return 1;
} else {
// store to back-end
// store to back-end; this should be a rare case
return backend.addEntries(container, time);
}
} else {
@ -376,20 +388,31 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
// it is superfluous to flush this, simply do nothing
return 0;
} else {
// we flush to the backend, but remove the entry from the singletons
// we flush to the backend, and remove the entry from the singletons
removeSingleton(key);
return backend.addEntries(container, java.lang.Math.max(time, oldTime));
}
} else {
// now we have more than one entry,
// now we have more than one entry
// we must remove the key from the singleton database
removeSingleton(key);
// add this to the backend
// .. and put it to the container
container.add(oldEntry);
if (reintegrate) {
// put singleton together with container back to ram
synchronized (cache) {
cache.put(key, container);
hashScore.setScore(key, container.size());
hashDate.put(key, new Long(time));
}
return -1;
} else {
// add this to the backend
return backend.addEntries(container, java.lang.Math.max(time, oldTime));
}
}
}
}
private boolean flushFromSingleton(String key) {
Object[] singleton = readSingleton(key);
@ -441,31 +464,35 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
break;
}
//log.logDebug("flushing high-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size());
total += flushFromMem(key);
total += flushFromMem(key, false);
}
// flush singletons
while ((total < 200) && (hashScore.size() >= maxWords)) {
key = (String) hashScore.getMinObject();
Iterator i = hashScore.scores(true);
ArrayList al = new ArrayList();
while ((i.hasNext()) && (total < 200)) {
key = (String) i.next();
createTime = (Long) hashDate.get(key);
count = hashScore.getScore(key);
if (count > 1) {
//log.logDebug("flush of singleton-key " + key + ": count too high (count=" + count + ")");
break;
}
if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) {
//log.logDebug("singleton-key " + key + " is too fresh, interruptiong flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")");
break;
if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 90000)) {
//log.logDebug("singleton-key " + key + " is too fresh, interrupting flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")");
continue;
}
//log.logDebug("flushing singleton-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size());
total += flushFromMem(key);
al.add(key);
total++;
}
for (int k = 0; k < al.size(); k++) flushFromMem((String) al.get(k), true);
}
return total;
}
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) {
flushFromMem(wordHash);
flushFromMem(wordHash, false);
flushFromSingleton(wordHash);
return backend.getIndex(wordHash, deleteIfEmpty);
}
@ -486,13 +513,13 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface {
backend.deleteIndex(wordHash);
}
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
flushFromMem(wordHash);
public synchronized int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
flushFromMem(wordHash, false);
flushFromSingleton(wordHash);
return backend.removeEntries(wordHash, urlHashes, deleteComplete);
}
public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
public synchronized int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
flushFromMemToLimit();
//if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries");

@ -101,16 +101,15 @@ public class yacySearch extends Thread {
int c;
while (i.hasNext()) {
dhtEnum = yacyCore.dhtAgent.getDHTSeeds(true, (String) i.next());
c = seedcount;
while ((dhtEnum.hasMoreElements()) && (c > 0)) {
c = 0;
while ((dhtEnum.hasMoreElements()) && (c < seedcount)) {
seed = (yacySeed) dhtEnum.nextElement();
ranking.addScore(seed.hash, c);
c--;
ranking.addScore(seed.hash, c++);
}
}
if (ranking.size() < seedcount) seedcount = ranking.size();
yacySeed[] result = new yacySeed[seedcount];
Iterator e = ranking.scores(false);
Iterator e = ranking.scores(true);
c = 0;
while ((e.hasNext()) && (c < result.length))
result[c++] = yacyCore.seedDB.getConnected((String) e.next());

@ -1,2 +1,2 @@
#plasmaParser configuration file
#Wed May 11 17:48:25 CEST 2005
#Thu May 12 01:40:28 CEST 2005

Loading…
Cancel
Save