Replaced the old caching method for computed word hashes with a better one. Word hash computation has become the new performance bottleneck (after the IO bottleneck was removed by the IndexCell data structure), so better caching of word hashes was necessary.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5821 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 9e853e1977
commit 538e375901

@ -0,0 +1,96 @@
// SimpleARC.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 17.04.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.index;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * A simple cache that stores its content in two generations of hash tables.
 * New entries start in the first generation (level A); an entry that is read while it
 * is in level A is promoted to the second generation (level B). Each level evicts its
 * eldest entry in insertion order once it grows beyond its capacity, so entries that
 * were requested at least once after insertion survive a flood of never-requested ones.
 * The algorithm is described in a slightly more complex version as Adaptive Replacement
 * Cache, "ARC". This version omits the ghost entry handling which is described in ARC,
 * and keeps both cache levels at the same size.
 *
 * All public methods are synchronized on the cache instance.
 *
 * @param <K> the key type
 * @param <V> the value type
 */
public class SimpleARC <K, V> {

    /** Ordering mode handed to the underlying LinkedHashMaps; if false, insertion order is used. */
    public final static boolean accessOrder = false;

    /** Maximum number of entries per level (half of the size requested in the constructor). */
    private final int cacheSize;
    private final Map<K, V> levelA, levelB;

    /**
     * Create a cache that holds at most <code>cacheSize / 2</code> entries in each of its two levels.
     * A size below 2 yields a per-level capacity of 0, in which case nothing is retained.
     * @param cacheSize the total number of entries over both levels
     */
    public SimpleARC(int cacheSize) {
        this.cacheSize = cacheSize / 2;
        this.levelA = newLevel(cacheSize, this.cacheSize);
        this.levelB = newLevel(cacheSize, this.cacheSize);
    }

    /**
     * Create one cache level: a LinkedHashMap that drops its eldest entry as soon as an
     * insertion makes it larger than <code>maxEntries</code>.
     */
    private static <X, Y> Map<X, Y> newLevel(final int initialCapacity, final int maxEntries) {
        return new LinkedHashMap<X, Y>(initialCapacity, 0.1f, accessOrder) {
            private static final long serialVersionUID = 1L;
            @Override protected boolean removeEldestEntry(final Map.Entry<X, Y> eldest) {
                return size() > maxEntries;
            }
        };
    }

    /**
     * put a value to the cache.
     * The key may already be present: callers typically do an unsynchronized
     * get - compute - put sequence, so two threads can put the same key one after the other.
     * An existing entry is overwritten in the level where it resides, which guarantees that
     * a key is never stored in both levels at the same time.
     * @param s the key
     * @param v the value
     */
    public synchronized void put(K s, V v) {
        if (this.levelB.containsKey(s)) {
            // already promoted: replace in place, the size of level B does not change
            this.levelB.put(s, v);
            assert (this.levelB.size() <= cacheSize);
        } else {
            // new key, or a key that is still in level A
            this.levelA.put(s, v);
            assert (this.levelA.size() <= cacheSize); // the cache should shrink automatically
        }
    }

    /**
     * get a value from the cache.
     * A hit in level A moves the entry to level B.
     * @param s the key
     * @return the value, or null if the key is not cached (or was cached with a null value)
     */
    public synchronized V get(K s) {
        V v = this.levelB.get(s);
        if (v != null) return v;
        v = this.levelA.remove(s);
        if (v == null) return null;
        // move value from A to B; since it was already removed from A, just put it to B
        this.levelB.put(s, v);
        assert (this.levelB.size() <= cacheSize); // the cache should shrink automatically
        return v;
    }

    /**
     * clear the cache
     */
    public synchronized void clear() {
        this.levelA.clear();
        this.levelB.clear();
    }
}

@ -31,8 +31,8 @@ import java.util.Iterator;
import java.util.Locale; import java.util.Locale;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.kelondro.index.SimpleARC;
import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.Digest; import de.anomic.kelondro.order.Digest;
@ -40,7 +40,8 @@ import de.anomic.yacy.yacySeedDB;
public class Word { public class Word {
private static final ConcurrentHashMap<String, byte[]> hashCache = new ConcurrentHashMap<String, byte[]>(1000); public static final int hashCacheSize = 20000;
private static final SimpleARC<String, byte[]> hashCache = new SimpleARC<String, byte[]>(hashCacheSize);
// object carries statistics for words and sentences // object carries statistics for words and sentences
public int count; // number of occurrences public int count; // number of occurrences
@ -84,10 +85,6 @@ public class Word {
if (h != null) return h; if (h != null) return h;
h = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength).getBytes(); h = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength).getBytes();
hashCache.put(word, h); // prevent expensive MD5 computation and encoding hashCache.put(word, h); // prevent expensive MD5 computation and encoding
if (hashCache.size() > 20000) {
// prevent memory leak
hashCache.clear();
}
return h; return h;
} }

@ -442,13 +442,14 @@ public final class plasmaWordIndex {
String word; String word;
WordReferenceRow ientry; WordReferenceRow ientry;
Word wprop; Word wprop;
int len = (document == null) ? urlLength : document.dc_title().length();
while (i.hasNext()) { while (i.hasNext()) {
wentry = i.next(); wentry = i.next();
word = wentry.getKey(); word = wentry.getKey();
wprop = wentry.getValue(); wprop = wentry.getValue();
assert (wprop.flags != null); assert (wprop.flags != null);
ientry = new WordReferenceRow(url.hash(), ientry = new WordReferenceRow(url.hash(),
urlLength, urlComps, (document == null) ? urlLength : document.dc_title().length(), urlLength, urlComps, len,
wprop.count, wprop.count,
condenser.RESULT_NUMB_WORDS, condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_SENTENCES, condenser.RESULT_NUMB_SENTENCES,

Loading…
Cancel
Save