two patches for performance enhancements of the index handover process from documents to the index cache:

- a single word prototype is generated for each document and re-used whenever a specific word of that document is stored.
- the index cache now uses ByteArray objects instead of byte[] to reference the RWI. This enhances access to the map that stores the cache. To dump the cache to the FS, the content must be sorted, but sorting takes less time than maintaining a sorted map during caching.


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5849 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 06c878ed11
commit 5195c94838

@ -155,9 +155,14 @@ public class IndexTest {
/*
sorted map
time for kelondroMap<byte[]> generation: 1781
time for kelondroMap<byte[]> test: 2452, 0 bugs
memory for kelondroMap<byte[]>: 15 MB
time for TreeMap<byte[]> generation: 3117
time for TreeMap<byte[]> test: 3495, 0 bugs
memory for TreeMap<byte[]>: 29 MB
unsorted map
time for HashMap<ByteArray> generation: 828
time for HashMap<ByteArray> test: 953, 0 bugs
memory for HashMap<ByteArray>: 9 MB
*/

@ -80,7 +80,7 @@ public class IODispatcher <ReferenceType extends Reference> extends Thread {
public synchronized void dump(ReferenceContainerCache<ReferenceType> cache, File file, ReferenceContainerArray<ReferenceType> array) {
if (dumpQueue == null || !this.isAlive()) {
cache.dump(file, true);
cache.dump(file);
} else {
DumpJob job = new DumpJob(cache, file, array);
try {
@ -88,7 +88,7 @@ public class IODispatcher <ReferenceType extends Reference> extends Thread {
controlQueue.put(vita);
} catch (InterruptedException e) {
e.printStackTrace();
cache.dump(file, true);
cache.dump(file);
}
}
}
@ -161,7 +161,7 @@ public class IODispatcher <ReferenceType extends Reference> extends Thread {
}
public void dump() {
try {
cache.dump(file, true);
cache.dump(file);
array.mountBLOBFile(file);
} catch (IOException e) {
e.printStackTrace();

@ -322,7 +322,7 @@ public final class IndexBuffer<ReferenceType extends Reference> extends Abstract
}
public synchronized void close() {
heap.dump(this.dumpFile, true);
heap.dump(this.dumpFile);
heap = null;
hashScore.clear();
hashDate.clear();

@ -247,7 +247,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
* and is composed of the current date and the cell salt
*/
public synchronized void close() {
this.ram.dump(this.array.newContainerBLOBFile(), true);
this.ram.dump(this.array.newContainerBLOBFile());
// close all
this.ram.close();
this.array.close();

@ -30,6 +30,7 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@ -41,6 +42,7 @@ import de.anomic.kelondro.blob.HeapWriter;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.util.ByteArray;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.index.Row;
@ -50,7 +52,8 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
private final Row payloadrow;
private final ByteOrder termOrder;
private SortedMap<byte[], ReferenceContainer<ReferenceType>> cache;
//private SortedMap<byte[], ReferenceContainer<ReferenceType>> cache;
private Map<ByteArray, ReferenceContainer<ReferenceType>> cache;
/**
* opens an existing heap file in undefined mode
@ -84,7 +87,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
* another dump reading afterwards is not possible
*/
public void initWriteMode() {
this.cache = Collections.synchronizedSortedMap(new TreeMap<byte[], ReferenceContainer<ReferenceType>>(this.termOrder));
this.cache = Collections.synchronizedMap(new HashMap<ByteArray, ReferenceContainer<ReferenceType>>());
}
/**
@ -95,14 +98,15 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public void initWriteModeFromBLOB(final File blobFile) throws IOException {
Log.logInfo("indexContainerRAMHeap", "restoring rwi blob dump '" + blobFile.getName() + "'");
final long start = System.currentTimeMillis();
this.cache = Collections.synchronizedSortedMap(new TreeMap<byte[], ReferenceContainer<ReferenceType>>(this.termOrder));
//this.cache = Collections.synchronizedSortedMap(new TreeMap<byte[], ReferenceContainer<ReferenceType>>(this.termOrder));
this.cache = new HashMap<ByteArray, ReferenceContainer<ReferenceType>>();
int urlCount = 0;
synchronized (cache) {
for (final ReferenceContainer<ReferenceType> container : new blobFileEntries<ReferenceType>(blobFile, factory, this.payloadrow)) {
// TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low?
if (container == null) break;
//System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted());
cache.put(container.getTermHash(), container);
cache.put(new ByteArray(container.getTermHash()), container);
urlCount += container.size();
}
}
@ -111,7 +115,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
Log.logInfo("indexContainerRAMHeap", "finished rwi blob restore: " + cache.size() + " words, " + urlCount + " word/URL relations in " + (System.currentTimeMillis() - start) + " milliseconds");
}
public void dump(final File heapFile, boolean writeIDX) {
public void dump(final File heapFile) {
assert this.cache != null;
Log.logInfo("indexContainerRAMHeap", "creating rwi heap dump '" + heapFile.getName() + "', " + cache.size() + " rwi's");
if (heapFile.exists()) FileUtils.deletedelete(heapFile);
@ -124,36 +128,37 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
return;
}
final long startTime = System.currentTimeMillis();
// sort the map
SortedMap<byte[], ReferenceContainer<ReferenceType>> cachecopy = sortedClone();
// write wCache
long wordcount = 0, urlcount = 0;
byte[] wordHash = null, lwh;
ReferenceContainer<ReferenceType> container;
// write wCache
synchronized (cache) {
for (final Map.Entry<byte[], ReferenceContainer<ReferenceType>> entry: cache.entrySet()) {
// get entries
lwh = wordHash;
wordHash = entry.getKey();
container = entry.getValue();
// check consistency: entries must be ordered
assert (lwh == null || this.ordering().compare(wordHash, lwh) > 0);
// put entries on heap
if (container != null && wordHash.length == payloadrow.primaryKeyLength) {
//System.out.println("Dump: " + wordHash);
try {
dump.add(wordHash, container.exportCollection());
} catch (IOException e) {
e.printStackTrace();
}
urlcount += container.size();
for (final Map.Entry<byte[], ReferenceContainer<ReferenceType>> entry: cachecopy.entrySet()) {
// get entries
lwh = wordHash;
wordHash = entry.getKey();
container = entry.getValue();
// check consistency: entries must be ordered
assert (lwh == null || this.ordering().compare(wordHash, lwh) > 0);
// put entries on heap
if (container != null && wordHash.length == payloadrow.primaryKeyLength) {
//System.out.println("Dump: " + wordHash);
try {
dump.add(wordHash, container.exportCollection());
} catch (IOException e) {
e.printStackTrace();
}
wordcount++;
urlcount += container.size();
}
wordcount++;
}
try {
dump.close(writeIDX);
dump.close(true);
Log.logInfo("indexContainerRAMHeap", "finished rwi heap dump: " + wordcount + " words, " + urlcount + " word/URL relations in " + (System.currentTimeMillis() - startTime) + " milliseconds");
} catch (IOException e) {
e.printStackTrace();
@ -163,6 +168,17 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
}
}
/**
 * Takes a consistent snapshot of the unsorted in-memory cache and
 * returns it ordered by the term ordering. The cache itself is kept
 * unsorted (HashMap) for fast access; sorting happens only here,
 * on demand, e.g. before dumping the cache to disk.
 * @return a new TreeMap, ordered by {@code this.termOrder}, mapping the
 *         raw term-hash bytes to their reference containers
 */
public SortedMap<byte[], ReferenceContainer<ReferenceType>> sortedClone() {
    final SortedMap<byte[], ReferenceContainer<ReferenceType>> sortedCopy =
            new TreeMap<byte[], ReferenceContainer<ReferenceType>>(this.termOrder);
    // synchronize to get a consistent view while copying; ByteArray keys
    // are unwrapped back to byte[] for the byte-order-sorted map
    synchronized (this.cache) {
        final Iterator<Map.Entry<ByteArray, ReferenceContainer<ReferenceType>>> i =
                this.cache.entrySet().iterator();
        while (i.hasNext()) {
            final Map.Entry<ByteArray, ReferenceContainer<ReferenceType>> entry = i.next();
            sortedCopy.put(entry.getKey().asBytes(), entry.getValue());
        }
    }
    return sortedCopy;
}
public int size() {
return (this.cache == null) ? 0 : this.cache.size();
}
@ -317,11 +333,14 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
private final boolean rot;
private Iterator<ReferenceContainer<ReferenceType>> iterator;
private byte[] latestTermHash;
public heapCacheIterator(byte[] startWordHash, final boolean rot) {
this.rot = rot;
if (startWordHash != null && startWordHash.length == 0) startWordHash = null;
this.iterator = (startWordHash == null) ? cache.values().iterator() : cache.tailMap(startWordHash).values().iterator();
SortedMap<byte[], ReferenceContainer<ReferenceType>> cachecopy = sortedClone();
this.iterator = (startWordHash == null) ? cachecopy.values().iterator() : cachecopy.tailMap(startWordHash).values().iterator();
this.latestTermHash = null;
// The collection's iterator will return the values in the order that their corresponding keys appear in the tree.
}
@ -336,18 +355,23 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public ReferenceContainer<ReferenceType> next() {
if (iterator.hasNext()) {
return (iterator.next()).topLevelClone();
ReferenceContainer<ReferenceType> c = iterator.next();
this.latestTermHash = c.getTermHash();
return c.topLevelClone();
}
// rotation iteration
if (!rot) {
return null;
}
iterator = cache.values().iterator();
return (iterator.next()).topLevelClone();
ReferenceContainer<ReferenceType> c = iterator.next();
this.latestTermHash = c.getTermHash();
return c.topLevelClone();
}
public void remove() {
iterator.remove();
cache.remove(new ByteArray(this.latestTermHash));
}
public Iterator<ReferenceContainer<ReferenceType>> iterator() {
@ -363,7 +387,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
* @return true, if the key is used in the heap; false othervise
*/
public boolean has(final byte[] key) {
return this.cache.containsKey(key);
return this.cache.containsKey(new ByteArray(key));
}
/**
@ -372,8 +396,8 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
* @return the indexContainer if one exist, null otherwise
*/
public ReferenceContainer<ReferenceType> get(final byte[] key, Set<String> urlselection) {
if (urlselection == null) return this.cache.get(key);
ReferenceContainer<ReferenceType> c = this.cache.get(key);
if (urlselection == null) return this.cache.get(new ByteArray(key));
ReferenceContainer<ReferenceType> c = this.cache.get(new ByteArray(key));
if (c == null) return null;
// because this is all in RAM, we must clone the entries (flat)
ReferenceContainer<ReferenceType> c1 = new ReferenceContainer<ReferenceType>(factory, c.getTermHash(), c.row(), c.size());
@ -392,7 +416,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
* @return
*/
public int count(final byte[] key) {
ReferenceContainer<ReferenceType> c = this.cache.get(key);
ReferenceContainer<ReferenceType> c = this.cache.get(new ByteArray(key));
if (c == null) return 0;
return c.size();
}
@ -405,18 +429,19 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public synchronized ReferenceContainer<ReferenceType> delete(final byte[] termHash) {
// returns the index that had been deleted
assert this.cache != null;
return cache.remove(termHash);
return cache.remove(new ByteArray(termHash));
}
public synchronized boolean remove(final byte[] termHash, final String urlHash) {
assert this.cache != null;
final ReferenceContainer<ReferenceType> c = cache.get(termHash);
ByteArray tha = new ByteArray(termHash);
final ReferenceContainer<ReferenceType> c = cache.get(tha);
if ((c != null) && (c.remove(urlHash) != null)) {
// removal successful
if (c.size() == 0) {
delete(termHash);
} else {
cache.put(termHash, c);
cache.put(tha, c);
}
return true;
}
@ -426,14 +451,15 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public synchronized int remove(final byte[] termHash, final Set<String> urlHashes) {
assert this.cache != null;
if (urlHashes.size() == 0) return 0;
final ReferenceContainer<ReferenceType> c = cache.get(termHash);
ByteArray tha = new ByteArray(termHash);
final ReferenceContainer<ReferenceType> c = cache.get(tha);
int count;
if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) {
// removal successful
if (c.size() == 0) {
delete(termHash);
} else {
cache.put(termHash, c);
cache.put(tha, c);
}
return count;
}
@ -445,8 +471,8 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
if (this.cache == null || container == null || container.size() == 0) return;
// put new words into cache
final byte[] termHash = container.getTermHash();
ReferenceContainer<ReferenceType> entries = cache.get(termHash); // null pointer exception? wordhash != null! must be cache==null
ByteArray tha = new ByteArray(container.getTermHash());
ReferenceContainer<ReferenceType> entries = cache.get(tha); // null pointer exception? wordhash != null! must be cache==null
int added = 0;
if (entries == null) {
entries = container.topLevelClone();
@ -455,7 +481,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
added = entries.putAllRecent(container);
}
if (added > 0) {
cache.put(termHash, entries);
cache.put(tha, entries);
}
entries = null;
return;
@ -463,10 +489,11 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public synchronized void add(final byte[] termHash, final ReferenceType newEntry) {
assert this.cache != null;
ReferenceContainer<ReferenceType> container = cache.get(termHash);
ByteArray tha = new ByteArray(termHash);
ReferenceContainer<ReferenceType> container = cache.get(tha);
if (container == null) container = new ReferenceContainer<ReferenceType>(factory, termHash, this.payloadrow, 1);
container.put(newEntry);
cache.put(termHash, container);
cache.put(tha, container);
}
public int minMem() {

@ -34,6 +34,7 @@ import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.MicroDate;
import de.anomic.kelondro.text.AbstractReference;
import de.anomic.kelondro.text.Reference;
import de.anomic.plasma.parser.Word;
import de.anomic.yacy.yacySeedDB;
public final class WordReferenceRow extends AbstractReference implements WordReference, Cloneable {
@ -147,6 +148,48 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_reserve2, 0);
}
/**
 * Creates a word reference "prototype" that carries only the
 * per-document properties; the word-specific columns (flags, counts,
 * positions) are filled in later via setWord(Word), so a single
 * instance can be completed once per word of the same document.
 */
public WordReferenceRow(final String urlHash,
        final int urlLength,     // byte-length of complete URL
        final int urlComps,      // number of path components
        final int titleLength,   // length of description/length (longer are better?)
        final int wordcount,     // total number of words
        final int phrasecount,   // total number of phrases
        final long lastmodified, // last-modified time of the document where word appears
        final long updatetime,   // update time; needed to compute a TTL for the word, so it can be removed easily if the TTL is short
        final String language,   // (guessed) language of document
        final char doctype,      // type of document
        final int outlinksSame,  // outlinks to same domain
        final int outlinksOther  // outlinks to other domain
        ) {
    assert urlHash.length() == 12 : "urlhash = " + urlHash;
    this.entry = urlEntryRow.newEntry();
    // micro-date (day-granularity) encodings of the two timestamps
    final int modifiedDays = MicroDate.microDateDays(lastmodified);
    final int updatedDays = MicroDate.microDateDays(updatetime);
    this.entry.setCol(col_urlhash, urlHash, null);
    this.entry.setCol(col_lastModified, modifiedDays);
    // TTL: fresh until twice the span between modification and update, never negative
    this.entry.setCol(col_freshUntil, Math.max(0, modifiedDays + (updatedDays - modifiedDays) * 2));
    this.entry.setCol(col_wordsInTitle, titleLength / 6); // rough word-count estimation; TODO: change value handover to number of words
    this.entry.setCol(col_wordsInText, wordcount);
    this.entry.setCol(col_phrasesInText, phrasecount);
    this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
    // fall back to "uk" when the language is missing or has the wrong width
    this.entry.setCol(col_language, (language == null || language.length() != urlEntryRow.width(col_language)) ? "uk" : language, null);
    this.entry.setCol(col_llocal, outlinksSame);
    this.entry.setCol(col_lother, outlinksOther);
    this.entry.setCol(col_urlLength, urlLength);
    this.entry.setCol(col_urlComps, urlComps);
    this.entry.setCol(col_reserve1, 0);
    this.entry.setCol(col_reserve2, 0);
}
/**
 * Fills in the word-specific columns of this reference row. The
 * per-document columns were already set by the prototype constructor,
 * so one prototype instance can be completed per word.
 * @param word carrier of the word statistics (flags, hit count,
 *        positions in text/phrase, phrase number)
 */
public void setWord(final Word word) {
// type-of-word is hard-coded to 0 — presumably "ordinary word"; TODO confirm semantics
this.entry.setCol(col_typeofword, new byte[]{(byte) 0});
this.entry.setCol(col_flags, word.flags.bytes());
this.entry.setCol(col_hitcount, word.count);
this.entry.setCol(col_posintext, word.posInText);
this.entry.setCol(col_posinphrase, word.posInPhrase);
this.entry.setCol(col_posofphrase, word.numOfPhrase);
}
public WordReferenceRow(final String urlHash, final String code) {
// the code is the external form of the row minus the leading urlHash entry
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());

@ -443,30 +443,25 @@ public final class plasmaWordIndex {
final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
Map.Entry<String, Word> wentry;
String word;
WordReferenceRow ientry;
Word wprop;
int len = (document == null) ? urlLength : document.dc_title().length();
WordReferenceRow ientry = new WordReferenceRow(url.hash(),
urlLength, urlComps, len,
condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_SENTENCES,
urlModified.getTime(),
System.currentTimeMillis(),
language,
doctype,
outlinksSame, outlinksOther);
Word wprop;
while (i.hasNext()) {
wentry = i.next();
word = wentry.getKey();
wprop = wentry.getValue();
assert (wprop.flags != null);
ientry = new WordReferenceRow(url.hash(),
urlLength, urlComps, len,
wprop.count,
condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_SENTENCES,
wprop.posInText,
wprop.posInPhrase,
wprop.numOfPhrase,
urlModified.getTime(),
System.currentTimeMillis(),
language,
doctype,
outlinksSame, outlinksOther,
wprop.flags);
ientry.setWord(wprop);
try {
this.index.add(Word.word2hash(word), ientry); // TODO: remove getBytes()
this.index.add(Word.word2hash(word), ientry);
} catch (IOException e) {
e.printStackTrace();
}

Loading…
Cancel
Save