two patches for performance enhancements of the index handover process from documents to the index cache:

- one word prototype is generated for each document and is re-used each time a specific word is stored.
- the index cache now uses ByteArray objects instead of byte[] to reference the RWI. This speeds up access to the map that stores the cache. To dump the cache to the FS, the content must be sorted, but sorting once takes less time than maintaining a sorted map during caching.


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5849 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 06c878ed11
commit 5195c94838

@ -155,9 +155,14 @@ public class IndexTest {
/* /*
sorted map
time for TreeMap<byte[]> generation: 3117 time for kelondroMap<byte[]> generation: 1781
time for TreeMap<byte[]> test: 3495, 0 bugs time for kelondroMap<byte[]> test: 2452, 0 bugs
memory for TreeMap<byte[]>: 29 MB memory for kelondroMap<byte[]>: 15 MB
unsorted map
time for HashMap<ByteArray> generation: 828
time for HashMap<ByteArray> test: 953, 0 bugs
memory for HashMap<ByteArray>: 9 MB
*/ */

@ -80,7 +80,7 @@ public class IODispatcher <ReferenceType extends Reference> extends Thread {
public synchronized void dump(ReferenceContainerCache<ReferenceType> cache, File file, ReferenceContainerArray<ReferenceType> array) { public synchronized void dump(ReferenceContainerCache<ReferenceType> cache, File file, ReferenceContainerArray<ReferenceType> array) {
if (dumpQueue == null || !this.isAlive()) { if (dumpQueue == null || !this.isAlive()) {
cache.dump(file, true); cache.dump(file);
} else { } else {
DumpJob job = new DumpJob(cache, file, array); DumpJob job = new DumpJob(cache, file, array);
try { try {
@ -88,7 +88,7 @@ public class IODispatcher <ReferenceType extends Reference> extends Thread {
controlQueue.put(vita); controlQueue.put(vita);
} catch (InterruptedException e) { } catch (InterruptedException e) {
e.printStackTrace(); e.printStackTrace();
cache.dump(file, true); cache.dump(file);
} }
} }
} }
@ -161,7 +161,7 @@ public class IODispatcher <ReferenceType extends Reference> extends Thread {
} }
public void dump() { public void dump() {
try { try {
cache.dump(file, true); cache.dump(file);
array.mountBLOBFile(file); array.mountBLOBFile(file);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();

@ -322,7 +322,7 @@ public final class IndexBuffer<ReferenceType extends Reference> extends Abstract
} }
public synchronized void close() { public synchronized void close() {
heap.dump(this.dumpFile, true); heap.dump(this.dumpFile);
heap = null; heap = null;
hashScore.clear(); hashScore.clear();
hashDate.clear(); hashDate.clear();

@ -247,7 +247,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
* and is composed of the current date and the cell salt * and is composed of the current date and the cell salt
*/ */
public synchronized void close() { public synchronized void close() {
this.ram.dump(this.array.newContainerBLOBFile(), true); this.ram.dump(this.array.newContainerBLOBFile());
// close all // close all
this.ram.close(); this.ram.close();
this.array.close(); this.array.close();

@ -30,6 +30,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
@ -41,6 +42,7 @@ import de.anomic.kelondro.blob.HeapWriter;
import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.util.ByteArray;
import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.Row;
@ -50,7 +52,8 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
private final Row payloadrow; private final Row payloadrow;
private final ByteOrder termOrder; private final ByteOrder termOrder;
private SortedMap<byte[], ReferenceContainer<ReferenceType>> cache; //private SortedMap<byte[], ReferenceContainer<ReferenceType>> cache;
private Map<ByteArray, ReferenceContainer<ReferenceType>> cache;
/** /**
* opens an existing heap file in undefined mode * opens an existing heap file in undefined mode
@ -84,7 +87,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
* another dump reading afterwards is not possible * another dump reading afterwards is not possible
*/ */
public void initWriteMode() { public void initWriteMode() {
this.cache = Collections.synchronizedSortedMap(new TreeMap<byte[], ReferenceContainer<ReferenceType>>(this.termOrder)); this.cache = Collections.synchronizedMap(new HashMap<ByteArray, ReferenceContainer<ReferenceType>>());
} }
/** /**
@ -95,14 +98,15 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public void initWriteModeFromBLOB(final File blobFile) throws IOException { public void initWriteModeFromBLOB(final File blobFile) throws IOException {
Log.logInfo("indexContainerRAMHeap", "restoring rwi blob dump '" + blobFile.getName() + "'"); Log.logInfo("indexContainerRAMHeap", "restoring rwi blob dump '" + blobFile.getName() + "'");
final long start = System.currentTimeMillis(); final long start = System.currentTimeMillis();
this.cache = Collections.synchronizedSortedMap(new TreeMap<byte[], ReferenceContainer<ReferenceType>>(this.termOrder)); //this.cache = Collections.synchronizedSortedMap(new TreeMap<byte[], ReferenceContainer<ReferenceType>>(this.termOrder));
this.cache = new HashMap<ByteArray, ReferenceContainer<ReferenceType>>();
int urlCount = 0; int urlCount = 0;
synchronized (cache) { synchronized (cache) {
for (final ReferenceContainer<ReferenceType> container : new blobFileEntries<ReferenceType>(blobFile, factory, this.payloadrow)) { for (final ReferenceContainer<ReferenceType> container : new blobFileEntries<ReferenceType>(blobFile, factory, this.payloadrow)) {
// TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low? // TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low?
if (container == null) break; if (container == null) break;
//System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted()); //System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted());
cache.put(container.getTermHash(), container); cache.put(new ByteArray(container.getTermHash()), container);
urlCount += container.size(); urlCount += container.size();
} }
} }
@ -111,7 +115,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
Log.logInfo("indexContainerRAMHeap", "finished rwi blob restore: " + cache.size() + " words, " + urlCount + " word/URL relations in " + (System.currentTimeMillis() - start) + " milliseconds"); Log.logInfo("indexContainerRAMHeap", "finished rwi blob restore: " + cache.size() + " words, " + urlCount + " word/URL relations in " + (System.currentTimeMillis() - start) + " milliseconds");
} }
public void dump(final File heapFile, boolean writeIDX) { public void dump(final File heapFile) {
assert this.cache != null; assert this.cache != null;
Log.logInfo("indexContainerRAMHeap", "creating rwi heap dump '" + heapFile.getName() + "', " + cache.size() + " rwi's"); Log.logInfo("indexContainerRAMHeap", "creating rwi heap dump '" + heapFile.getName() + "', " + cache.size() + " rwi's");
if (heapFile.exists()) FileUtils.deletedelete(heapFile); if (heapFile.exists()) FileUtils.deletedelete(heapFile);
@ -124,13 +128,15 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
return; return;
} }
final long startTime = System.currentTimeMillis(); final long startTime = System.currentTimeMillis();
// sort the map
SortedMap<byte[], ReferenceContainer<ReferenceType>> cachecopy = sortedClone();
// write wCache
long wordcount = 0, urlcount = 0; long wordcount = 0, urlcount = 0;
byte[] wordHash = null, lwh; byte[] wordHash = null, lwh;
ReferenceContainer<ReferenceType> container; ReferenceContainer<ReferenceType> container;
for (final Map.Entry<byte[], ReferenceContainer<ReferenceType>> entry: cachecopy.entrySet()) {
// write wCache
synchronized (cache) {
for (final Map.Entry<byte[], ReferenceContainer<ReferenceType>> entry: cache.entrySet()) {
// get entries // get entries
lwh = wordHash; lwh = wordHash;
wordHash = entry.getKey(); wordHash = entry.getKey();
@ -151,9 +157,8 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
} }
wordcount++; wordcount++;
} }
}
try { try {
dump.close(writeIDX); dump.close(true);
Log.logInfo("indexContainerRAMHeap", "finished rwi heap dump: " + wordcount + " words, " + urlcount + " word/URL relations in " + (System.currentTimeMillis() - startTime) + " milliseconds"); Log.logInfo("indexContainerRAMHeap", "finished rwi heap dump: " + wordcount + " words, " + urlcount + " word/URL relations in " + (System.currentTimeMillis() - startTime) + " milliseconds");
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
@ -163,6 +168,17 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
} }
} }
/**
 * Builds a sorted snapshot of the (unsorted) cache map.
 * Every entry of <code>cache</code> is copied into a new <code>TreeMap</code>
 * that is ordered by <code>termOrder</code> and keyed by the <code>byte[]</code>
 * obtained from each <code>ByteArray</code> key via <code>asBytes()</code>.
 * Only the map structure is copied: the values put into the snapshot are the
 * same container objects that the cache holds.
 * @return a new sorted map with all entries that were in the cache at the time of the call
 */
public SortedMap<byte[], ReferenceContainer<ReferenceType>> sortedClone() {
SortedMap<byte[], ReferenceContainer<ReferenceType>> cachecopy;
// the monitor of the cache object is held for the whole copy loop
synchronized (cache) {
cachecopy = new TreeMap<byte[], ReferenceContainer<ReferenceType>>(this.termOrder);
for (final Map.Entry<ByteArray, ReferenceContainer<ReferenceType>> entry: cache.entrySet()) {
// NOTE(review): presumably asBytes() hands out the wrapped array without copying - confirm in ByteArray
cachecopy.put(entry.getKey().asBytes(), entry.getValue());
}
}
return cachecopy;
}
public int size() { public int size() {
return (this.cache == null) ? 0 : this.cache.size(); return (this.cache == null) ? 0 : this.cache.size();
} }
@ -317,11 +333,14 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
private final boolean rot; private final boolean rot;
private Iterator<ReferenceContainer<ReferenceType>> iterator; private Iterator<ReferenceContainer<ReferenceType>> iterator;
private byte[] latestTermHash;
public heapCacheIterator(byte[] startWordHash, final boolean rot) { public heapCacheIterator(byte[] startWordHash, final boolean rot) {
this.rot = rot; this.rot = rot;
if (startWordHash != null && startWordHash.length == 0) startWordHash = null; if (startWordHash != null && startWordHash.length == 0) startWordHash = null;
this.iterator = (startWordHash == null) ? cache.values().iterator() : cache.tailMap(startWordHash).values().iterator(); SortedMap<byte[], ReferenceContainer<ReferenceType>> cachecopy = sortedClone();
this.iterator = (startWordHash == null) ? cachecopy.values().iterator() : cachecopy.tailMap(startWordHash).values().iterator();
this.latestTermHash = null;
// The collection's iterator will return the values in the order that their corresponding keys appear in the tree. // The collection's iterator will return the values in the order that their corresponding keys appear in the tree.
} }
@ -336,18 +355,23 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public ReferenceContainer<ReferenceType> next() { public ReferenceContainer<ReferenceType> next() {
if (iterator.hasNext()) { if (iterator.hasNext()) {
return (iterator.next()).topLevelClone(); ReferenceContainer<ReferenceType> c = iterator.next();
this.latestTermHash = c.getTermHash();
return c.topLevelClone();
} }
// rotation iteration // rotation iteration
if (!rot) { if (!rot) {
return null; return null;
} }
iterator = cache.values().iterator(); iterator = cache.values().iterator();
return (iterator.next()).topLevelClone(); ReferenceContainer<ReferenceType> c = iterator.next();
this.latestTermHash = c.getTermHash();
return c.topLevelClone();
} }
public void remove() { public void remove() {
iterator.remove(); iterator.remove();
cache.remove(new ByteArray(this.latestTermHash));
} }
public Iterator<ReferenceContainer<ReferenceType>> iterator() { public Iterator<ReferenceContainer<ReferenceType>> iterator() {
@ -363,7 +387,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
* @return true, if the key is used in the heap; false othervise * @return true, if the key is used in the heap; false othervise
*/ */
public boolean has(final byte[] key) { public boolean has(final byte[] key) {
return this.cache.containsKey(key); return this.cache.containsKey(new ByteArray(key));
} }
/** /**
@ -372,8 +396,8 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
* @return the indexContainer if one exist, null otherwise * @return the indexContainer if one exist, null otherwise
*/ */
public ReferenceContainer<ReferenceType> get(final byte[] key, Set<String> urlselection) { public ReferenceContainer<ReferenceType> get(final byte[] key, Set<String> urlselection) {
if (urlselection == null) return this.cache.get(key); if (urlselection == null) return this.cache.get(new ByteArray(key));
ReferenceContainer<ReferenceType> c = this.cache.get(key); ReferenceContainer<ReferenceType> c = this.cache.get(new ByteArray(key));
if (c == null) return null; if (c == null) return null;
// because this is all in RAM, we must clone the entries (flat) // because this is all in RAM, we must clone the entries (flat)
ReferenceContainer<ReferenceType> c1 = new ReferenceContainer<ReferenceType>(factory, c.getTermHash(), c.row(), c.size()); ReferenceContainer<ReferenceType> c1 = new ReferenceContainer<ReferenceType>(factory, c.getTermHash(), c.row(), c.size());
@ -392,7 +416,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
* @return * @return
*/ */
public int count(final byte[] key) { public int count(final byte[] key) {
ReferenceContainer<ReferenceType> c = this.cache.get(key); ReferenceContainer<ReferenceType> c = this.cache.get(new ByteArray(key));
if (c == null) return 0; if (c == null) return 0;
return c.size(); return c.size();
} }
@ -405,18 +429,19 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public synchronized ReferenceContainer<ReferenceType> delete(final byte[] termHash) { public synchronized ReferenceContainer<ReferenceType> delete(final byte[] termHash) {
// returns the index that had been deleted // returns the index that had been deleted
assert this.cache != null; assert this.cache != null;
return cache.remove(termHash); return cache.remove(new ByteArray(termHash));
} }
public synchronized boolean remove(final byte[] termHash, final String urlHash) { public synchronized boolean remove(final byte[] termHash, final String urlHash) {
assert this.cache != null; assert this.cache != null;
final ReferenceContainer<ReferenceType> c = cache.get(termHash); ByteArray tha = new ByteArray(termHash);
final ReferenceContainer<ReferenceType> c = cache.get(tha);
if ((c != null) && (c.remove(urlHash) != null)) { if ((c != null) && (c.remove(urlHash) != null)) {
// removal successful // removal successful
if (c.size() == 0) { if (c.size() == 0) {
delete(termHash); delete(termHash);
} else { } else {
cache.put(termHash, c); cache.put(tha, c);
} }
return true; return true;
} }
@ -426,14 +451,15 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public synchronized int remove(final byte[] termHash, final Set<String> urlHashes) { public synchronized int remove(final byte[] termHash, final Set<String> urlHashes) {
assert this.cache != null; assert this.cache != null;
if (urlHashes.size() == 0) return 0; if (urlHashes.size() == 0) return 0;
final ReferenceContainer<ReferenceType> c = cache.get(termHash); ByteArray tha = new ByteArray(termHash);
final ReferenceContainer<ReferenceType> c = cache.get(tha);
int count; int count;
if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) { if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) {
// removal successful // removal successful
if (c.size() == 0) { if (c.size() == 0) {
delete(termHash); delete(termHash);
} else { } else {
cache.put(termHash, c); cache.put(tha, c);
} }
return count; return count;
} }
@ -445,8 +471,8 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
if (this.cache == null || container == null || container.size() == 0) return; if (this.cache == null || container == null || container.size() == 0) return;
// put new words into cache // put new words into cache
final byte[] termHash = container.getTermHash(); ByteArray tha = new ByteArray(container.getTermHash());
ReferenceContainer<ReferenceType> entries = cache.get(termHash); // null pointer exception? wordhash != null! must be cache==null ReferenceContainer<ReferenceType> entries = cache.get(tha); // null pointer exception? wordhash != null! must be cache==null
int added = 0; int added = 0;
if (entries == null) { if (entries == null) {
entries = container.topLevelClone(); entries = container.topLevelClone();
@ -455,7 +481,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
added = entries.putAllRecent(container); added = entries.putAllRecent(container);
} }
if (added > 0) { if (added > 0) {
cache.put(termHash, entries); cache.put(tha, entries);
} }
entries = null; entries = null;
return; return;
@ -463,10 +489,11 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
public synchronized void add(final byte[] termHash, final ReferenceType newEntry) { public synchronized void add(final byte[] termHash, final ReferenceType newEntry) {
assert this.cache != null; assert this.cache != null;
ReferenceContainer<ReferenceType> container = cache.get(termHash); ByteArray tha = new ByteArray(termHash);
ReferenceContainer<ReferenceType> container = cache.get(tha);
if (container == null) container = new ReferenceContainer<ReferenceType>(factory, termHash, this.payloadrow, 1); if (container == null) container = new ReferenceContainer<ReferenceType>(factory, termHash, this.payloadrow, 1);
container.put(newEntry); container.put(newEntry);
cache.put(termHash, container); cache.put(tha, container);
} }
public int minMem() { public int minMem() {

@ -34,6 +34,7 @@ import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.MicroDate; import de.anomic.kelondro.order.MicroDate;
import de.anomic.kelondro.text.AbstractReference; import de.anomic.kelondro.text.AbstractReference;
import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.Reference;
import de.anomic.plasma.parser.Word;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
public final class WordReferenceRow extends AbstractReference implements WordReference, Cloneable { public final class WordReferenceRow extends AbstractReference implements WordReference, Cloneable {
@ -147,6 +148,48 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_reserve2, 0); this.entry.setCol(col_reserve2, 0);
} }
/**
 * Creates a row that carries only the document-level columns.
 * The columns written here are: url hash, last-modified, fresh-until, words in title,
 * words in text, phrases in text, doctype, language, local/other outlinks, url length,
 * url components and the two reserve columns.
 * The word-specific columns (type of word, flags, hit count, position in text,
 * position in phrase, position of phrase) are NOT written by this constructor;
 * they are filled in by {@link #setWord(Word)}.
 * @param urlHash hash of the URL; asserted to be exactly 12 characters long
 * @param language language code; replaced by "uk" if it is null or its length
 *        does not match the width of the language column
 */
public WordReferenceRow(final String urlHash,
final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components
final int titleLength, // length of description/length (longer are better?)
final int wordcount, // total number of words
final int phrasecount, // total number of phrases
final long lastmodified, // last-modified time of the document where word appears
final long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
final String language, // (guessed) language of document
final char doctype, // type of document
final int outlinksSame, // outlinks to same domain
final int outlinksOther // outlinks to other domain
) {
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
this.entry = urlEntryRow.newEntry();
// both time stamps are converted with MicroDate.microDateDays before they are stored or compared
final int mddlm = MicroDate.microDateDays(lastmodified);
final int mddct = MicroDate.microDateDays(updatetime);
this.entry.setCol(col_urlhash, urlHash, null);
this.entry.setCol(col_lastModified, mddlm);
// fresh-until = last-modified plus twice the distance between update time and last-modified, never below 0
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
this.entry.setCol(col_wordsInText, wordcount);
this.entry.setCol(col_phrasesInText, phrasecount);
// the doctype char is narrowed to a single byte
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
// fall back to "uk" when no language is given or its length does not fit the column width
this.entry.setCol(col_language, ((language == null) || (language.length() != urlEntryRow.width(col_language))) ? "uk" : language, null);
this.entry.setCol(col_llocal, outlinksSame);
this.entry.setCol(col_lother, outlinksOther);
this.entry.setCol(col_urlLength, urlLength);
this.entry.setCol(col_urlComps, urlComps);
this.entry.setCol(col_reserve1, 0);
this.entry.setCol(col_reserve2, 0);
}
/**
 * Writes the word-specific columns of this row from the given word.
 * The type-of-word column is always set to 0; flags, hit count, position in text,
 * position in phrase and position of phrase are taken from the fields of <code>word</code>.
 * The row is modified in place, all other columns keep their current values.
 * NOTE(review): when one row instance is re-used for several words, the receiver of the
 * row must copy its content before the next setWord call - confirm that index.add does so.
 * @param word source of the word-specific values; its flags must not be null
 */
public void setWord(final Word word) {
this.entry.setCol(col_typeofword, new byte[]{(byte) 0});
this.entry.setCol(col_flags, word.flags.bytes());
this.entry.setCol(col_hitcount, word.count);
this.entry.setCol(col_posintext, word.posInText);
this.entry.setCol(col_posinphrase, word.posInPhrase);
this.entry.setCol(col_posofphrase, word.numOfPhrase);
}
public WordReferenceRow(final String urlHash, final String code) { public WordReferenceRow(final String urlHash, final String code) {
// the code is the external form of the row minus the leading urlHash entry // the code is the external form of the row minus the leading urlHash entry
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes()); this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());

@ -443,30 +443,25 @@ public final class plasmaWordIndex {
final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator(); final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
Map.Entry<String, Word> wentry; Map.Entry<String, Word> wentry;
String word; String word;
WordReferenceRow ientry;
Word wprop;
int len = (document == null) ? urlLength : document.dc_title().length(); int len = (document == null) ? urlLength : document.dc_title().length();
while (i.hasNext()) { WordReferenceRow ientry = new WordReferenceRow(url.hash(),
wentry = i.next();
word = wentry.getKey();
wprop = wentry.getValue();
assert (wprop.flags != null);
ientry = new WordReferenceRow(url.hash(),
urlLength, urlComps, len, urlLength, urlComps, len,
wprop.count,
condenser.RESULT_NUMB_WORDS, condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_SENTENCES, condenser.RESULT_NUMB_SENTENCES,
wprop.posInText,
wprop.posInPhrase,
wprop.numOfPhrase,
urlModified.getTime(), urlModified.getTime(),
System.currentTimeMillis(), System.currentTimeMillis(),
language, language,
doctype, doctype,
outlinksSame, outlinksOther, outlinksSame, outlinksOther);
wprop.flags); Word wprop;
while (i.hasNext()) {
wentry = i.next();
word = wentry.getKey();
wprop = wentry.getValue();
assert (wprop.flags != null);
ientry.setWord(wprop);
try { try {
this.index.add(Word.word2hash(word), ientry); // TODO: remove getBytes() this.index.add(Word.word2hash(word), ientry);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }

Loading…
Cancel
Save