From a9cf6cf2f43b175449ee3352f66e7c009dd744df Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 6 Apr 2008 20:31:16 +0000 Subject: [PATCH] generalization of index container-heap class. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4654 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../de/anomic/index/indexContainerHeap.java | 489 ++++++++++++++++-- source/de/anomic/index/indexRAMRI.java | 281 +++------- source/de/anomic/kelondro/kelondroRowSet.java | 3 +- 3 files changed, 512 insertions(+), 261 deletions(-) diff --git a/source/de/anomic/index/indexContainerHeap.java b/source/de/anomic/index/indexContainerHeap.java index 67e2ace7a..c1411e94e 100755 --- a/source/de/anomic/index/indexContainerHeap.java +++ b/source/de/anomic/index/indexContainerHeap.java @@ -34,28 +34,84 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.io.RandomAccessFile; import java.util.Collections; +import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroBufferedRA; import de.anomic.kelondro.kelondroByteOrder; import de.anomic.kelondro.kelondroBytesIntMap; +import de.anomic.kelondro.kelondroCloneableIterator; +import de.anomic.kelondro.kelondroException; +import de.anomic.kelondro.kelondroFixedWidthArray; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowSet; +import de.anomic.kelondro.kelondroRow.EntryIndex; +import de.anomic.server.serverMemory; import de.anomic.server.logging.serverLog; +import de.anomic.yacy.yacySeedDB; public final class indexContainerHeap { - - public indexContainerHeap(File databaseRoot, kelondroRow payloadrow, String dumpname, serverLog log) { + private kelondroRow payloadrow; + private serverLog log; + private kelondroBytesIntMap index; + private SortedMap cache; + private File heapFile; + // index xor cache is used. If one is not null, then the other must be null + + /* + * An indexContainerHeap is a caching structure for indexContainer objects + * A heap can have the following states: + * write: the heap can be extended with more indexContainer entries. + * a heap that is open to be written may be dumped to a heap file. + * after that, the heap is still accessible, but only in read-status, + * which is not reversible. Once a heap is dumped, it can never be extended with new + * indexContainer entries. + * A write-heap can also be initiated using a restore of a dumped heap. + * read: a dumped heap can be accessed using a heap index.
when the heap is + * accessed the first time, all entries are scanned and an index is computed + */ + + /** + * open a new container heap and prepare it as a heap to be written + * @param payloadrow + * @param log + */ + public indexContainerHeap(kelondroRow payloadrow, serverLog log) { + this.payloadrow = payloadrow; + this.log = log; + this.cache = Collections.synchronizedSortedMap(new TreeMap(new kelondroByteOrder.StringOrder(payloadrow.getOrdering()))); + this.index = null; + this.heapFile = null; } - - public static void dumpHeap(File indexHeapFile, kelondroRow payloadrow, SortedMap cache, serverLog log) throws IOException { - if (log != null) log.logInfo("creating rwi heap dump '" + indexHeapFile.getName() + "', " + cache.size() + " rwi's"); - if (indexHeapFile.exists()) indexHeapFile.delete(); - OutputStream os = new BufferedOutputStream(new FileOutputStream(indexHeapFile), 64 * 1024); + /** + * opens an existing heap file in read-only mode + * @param indexHeapFile + * @param payloadrow + * @param log + */ + public indexContainerHeap(kelondroRow payloadrow, serverLog log, File heapFile) { + this.payloadrow = payloadrow; + this.log = log; + this.cache = null; + this.index = null; + this.heapFile = heapFile; + } + + public void dumpHeap(File heapFile) throws IOException { + assert this.heapFile == null; + assert this.cache != null; + this.heapFile = heapFile; + if (log != null) log.logInfo("creating rwi heap dump '" + heapFile.getName() + "', " + cache.size() + " rwi's"); + if (heapFile.exists()) heapFile.delete(); + OutputStream os = new BufferedOutputStream(new FileOutputStream(heapFile), 64 * 1024); long startTime = System.currentTimeMillis(); long wordcount = 0, urlcount = 0; String wordHash; @@ -83,62 +139,405 @@ public final class indexContainerHeap { os.flush(); os.close(); if (log != null) log.logInfo("finished rwi heap dump: " + wordcount + " words, " + urlcount + " word/URL relations in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); + + // finally delete the internal cache to switch handling to read-only mode + this.cache = null; } + + public int size() { + assert this.cache != null; + return this.cache.size(); + } + + /** + * static iterator of heap files: is used to import heap dumps into a write-enabled index heap + */ + public static class heapFileEntries implements Iterator, Iterable { + DataInputStream is; + byte[] word; + kelondroRow payloadrow; + + public heapFileEntries(File heapFile, kelondroRow payloadrow) throws IOException { + if (!(heapFile.exists())) throw new IOException("file " + heapFile + " does not exist"); + is = new DataInputStream(new BufferedInputStream(new FileInputStream(heapFile), 64*1024)); + word = new byte[payloadrow.primaryKeyLength]; + this.payloadrow = payloadrow; + } + + public boolean hasNext() { + try { + return is.available() > 0; + } catch (IOException e) { + return false; + } + } + + public indexContainer next() { + try { + is.read(word); + return new indexContainer(new String(word), kelondroRowSet.importRowSet(is, payloadrow)); + } catch (IOException e) { + return null; + } + } - public static SortedMap restoreHeap(File indexHeapFile, kelondroRow payloadrow, serverLog log) throws IOException { - if (!(indexHeapFile.exists())) throw new IOException("file " + indexHeapFile + " does not exist"); - if (log != null) log.logInfo("restoring dump for rwi heap '" + indexHeapFile.getName() + "'"); + public void remove() { + throw new UnsupportedOperationException("heap dumps are read-only"); + } + + public Iterator 
iterator() { + return this; + } - long start = System.currentTimeMillis(); - SortedMap cache = Collections.synchronizedSortedMap(new TreeMap(new kelondroByteOrder.StringOrder(payloadrow.getOrdering()))); - DataInputStream is = new DataInputStream(new BufferedInputStream(new FileInputStream(indexHeapFile), 64*1024)); + public void close() { + if (is != null) try { is.close(); } catch (IOException e) {} + is = null; + } - long urlCount = 0; - String wordHash; - byte[] word = new byte[payloadrow.primaryKeyLength]; - while (is.available() > 0) { - // read word - is.read(word); - wordHash = new String(word); - // read collection - - indexContainer container = new indexContainer(wordHash, kelondroRowSet.importRowSet(is, payloadrow)); - cache.put(wordHash, container); - urlCount += container.size(); + public void finalize() { + this.close(); } - is.close(); - if (log != null) log.logInfo("finished rwi heap restore: " + cache.size() + " words, " + urlCount + " word/URL relations in " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); - return cache; } - public static kelondroBytesIntMap indexHeap(File indexHeapFile, kelondroRow payloadrow, serverLog log) throws IOException { - if (!(indexHeapFile.exists())) throw new IOException("file " + indexHeapFile + " does not exist"); - if (indexHeapFile.length() >= (long) Integer.MAX_VALUE) throw new IOException("file " + indexHeapFile + " too large, index can only be crated for files less than 2GB"); - if (log != null) log.logInfo("creating index for rwi heap '" + indexHeapFile.getName() + "'"); + /** + * restore a heap dump: this is a heap in write mode. No heap file should + * be assigned at initialization; the file name is given here in this method. + * When the heap is dumped again, the heap file name may be different + * @param heapFile + * @throws IOException + */ + public void restoreHeap(File heapFile) throws IOException { + assert this.heapFile == null; // the heap must be opened in write mode + if (log != null) log.logInfo("restoring dump for rwi heap '" + heapFile.getName() + "'"); long start = System.currentTimeMillis(); - kelondroBytesIntMap index = new kelondroBytesIntMap(payloadrow.primaryKeyLength, (kelondroByteOrder) payloadrow.getOrdering(), 0); - DataInputStream is = new DataInputStream(new BufferedInputStream(new FileInputStream(indexHeapFile), 64*1024)); + this.cache = Collections.synchronizedSortedMap(new TreeMap(new kelondroByteOrder.StringOrder(payloadrow.getOrdering()))); + int urlCount = 0; + synchronized (cache) { + for (indexContainer container : new heapFileEntries(heapFile, this.payloadrow)) { + cache.put(container.getWordHash(), container); + urlCount += container.size(); + } + } + if (log != null) log.logInfo("finished rwi heap restore: " + cache.size() + " words, " + urlCount + " word/URL relations in " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); + } + + private void indexHeap() throws IOException { + assert this.cache == null; + if (this.index != null) return; + if (!(heapFile.exists())) throw new IOException("file " + heapFile + " does not exist"); + if (heapFile.length() >= (long) Integer.MAX_VALUE) throw new IOException("file " + heapFile + " too large, index can only be created for files less than 2GB"); + if (log != null) log.logInfo("creating index for rwi heap '" + heapFile.getName() + "'"); + long start = System.currentTimeMillis(); + this.index = new kelondroBytesIntMap(payloadrow.primaryKeyLength, (kelondroByteOrder) payloadrow.getOrdering(), 0); + DataInputStream is =
null; long urlCount = 0; String wordHash; byte[] word = new byte[payloadrow.primaryKeyLength]; int seek = 0, seek0; - while (is.available() > 0) { - // remember seek position - seek0 = seek; + synchronized (index) { + is = new DataInputStream(new BufferedInputStream(new FileInputStream(heapFile), 64*1024)); + + while (is.available() > 0) { + // remember seek position + seek0 = seek; - // read word - is.read(word); - wordHash = new String(word); - seek += wordHash.length(); + // read word + is.read(word); + wordHash = new String(word); + seek += wordHash.length(); - // read collection - seek += kelondroRowSet.skipNextRowSet(is, payloadrow); - index.addi(word, seek0); + // read collection + seek += kelondroRowSet.skipNextRowSet(is, payloadrow); + index.addi(word, seek0); + } } is.close(); if (log != null) log.logInfo("finished rwi heap indexing: " + urlCount + " word/URL relations in " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); - return index; } + + /** + * return an iterator object that creates top-level-clones of the indexContainers + * in the cache, so that manipulations of the iterated objects do not change + * objects in the cache. + */ + public synchronized kelondroCloneableIterator wordContainers(String startWordHash, boolean rot) { + return new heapCacheIterator(startWordHash, rot); + } + + /** + * cache iterator: iterates objects within the heap cache. This can only be used + * for write-enabled heaps, read-only heaps do not have a heap cache + */ + public class heapCacheIterator implements kelondroCloneableIterator, Iterable { + + // this class exists, because the wCache cannot be iterated with rotation + // and because every indexContainer Object that is iterated must be returned as top-level-clone + // so this class simulates wCache.tailMap(startWordHash).values().iterator() + // plus the mentioned features + + private boolean rot; + private Iterator iterator; + + public heapCacheIterator(String startWordHash, boolean rot) { + this.rot = rot; + this.iterator = (startWordHash == null) ? cache.values().iterator() : cache.tailMap(startWordHash).values().iterator(); + // The collection's iterator will return the values in the order that their corresponding keys appear in the tree. 
+ } + + public heapCacheIterator clone(Object secondWordHash) { + return new heapCacheIterator((String) secondWordHash, rot); + } + + public boolean hasNext() { + if (rot) return true; + return iterator.hasNext(); + } + + public indexContainer next() { + if (iterator.hasNext()) { + return ((indexContainer) iterator.next()).topLevelClone(); + } else { + // rotation iteration + if (rot) { + iterator = cache.values().iterator(); + return ((indexContainer) iterator.next()).topLevelClone(); + } else { + return null; + } + } + } + + public void remove() { + iterator.remove(); + } + + public Iterator iterator() { + return this; + } + + } + + /** + * test if a given key is in the heap + * this works with heaps in write- and read-mode + * @param key + * @return true if the key is used in the heap; false otherwise + */ + public boolean has(String key) { + if (this.cache == null) try { + if (!(heapFile.exists())) throw new IOException("file " + heapFile + " does not exist"); + if (index == null) indexHeap(); + assert index != null; + assert index.row().primaryKeyLength == key.length(); + + // check if the index contains the key + return index.geti(key.getBytes()) >= 0; + } catch (IOException e) { + log.logSevere("error accessing entry in heap file " + this.heapFile + ": " + e.getMessage()); + return false; + } else { + return this.cache.containsKey(key); + } + } + + /** + * get an indexContainer from a heap + * @param key + * @return the indexContainer if one exists, null otherwise + */ + public indexContainer get(String key) { + if (this.cache == null) try { + if (!(heapFile.exists())) throw new IOException("file " + heapFile + " does not exist"); + if (index == null) indexHeap(); + assert index != null; + assert index.row().primaryKeyLength == key.length(); + + // check if the index contains the key + int pos = index.geti(key.getBytes()); + if (pos < 0) return null; + + // access the file and read the container + RandomAccessFile raf = new RandomAccessFile(heapFile, "r"); + byte[] word = new byte[index.row().primaryKeyLength]; + + raf.seek(pos); + raf.read(word); + assert key.equals(new String(word)); + + // read collection + indexContainer container = new indexContainer(key, kelondroRowSet.importRowSet(raf, payloadrow)); + raf.close(); + return container; + } catch (IOException e) { + log.logSevere("error accessing entry in heap file " + this.heapFile + ": " + e.getMessage()); + return null; + } else { + return this.cache.get(key); + } + } + + /** + * delete an indexContainer from the heap cache. This can only be used for write-enabled heaps + * @param wordHash + * @return the indexContainer if the cache contained the container, null otherwise + */ + public synchronized indexContainer delete(String wordHash) { + // returns the index that had been deleted + assert this.cache != null; + return cache.remove(wordHash); + } + + + public synchronized boolean removeReference(String wordHash, String urlHash) { + indexContainer c = (indexContainer) cache.get(wordHash); + if ((c != null) && (c.remove(urlHash) != null)) { + // removal successful + if (c.size() == 0) { + delete(wordHash); + } else { + cache.put(wordHash, c); + } + return true; + } + return false; + } + + public synchronized int removeReference(String urlHash) { + // this tries to delete an index from the cache that has this + // urlHash assigned.
This can only work if the entry is really fresh + // Such entries must be searched in the latest entries + int delCount = 0; + Iterator> i = cache.entrySet().iterator(); + Map.Entry entry; + String wordhash; + indexContainer c; + while (i.hasNext()) { + entry = i.next(); + wordhash = entry.getKey(); + + // get container + c = entry.getValue(); + if (c.remove(urlHash) != null) { + if (c.size() == 0) { + i.remove(); + } else { + cache.put(wordhash, c); // superfluous? + } + delCount++; + } + } + return delCount; + } + + public synchronized int removeReferences(String wordHash, Set urlHashes) { + if (urlHashes.size() == 0) return 0; + indexContainer c = (indexContainer) cache.get(wordHash); + int count; + if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) { + // removal successful + if (c.size() == 0) { + delete(wordHash); + } else { + cache.put(wordHash, c); + } + return count; + } + return 0; + } + + public synchronized int add(indexContainer container) { + // this puts the entries into the cache + int added = 0; + if ((container == null) || (container.size() == 0)) return 0; + + // put new words into cache + String wordHash = container.getWordHash(); + indexContainer entries = (indexContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null + if (entries == null) { + entries = container.topLevelClone(); + added = entries.size(); + } else { + added = entries.putAllRecent(container); + } + if (added > 0) { + cache.put(wordHash, entries); + } + entries = null; + return added; + } + + public synchronized void addEntry(String wordHash, indexRWIRowEntry newEntry) { + indexContainer container = (indexContainer) cache.get(wordHash); + if (container == null) container = new indexContainer(wordHash, this.payloadrow, 1); + container.put(newEntry); + cache.put(wordHash, container); + } + + /** + * this is a compatibility method for an old heap dump format.
don't use it if not necessary + * @param indexArrayFile + * @throws IOException + */ + public void restoreArray(File indexArrayFile) throws IOException { + // is only here to read old array data structures + if (!(indexArrayFile.exists())) return; + kelondroFixedWidthArray dumpArray; + kelondroBufferedRA readBuffer = null; + kelondroRow bufferStructureBasis = new kelondroRow( + "byte[] wordhash-" + yacySeedDB.commonHashLength + ", " + + "Cardinal occ-4 {b256}, " + + "Cardinal time-8 {b256}, " + + "byte[] urlprops-" + payloadrow.objectsize, + kelondroBase64Order.enhancedCoder, 0); + dumpArray = new kelondroFixedWidthArray(indexArrayFile, bufferStructureBasis, 0); + log.logInfo("started restore of ram cache '" + indexArrayFile.getName() + "', " + dumpArray.size() + " word/URL relations"); + long startTime = System.currentTimeMillis(); + long messageTime = System.currentTimeMillis() + 5000; + long urlCount = 0, urlsPerSecond = 0; + this.cache = Collections.synchronizedSortedMap(new TreeMap(new kelondroByteOrder.StringOrder(payloadrow.getOrdering()))); + try { + Iterator i = dumpArray.contentRows(-1); + String wordHash; + //long creationTime; + indexRWIRowEntry wordEntry; + kelondroRow.EntryIndex row; + while (i.hasNext()) { + // get out one entry + row = i.next(); + if ((row == null) || (row.empty(0)) || (row.empty(3))) continue; + wordHash = row.getColString(0, "UTF-8"); + //creationTime = kelondroRecords.bytes2long(row[2]); + wordEntry = new indexRWIRowEntry(row.getColBytes(3)); + + // store to cache + indexContainer container = cache.get(wordHash); + if (container == null) container = new indexContainer(wordHash, payloadrow, 1); + container.put(wordEntry); + cache.put(wordHash, container); + + urlCount++; + // protect against memory shortage + //while (serverMemory.free() < 1000000) {flushFromMem(); java.lang.System.gc();} + // write a log + if (System.currentTimeMillis() > messageTime) { + serverMemory.gc(1000, "indexRAMRI, for better statistic-2"); // for better statistic - thq + urlsPerSecond = 1 + urlCount * 1000 / (1 + System.currentTimeMillis() - startTime); + log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpArray.size() - urlCount) / urlsPerSecond) + " seconds remaining, free mem = " + (serverMemory.free() / 1024 / 1024) + "MB"); + messageTime = System.currentTimeMillis() + 5000; + } + } + if (readBuffer != null) readBuffer.close(); + dumpArray.close(); + dumpArray = null; + log.logInfo("finished restore: " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); + } catch (kelondroException e) { + // restore failed + log.logSevere("failed restore of indexCache array dump: " + e.getMessage(), e); + } finally { + if (dumpArray != null) try {dumpArray.close();}catch(Exception e){} + } + } + } diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index c569f627b..e82a7b09c 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -28,30 +28,17 @@ package de.anomic.index; import java.io.File; import java.io.IOException; -import java.util.Collections; -import java.util.Iterator; -import java.util.Map; import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; -import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroBufferedRA; -import de.anomic.kelondro.kelondroByteOrder; import de.anomic.kelondro.kelondroCloneableIterator; -import de.anomic.kelondro.kelondroException; -import 
de.anomic.kelondro.kelondroFixedWidthArray; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroRow; -import de.anomic.kelondro.kelondroRow.EntryIndex; import de.anomic.server.serverMemory; import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacySeedDB; public final class indexRAMRI implements indexRI, indexRIReader { // class variables - protected SortedMap cache; // wordhash-container private final kelondroMScoreCluster hashScore; private final kelondroMScoreCluster hashDate; private long initTime; @@ -59,10 +46,10 @@ public final class indexRAMRI implements indexRI, indexRIReader { public int cacheReferenceCountLimit; public long cacheReferenceAgeLimit; private final serverLog log; - private File indexArrayFile, indexHeapFile; - private kelondroRow payloadrow; - private kelondroRow bufferStructureBasis; + private File indexHeapFile; + private indexContainerHeap heap; + @SuppressWarnings("unchecked") public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceCountLimitInit, long wCacheReferenceAgeLimitInit, String oldArrayName, String newHeapName, serverLog log) { // creates a new index cache @@ -74,36 +61,33 @@ public final class indexRAMRI implements indexRI, indexRIReader { this.cacheReferenceCountLimit = wCacheReferenceCountLimitInit; this.cacheReferenceAgeLimit = wCacheReferenceAgeLimitInit; this.log = log; - this.indexArrayFile = new File(databaseRoot, oldArrayName); + File indexArrayFile = new File(databaseRoot, oldArrayName); this.indexHeapFile = new File(databaseRoot, newHeapName); - this.payloadrow = payloadrow; - this.bufferStructureBasis = new kelondroRow( - "byte[] wordhash-" + yacySeedDB.commonHashLength + ", " + - "Cardinal occ-4 {b256}, " + - "Cardinal time-8 {b256}, " + - "byte[] urlprops-" + payloadrow.objectsize, - kelondroBase64Order.enhancedCoder, 0); + this.heap = new indexContainerHeap(payloadrow, log); // read in dump of last session if (indexArrayFile.exists()) { - this.cache = Collections.synchronizedSortedMap(new TreeMap(new kelondroByteOrder.StringOrder(payloadrow.getOrdering()))); try { - restoreArray(); + heap.restoreArray(indexArrayFile); + for (indexContainer ic : (Iterable) heap.wordContainers(null, false)) { + this.hashDate.setScore(ic.getWordHash(), intTime(ic.lastWrote())); + this.hashScore.setScore(ic.getWordHash(), ic.size()); + } } catch (IOException e){ log.logSevere("unable to restore cache dump: " + e.getMessage(), e); } indexArrayFile.delete(); if (indexArrayFile.exists()) log.logSevere("cannot delete old array file: " + indexArrayFile.toString() + "; please delete manually"); } else if (indexHeapFile.exists()) { - this.cache = null; try { - //indexContainerHeap.indexHeap(this.indexHeapFile, payloadrow, log); // for testing - this.cache = indexContainerHeap.restoreHeap(this.indexHeapFile, payloadrow, log); + heap.restoreHeap(indexHeapFile); + for (indexContainer ic : (Iterable) heap.wordContainers(null, false)) { + this.hashDate.setScore(ic.getWordHash(), intTime(ic.lastWrote())); + this.hashScore.setScore(ic.getWordHash(), ic.size()); + } } catch (IOException e){ log.logSevere("unable to restore cache dump: " + e.getMessage(), e); } - } else { - this.cache = Collections.synchronizedSortedMap(new TreeMap(new kelondroByteOrder.StringOrder(payloadrow.getOrdering()))); } } @@ -119,57 +103,6 @@ public final class indexRAMRI implements indexRI, indexRIReader { return entries.updated(); } - private long restoreArray() throws IOException { - if (!(indexArrayFile.exists())) return 0; - 
kelondroFixedWidthArray dumpArray; - kelondroBufferedRA readBuffer = null; - dumpArray = new kelondroFixedWidthArray(indexArrayFile, bufferStructureBasis, 0); - log.logInfo("started restore of ram cache '" + indexArrayFile.getName() + "', " + dumpArray.size() + " word/URL relations"); - long startTime = System.currentTimeMillis(); - long messageTime = System.currentTimeMillis() + 5000; - long urlCount = 0, urlsPerSecond = 0; - try { - synchronized (cache) { - Iterator i = dumpArray.contentRows(-1); - String wordHash; - //long creationTime; - indexRWIRowEntry wordEntry; - kelondroRow.EntryIndex row; - while (i.hasNext()) { - // get out one entry - row = i.next(); - if ((row == null) || (row.empty(0)) || (row.empty(3))) continue; - wordHash = row.getColString(0, "UTF-8"); - //creationTime = kelondroRecords.bytes2long(row[2]); - wordEntry = new indexRWIRowEntry(row.getColBytes(3)); - // store to cache - addEntry(wordHash, wordEntry, startTime, false); - urlCount++; - // protect against memory shortage - //while (serverMemory.free() < 1000000) {flushFromMem(); java.lang.System.gc();} - // write a log - if (System.currentTimeMillis() > messageTime) { - serverMemory.gc(1000, "indexRAMRI, for better statistic-2"); // for better statistic - thq - urlsPerSecond = 1 + urlCount * 1000 / (1 + System.currentTimeMillis() - startTime); - log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpArray.size() - urlCount) / urlsPerSecond) + " seconds remaining, free mem = " + (serverMemory.free() / 1024 / 1024) + "MB"); - messageTime = System.currentTimeMillis() + 5000; - } - } - } - if (readBuffer != null) readBuffer.close(); - dumpArray.close(); - dumpArray = null; - log.logInfo("finished restore: " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); - } catch (kelondroException e) { - // restore failed - log.logSevere("failed restore of indexCache array dump: " + e.getMessage(), e); - } finally { - if (dumpArray != null) try {dumpArray.close();}catch(Exception e){} - } - return urlCount; - } - - // cache settings public int maxURLinCache() { if (hashScore.size() == 0) return 0; @@ -195,11 +128,11 @@ public final class indexRAMRI implements indexRI, indexRIReader { } public int size() { - return cache.size(); + return heap.size(); } public synchronized int indexSize(String wordHash) { - indexContainer cacheIndex = (indexContainer) cache.get(wordHash); + indexContainer cacheIndex = heap.get(wordHash); if (cacheIndex == null) return 0; return cacheIndex.size(); } @@ -208,56 +141,12 @@ public final class indexRAMRI implements indexRI, indexRIReader { // we return an iterator object that creates top-level-clones of the indexContainers // in the cache, so that manipulations of the iterated objects do not change // objects in the cache. - return new wordContainerIterator(startWordHash, rot); + return heap.wordContainers(startWordHash, rot); } - public class wordContainerIterator implements kelondroCloneableIterator { - - // this class exists, because the wCache cannot be iterated with rotation - // and because every indexContainer Object that is iterated must be returned as top-level-clone - // so this class simulates wCache.tailMap(startWordHash).values().iterator() - // plus the mentioned features - - private boolean rot; - private Iterator iterator; - - public wordContainerIterator(String startWordHash, boolean rot) { - this.rot = rot; - this.iterator = (startWordHash == null) ? 
cache.values().iterator() : cache.tailMap(startWordHash).values().iterator(); - // The collection's iterator will return the values in the order that their corresponding keys appear in the tree. - } - - public wordContainerIterator clone(Object secondWordHash) { - return new wordContainerIterator((String) secondWordHash, rot); - } - - public boolean hasNext() { - if (rot) return true; - return iterator.hasNext(); - } - - public indexContainer next() { - if (iterator.hasNext()) { - return ((indexContainer) iterator.next()).topLevelClone(); - } else { - // rotation iteration - if (rot) { - iterator = cache.values().iterator(); - return ((indexContainer) iterator.next()).topLevelClone(); - } else { - return null; - } - } - } - - public void remove() { - iterator.remove(); - } - - } public synchronized String maxScoreWordHash() { - if (cache.size() == 0) return null; + if (heap.size() == 0) return null; try { return (String) hashScore.getMaxObject(); } catch (Exception e) { @@ -271,31 +160,34 @@ public final class indexRAMRI implements indexRI, indexRIReader { // we have 2 different methods to find a good hash: // - the oldest entry in the cache // - the entry with maximum count - if (cache.size() == 0) return null; + if (heap.size() == 0) return null; try { - String hash = null; - int count = hashScore.getMaxScore(); - if ((count >= cacheReferenceCountLimit) && - ((hash = (String) hashScore.getMaxObject()) != null)) { - // we MUST flush high-score entries, because a loop deletes entries in cache until this condition fails - // in this cache we MUST NOT check wCacheMinAge - return hash; - } - long oldestTime = longEmit(hashDate.getMinScore()); - if (((System.currentTimeMillis() - oldestTime) > cacheReferenceAgeLimit) && - ((hash = (String) hashDate.getMinObject()) != null)) { - // flush out-dated entries - return hash; - } - // cases with respect to memory situation - if (serverMemory.free() < 100000) { - // urgent low-memory case - hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM) - } else { - // not-efficient-so-far case. cleans up unnecessary cache slots - hash = (String) hashDate.getMinObject(); // flush oldest entries - } + String hash = null; + int count = hashScore.getMaxScore(); + if ((count >= cacheReferenceCountLimit) && + ((hash = (String) hashScore.getMaxObject()) != null)) { + // we MUST flush high-score entries, because a loop deletes entries in cache until this condition fails + // in this cache we MUST NOT check wCacheMinAge + return hash; + } + long oldestTime = longEmit(hashDate.getMinScore()); + if (((System.currentTimeMillis() - oldestTime) > cacheReferenceAgeLimit) && + ((hash = (String) hashDate.getMinObject()) != null)) { + // flush out-dated entries return hash; + } + // cases with respect to memory situation + if (serverMemory.free() < 100000) { + // urgent low-memory case + hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM) + } else { + // not-efficient-so-far case. 
cleans up unnecessary cache slots + hash = (String) hashDate.getMinObject(); // flush oldest entries + } + if (hash == null) { + heap.wordContainers(null, false).next(); + } + return hash; } catch (Exception e) { log.logSevere("flushFromMem: " + e.getMessage(), e); } @@ -311,11 +203,11 @@ public final class indexRAMRI implements indexRI, indexRIReader { } public boolean hasContainer(String wordHash) { - return cache.containsKey(wordHash); + return heap.has(wordHash); } public int sizeContainer(String wordHash) { - indexContainer c = (indexContainer) cache.get(wordHash); + indexContainer c = heap.get(wordHash); return (c == null) ? 0 : c.size(); } @@ -323,7 +215,7 @@ public final class indexRAMRI implements indexRI, indexRIReader { if (wordHash == null) return null; // retrieve container - indexContainer container = (indexContainer) cache.get(wordHash); + indexContainer container = heap.get(wordHash); // We must not use the container from cache to store everything we find, // as that container remains linked to in the cache and might be changed later @@ -339,22 +231,21 @@ public final class indexRAMRI implements indexRI, indexRIReader { public synchronized indexContainer deleteContainer(String wordHash) { // returns the index that had been deleted - indexContainer container = (indexContainer) cache.remove(wordHash); + indexContainer container = heap.delete(wordHash); hashScore.deleteScore(wordHash); hashDate.deleteScore(wordHash); return container; } public synchronized boolean removeEntry(String wordHash, String urlHash) { - indexContainer c = (indexContainer) cache.get(wordHash); - if ((c != null) && (c.remove(urlHash) != null)) { - // removal successful - if (c.size() == 0) { - deleteContainer(wordHash); - } else { - cache.put(wordHash, c); + boolean removed = heap.removeReference(wordHash, urlHash); + if (removed) { + if (heap.has(wordHash)) { hashScore.decScore(wordHash); hashDate.setScore(wordHash, intTime(System.currentTimeMillis())); + } else { + hashScore.deleteScore(wordHash); + hashDate.deleteScore(wordHash); } return true; } @@ -363,76 +254,36 @@ public final class indexRAMRI implements indexRI, indexRIReader { public synchronized int removeEntries(String wordHash, Set urlHashes) { if (urlHashes.size() == 0) return 0; - indexContainer c = (indexContainer) cache.get(wordHash); - int count; - if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) { + int c = heap.removeReferences(wordHash, urlHashes); + if (c > 0) { // removal successful - if (c.size() == 0) { - deleteContainer(wordHash); - } else { - cache.put(wordHash, c); - hashScore.setScore(wordHash, c.size()); + if (heap.has(wordHash)) { + hashScore.decScore(wordHash); hashDate.setScore(wordHash, intTime(System.currentTimeMillis())); } - return count; + return c; } return 0; } public synchronized int tryRemoveURLs(String urlHash) { - // this tries to delete an index from the cache that has this - // urlHash assigned. This can only work if the entry is really fresh - // Such entries must be searched in the latest entries - int delCount = 0; - Iterator> i = cache.entrySet().iterator(); - Map.Entry entry; - String wordhash; - indexContainer c; - while (i.hasNext()) { - entry = i.next(); - wordhash = entry.getKey(); - - // get container - c = entry.getValue(); - if (c.remove(urlHash) != null) { - if (c.size() == 0) { - i.remove(); - } else { - cache.put(wordhash, c); // superfluous? 
- } - delCount++; - } - } - return delCount; + return heap.removeReference(urlHash); } public synchronized void addEntries(indexContainer container) { // this puts the entries into the cache, not into the assortment directly - int added = 0; if ((container == null) || (container.size() == 0)) return; // put new words into cache - String wordHash = container.getWordHash(); - indexContainer entries = (indexContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null - if (entries == null) { - entries = container.topLevelClone(); - added = entries.size(); - } else { - added = entries.putAllRecent(container); - } + int added = heap.add(container); if (added > 0) { - cache.put(wordHash, entries); - hashScore.addScore(wordHash, added); - hashDate.setScore(wordHash, intTime(System.currentTimeMillis())); + hashScore.addScore(container.getWordHash(), added); + hashDate.setScore(container.getWordHash(), intTime(System.currentTimeMillis())); } - entries = null; } public synchronized void addEntry(String wordHash, indexRWIRowEntry newEntry, long updateTime, boolean dhtCase) { - indexContainer container = (indexContainer) cache.get(wordHash); - if (container == null) container = new indexContainer(wordHash, this.payloadrow, 1); - container.put(newEntry); - cache.put(wordHash, container); + heap.addEntry(wordHash, newEntry); hashScore.incScore(wordHash); hashDate.setScore(wordHash, intTime(updateTime)); } @@ -440,7 +291,7 @@ public final class indexRAMRI implements indexRI, indexRIReader { public synchronized void close() { // dump cache try { - indexContainerHeap.dumpHeap(this.indexHeapFile, this.payloadrow, this.cache, this.log); + heap.dumpHeap(this.indexHeapFile); } catch (IOException e){ log.logSevere("unable to dump cache: " + e.getMessage(), e); } diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index 0f2ec17a2..600428af0 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -24,6 +24,7 @@ package de.anomic.kelondro; +import java.io.DataInput; import java.io.DataInputStream; import java.io.IOException; import java.util.Date; @@ -72,7 +73,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd } } - public static kelondroRowSet importRowSet(DataInputStream is, kelondroRow rowdef) throws IOException { + public static kelondroRowSet importRowSet(DataInput is, kelondroRow rowdef) throws IOException { byte[] byte2 = new byte[2]; byte[] byte4 = new byte[4]; is.readFully(byte4); int size = (int) kelondroNaturalOrder.decodeLong(byte4);
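Usage sketch (illustrative only, not part of the patch): the lifecycle introduced by the generalized indexContainerHeap is to fill a write-enabled heap, dump it to a file, and then reopen the dump read-only. The sketch uses only constructors and methods added above; the variables payloadrow, log, databaseRoot, someContainer and wordHash are assumed to be supplied by the caller (as indexRAMRI does), the heap file name is arbitrary, and IOException handling is omitted.

// write mode: collect indexContainer objects in the RAM cache, then dump them
indexContainerHeap writeHeap = new indexContainerHeap(payloadrow, log);
writeHeap.add(someContainer);                              // merge a container into the cache
writeHeap.dumpHeap(new File(databaseRoot, "index.heap"));  // after the dump the heap is read-only

// read mode: open the dumped file; a word-hash to file-offset index is built lazily on first access
indexContainerHeap readHeap = new indexContainerHeap(payloadrow, log, new File(databaseRoot, "index.heap"));
if (readHeap.has(wordHash)) {
    indexContainer c = readHeap.get(wordHash);             // random access through the heap index
}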