From a5d28785b12d6d709f22235c19a53294b1e5f009 Mon Sep 17 00:00:00 2001 From: borg-0300 Date: Fri, 2 Nov 2007 14:55:46 +0000 Subject: [PATCH] less OOM (works for me) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4194 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/index/indexRAMRI.java | 11 +- .../kelondro/kelondroCachedRecords.java | 1280 +++++----- .../kelondro/kelondroCollectionIndex.java | 2122 +++++++++-------- .../anomic/plasma/plasmaRankingCRProcess.java | 1124 ++++----- .../de/anomic/plasma/plasmaSwitchboard.java | 45 +- source/de/anomic/server/serverCore.java | 4 +- source/de/anomic/server/serverMemory.java | 27 +- 7 files changed, 2325 insertions(+), 2288 deletions(-) diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index 573f97efd..f0481a1d5 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // @@ -45,6 +45,7 @@ import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroRow; import de.anomic.server.serverByteBuffer; import de.anomic.server.serverFileUtils; +import de.anomic.server.serverMemory; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; @@ -160,7 +161,7 @@ public final class indexRAMRI implements indexRI { // write a log if (System.currentTimeMillis() > messageTime) { - // System.gc(); // for better statistic + serverMemory.gc(1000, "indexRAMRI, for better statistic-1"); // for better statistic - thq wordsPerSecond = wordcount * 1000 / (1 + System.currentTimeMillis() - startTime); log.logInfo("dump status: " + wordcount @@ -222,7 +223,7 @@ public final class indexRAMRI implements indexRI { //while (rt.freeMemory() < 1000000) {flushFromMem(); java.lang.System.gc();} // write a log if (System.currentTimeMillis() > messageTime) { - System.gc(); // for better statistic + serverMemory.gc(1000, "indexRAMRI, for better statistic-2"); // for better statistic - thq urlsPerSecond = 1 + urlCount * 1000 / (1 + System.currentTimeMillis() - startTime); log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpArray.size() - urlCount) / urlsPerSecond) + " seconds remaining, free mem = " + (Runtime.getRuntime().freeMemory() / 1024 / 1024) + "MB"); messageTime = System.currentTimeMillis() + 5000; diff --git a/source/de/anomic/kelondro/kelondroCachedRecords.java b/source/de/anomic/kelondro/kelondroCachedRecords.java index d3c4c73e8..8aa5f3c48 100644 --- a/source/de/anomic/kelondro/kelondroCachedRecords.java +++ b/source/de/anomic/kelondro/kelondroCachedRecords.java @@ -1,638 +1,642 @@ -// kelondroCachedRecords.java -// (C) 2003 - 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 2003 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. 
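// A sketch, not part of this patch: the indexRAMRI hunks above replace unconditional
// System.gc() calls with serverMemory.gc(delay, info). The serverMemory.java hunk is not
// shown in this excerpt, so the helper below is only a plausible reconstruction, assuming
// 'delay' is a minimum interval in milliseconds between forced collections and 'info' is a
// caller tag for logging. Under that reading, the 1000 ms delay above throttles the
// per-status-line GC, while the 30000/3000 ms delays used later in
// kelondroCachedRecords.cacheGrowStatus() collect more aggressively as memory tightens.
public final class serverMemorySketch {
    private static long lastGC = 0; // time of the last forced collection

    // run System.gc() at most once per 'delay' milliseconds; 'info' names the caller
    public static synchronized void gc(long delay, String info) {
        long now = System.currentTimeMillis();
        if (now - lastGC < delay) return; // a forced GC ran recently; skip this request
        System.gc();
        lastGC = now;
        System.out.println("[gc] requested by " + info); // stand-in for the project's serverLog
    }

    // free heap if the VM were grown to its -Xmx limit: max - total + free
    public static long available() {
        Runtime rt = Runtime.getRuntime();
        return rt.maxMemory() - rt.totalMemory() + rt.freeMemory();
    }
}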
-// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.kelondro; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.TreeMap; - -import de.anomic.server.serverMemory; - -public class kelondroCachedRecords extends kelondroAbstractRecords implements kelondroRecords { - - // memory calculation - private static final int element_in_cache = 4; // for kelondroCollectionObjectMap: 4; for HashMap: 52 - - // static supervision objects: recognize and coordinate all activites - private static TreeMap recordTracker = new TreeMap(); // a String/filename - kelondroTray mapping - private static long memStopGrow = 10000000; // a limit for the node cache to stop growing if less than this memory amount is available - private static long memStartShrink = 6000000; // a limit for the node cache to start with shrinking if less than this memory amount is available - - // caching buffer - private kelondroIntBytesMap cacheHeaders; // the cache; holds overhead values and key element - private int readHit, readMiss, writeUnique, writeDouble, cacheDelete, cacheFlush; - - - public kelondroCachedRecords( - File file, boolean useNodeCache, long preloadTime, - short ohbytec, short ohhandlec, - kelondroRow rowdef, int FHandles, int txtProps, int txtPropWidth) throws IOException { - super(file, useNodeCache, ohbytec, ohhandlec, rowdef, FHandles, txtProps, txtPropWidth); - initCache(useNodeCache, preloadTime); - if (useNodeCache) recordTracker.put(this.filename, this); - } - - public kelondroCachedRecords( - kelondroRA ra, String filename, boolean useNodeCache, long preloadTime, - short ohbytec, short ohhandlec, - kelondroRow rowdef, int FHandles, int txtProps, int txtPropWidth, - boolean exitOnFail) { - super(ra, filename, useNodeCache, ohbytec, ohhandlec, rowdef, FHandles, txtProps, txtPropWidth, exitOnFail); - initCache(useNodeCache, preloadTime); - if (useNodeCache) recordTracker.put(this.filename, this); - } - - public kelondroCachedRecords( - kelondroRA ra, String filename, boolean useNodeCache, long preloadTime) throws IOException{ - super(ra, filename, useNodeCache); - initCache(useNodeCache, preloadTime); - if (useNodeCache) recordTracker.put(this.filename, this); - } - - private void initCache(boolean useNodeCache, long preloadTime) { - if (useNodeCache) { - this.cacheHeaders = new kelondroIntBytesMap(this.headchunksize, 0); - } else { - this.cacheHeaders = null; - } - this.readHit = 0; - this.readMiss = 0; - this.writeUnique = 0; - this.writeDouble = 0; - this.cacheDelete = 0; - this.cacheFlush = 0; - // pre-load node cache - if ((preloadTime > 0) && (useNodeCache)) { - long stop = System.currentTimeMillis() + preloadTime; - int count = 0; - try { - Iterator i = contentNodes(preloadTime); - CacheNode n; - while ((System.currentTimeMillis() < stop) && (cacheGrowStatus() == 2) && (i.hasNext())) { - n = (CacheNode) i.next(); - cacheHeaders.addb(n.handle().index, n.headChunk); - count++; - } - cacheHeaders.flush(); - logFine("preloaded " + count + " records into cache"); - } catch 
(kelondroException e) { - // the contentNodes iterator had a time-out; we don't do a preload - logFine("could not preload records: " + e.getMessage()); - } - - } - } - - private int cacheGrowStatus() { - long available = serverMemory.available(); - if ((cacheHeaders != null) && (available < cacheHeaders.memoryNeededForGrow())) return 0; - return cacheGrowStatus(available, memStopGrow, memStartShrink); - } - - public static final int cacheGrowStatus(long available, long stopGrow, long startShrink) { - // returns either 0, 1 or 2: - // 0: cache is not allowed to grow, but shall shrink - // 1: cache is allowed to grow, but need not to shrink - // 2: cache is allowed to grow and must not shrink - if (available > stopGrow) return 2; - if (available > startShrink) return 1; - return 0; - } - - public static void setCacheGrowStati(long memStopGrowNew, long memStartShrinkNew) { - memStopGrow = memStopGrowNew; - memStartShrink = memStartShrinkNew; - } - - public static long getMemStopGrow() { - return memStopGrow ; - } - - public static long getMemStartShrink() { - return memStartShrink ; - } - - public static final Iterator filenames() { - // iterates string objects; all file names from record tracker - return recordTracker.keySet().iterator(); - } - - public static final Map memoryStats(String filename) { - // returns a map for each file in the tracker; - // the map represents properties for each record oobjects, - // i.e. for cache memory allocation - kelondroCachedRecords theRecord = (kelondroCachedRecords) recordTracker.get(filename); - return theRecord.memoryStats(); - } - - private final Map memoryStats() { - // returns statistical data about this object - if (cacheHeaders == null) return null; - HashMap map = new HashMap(); - map.put("nodeChunkSize", Integer.toString(this.headchunksize + element_in_cache)); - map.put("nodeCacheCount", Integer.toString(cacheHeaders.size())); - map.put("nodeCacheMem", Integer.toString(cacheHeaders.size() * (this.headchunksize + element_in_cache))); - map.put("nodeCacheReadHit", Integer.toString(readHit)); - map.put("nodeCacheReadMiss", Integer.toString(readMiss)); - map.put("nodeCacheWriteUnique", Integer.toString(writeUnique)); - map.put("nodeCacheWriteDouble", Integer.toString(writeDouble)); - map.put("nodeCacheDeletes", Integer.toString(cacheDelete)); - map.put("nodeCacheFlushes", Integer.toString(cacheFlush)); - return map; - } - - protected synchronized void deleteNode(kelondroHandle handle) throws IOException { - if (cacheHeaders == null) { - super.deleteNode(handle); - } else synchronized (cacheHeaders) { - if (cacheHeaders.size() == 0) { - super.deleteNode(handle); - } else { - cacheHeaders.removeb(handle.index); - cacheDelete++; - super.deleteNode(handle); - } - } - } - - protected void printCache() { - if (cacheHeaders == null) { - System.out.println("### file report: " + size() + " entries"); - for (int i = 0; i < USAGE.allCount(); i++) { - // print from file to compare - System.out.print("#F " + i + ": "); - try { - for (int j = 0; j < headchunksize; j++) - System.out.print(Integer.toHexString(0xff & entryFile.readByte(j + seekpos(new kelondroHandle(i)))) + " "); - } catch (IOException e) {} - - System.out.println(); - } - } else { - System.out.println("### cache report: " + cacheHeaders.size() + " entries"); - - Iterator i = cacheHeaders.rows(); - kelondroRow.Entry entry; - while (i.hasNext()) { - entry = (kelondroRow.Entry) i.next(); - - // print from cache - System.out.print("#C "); - printChunk(entry); - System.out.println(); - - // print from 
file to compare - /* - System.out.print("#F " + cp + " " + ((Handle) entry.getKey()).index + ": "); - try { - for (int j = 0; j < headchunksize; j++) - System.out.print(entryFile.readByte(j + seekpos((Handle) entry.getKey())) + ","); - } catch (IOException e) {} - */ - System.out.println(); - } - } - System.out.println("### end report"); - } - - public synchronized void close() { - if (cacheHeaders == null) { - if (recordTracker.get(this.filename) != null) { - theLogger.severe("close(): file '" + this.filename + "' was tracked with record tracker, but it should not."); - } - } else { - if (recordTracker.remove(this.filename) == null) { - theLogger.severe("close(): file '" + this.filename + "' was not tracked with record tracker."); - } - } - super.close(); - this.cacheHeaders = null; - } - - public kelondroProfile[] profiles() { - return new kelondroProfile[]{ - (cacheHeaders == null) ? new kelondroProfile() : - cacheHeaders.profile(), - entryFile.profile() - }; - } - - public kelondroProfile profile() { - return kelondroProfile.consolidate(profiles()); - } - - public void print() throws IOException { - super.print(); - - // print also all records - System.out.println("CACHE"); - printCache(); - System.out.println("--"); - System.out.println("NODES"); - Iterator i = new contentNodeIterator(-1); - kelondroNode n; - while (i.hasNext()) { - n = (kelondroNode) i.next(); - System.out.println("NODE: " + n.toString()); - } - } - - public kelondroNode newNode(kelondroHandle handle, byte[] bulk, int offset) throws IOException { - return new CacheNode(handle, bulk, offset); - } - - public final class CacheNode implements kelondroNode { - // an Node holds all information of one row of data. This includes the key to the entry - // which is stored as entry element at position 0 - // an Node object can be created in two ways: - // 1. instantiation with an index number. After creation the Object does not hold any - // value information until such is retrieved using the getValue() method - // 2. instantiation with a value array. the values are not directly written into the - // file. Expanding the tree structure is then done using the save() method. at any - // time it is possible to verify the save state using the saved() predicate. - // Therefore an entry object has three modes: - // a: holding an index information only (saved() = true) - // b: holding value information only (saved() = false) - // c: holding index and value information at the same time (saved() = true) - // which can be the result of one of the two processes as follow: - // (i) created with index and after using the getValue() method, or - // (ii) created with values and after calling the save() method - // the method will therefore throw an IllegalStateException when the following - // process step is performed: - // - create the Node with index and call then the save() method - // this case can be decided with - // ((index != NUL) && (values == null)) - // The save() method represents the insert function for the tree. Balancing functions - // are applied automatically. While balancing, the Node does never change its index key, - // but its parent/child keys. 
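// Usage sketch, not part of this patch: the static filenames()/memoryStats(String) pair
// defined earlier in this class exposes the per-file node-cache counters as a map of
// stringified values. Raw Iterator/Map types (java.util imports assumed) match the
// pre-generics style of this code.
Iterator fi = kelondroCachedRecords.filenames();
while (fi.hasNext()) {
    String name = (String) fi.next();
    Map stats = kelondroCachedRecords.memoryStats(name);
    if (stats == null) continue; // file was opened without a node cache
    System.out.println(name + ": " + stats.get("nodeCacheCount") + " cached nodes in "
            + stats.get("nodeCacheMem") + " bytes, " + stats.get("nodeCacheReadHit")
            + " hits / " + stats.get("nodeCacheReadMiss") + " misses");
}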
- //private byte[] ohBytes = null; // the overhead bytes, OHBYTEC values - //private Handle[] ohHandle= null; // the overhead handles, OHHANDLEC values - //private byte[][] values = null; // an array of byte[] nodes is the value vector - private kelondroHandle handle = null; // index of the entry, by default NUL means undefined - private byte[] headChunk = null; // contains ohBytes, ohHandles and the key value - private byte[] tailChunk = null; // contains all values except the key value - private boolean headChanged = false; - private boolean tailChanged = false; - - public CacheNode(byte[] rowinstance) throws IOException { - // this initializer is used to create nodes from bulk-read byte arrays - assert ((rowinstance == null) || (rowinstance.length == ROW.objectsize)) : "bulkchunk.length = " + rowinstance.length + ", ROW.width(0) = " + ROW.width(0); - this.handle = new kelondroHandle(USAGE.allocatePayload(rowinstance)); - - // create empty chunks - this.headChunk = new byte[headchunksize]; - this.tailChunk = new byte[tailchunksize]; - - // write content to chunks - if (rowinstance == null) { - for (int i = headchunksize - 1; i >= 0; i--) this.headChunk[i] = (byte) 0xff; - for (int i = tailchunksize - 1; i >= 0; i--) this.tailChunk[i] = (byte) 0xff; - } else { - for (int i = overhead - 1; i >= 0; i--) this.headChunk[i] = (byte) 0xff; - System.arraycopy(rowinstance, 0, this.headChunk, overhead, ROW.width(0)); - System.arraycopy(rowinstance, ROW.width(0), this.tailChunk, 0, tailchunksize); - } - - if (cacheHeaders != null) synchronized (cacheHeaders) { - updateNodeCache(); - } - - // mark chunks as changed - // if the head/tail chunks come from a file system read, setChanged should be false - // if the chunks come from a overwrite attempt, it should be true - this.headChanged = false; // we wrote the head already during allocate - this.tailChanged = false; // we write the tail already during allocate - } - - public CacheNode(kelondroHandle handle, byte[] bulkchunk, int offset) throws IOException { - // this initializer is used to create nodes from bulk-read byte arrays - // if write is true, then the chunk in bulkchunk is written to the file - // othervise it is considered equal to what is stored in the file - // (that is ensured during pre-loaded enumeration) - this.handle = handle; - boolean changed; - if (handle.index >= USAGE.allCount()) { - // this causes only a write action if we create a node beyond the end of the file - USAGE.allocateRecord(handle.index, bulkchunk, offset); - changed = false; // we have already wrote the record, so it is considered as unchanged - } else { - changed = true; - } - assert ((bulkchunk == null) || (bulkchunk.length - offset >= recordsize)) : "bulkchunk.length = " + bulkchunk.length + ", offset = " + offset + ", recordsize = " + recordsize; - - // create empty chunks - this.headChunk = new byte[headchunksize]; - this.tailChunk = new byte[tailchunksize]; - - // write content to chunks - if (bulkchunk != null) { - System.arraycopy(bulkchunk, offset, this.headChunk, 0, headchunksize); - System.arraycopy(bulkchunk, offset + headchunksize, this.tailChunk, 0, tailchunksize); - } - - // mark chunks as changed - this.headChanged = changed; - this.tailChanged = changed; - } - - public CacheNode(kelondroHandle handle, boolean fillTail) throws IOException { - this(handle, null, 0, fillTail); - } - - public CacheNode(kelondroHandle handle, CacheNode parentNode, int referenceInParent, boolean fillTail) throws IOException { - // this creates an entry with an 
pre-reserved entry position. - // values can be written using the setValues() method, - // but we expect that values are already there in the file. - assert (handle != null): "node handle is null"; - assert (handle.index >= 0): "node handle too low: " + handle.index; - //assert (handle.index < USAGE.allCount()) : "node handle too high: " + handle.index + ", USEDC=" + USAGE.USEDC + ", FREEC=" + USAGE.FREEC; - - // the parentNode can be given if an auto-fix in the following case is wanted - if (handle == null) throw new kelondroException(filename, "INTERNAL ERROR: node handle is null."); - if (handle.index >= USAGE.allCount()) { - if (parentNode == null) throw new kelondroException(filename, "INTERNAL ERROR, Node/init: node handle index " + handle.index + " exceeds size. No auto-fix node was submitted. This is a serious failure."); - try { - parentNode.setOHHandle(referenceInParent, null); - parentNode.commit(); - logWarning("INTERNAL ERROR, Node/init in " + filename + ": node handle index " + handle.index + " exceeds size. The bad node has been auto-fixed"); - } catch (IOException ee) { - throw new kelondroException(filename, "INTERNAL ERROR, Node/init: node handle index " + handle.index + " exceeds size. It was tried to fix the bad node, but failed with an IOException: " + ee.getMessage()); - } - } - - // use given handle - this.handle = new kelondroHandle(handle.index); - - // check for memory availability when fillTail is requested - if ((fillTail) && (tailchunksize > 10000)) fillTail = false; // this is a fail-safe 'short version' of a memory check - - // init the content - // create chunks; read them from file or cache - this.tailChunk = null; - if (cacheHeaders == null) { - if (fillTail) { - // read complete record - byte[] chunkbuffer = new byte[recordsize]; - entryFile.readFully(seekpos(this.handle), chunkbuffer, 0, recordsize); - this.headChunk = new byte[headchunksize]; - this.tailChunk = new byte[tailchunksize]; - System.arraycopy(chunkbuffer, 0, this.headChunk, 0, headchunksize); - System.arraycopy(chunkbuffer, headchunksize, this.tailChunk, 0, tailchunksize); - chunkbuffer = null; - } else { - // read overhead and key - this.headChunk = new byte[headchunksize]; - this.tailChunk = null; - entryFile.readFully(seekpos(this.handle), this.headChunk, 0, headchunksize); - } - } else synchronized(cacheHeaders) { - byte[] cacheEntry = null; - cacheEntry = cacheHeaders.getb(this.handle.index); - if (cacheEntry == null) { - // cache miss, we read overhead and key from file - readMiss++; - if (fillTail) { - // read complete record - byte[] chunkbuffer = new byte[recordsize]; - entryFile.readFully(seekpos(this.handle), chunkbuffer, 0, recordsize); - this.headChunk = new byte[headchunksize]; - this.tailChunk = new byte[tailchunksize]; - System.arraycopy(chunkbuffer, 0, this.headChunk, 0, headchunksize); - System.arraycopy(chunkbuffer, headchunksize, this.tailChunk, 0, tailchunksize); - chunkbuffer = null; - } else { - // read overhead and key - this.headChunk = new byte[headchunksize]; - this.tailChunk = null; - entryFile.readFully(seekpos(this.handle), this.headChunk, 0, headchunksize); - } - - // if space left in cache, copy these value to the cache - updateNodeCache(); - } else { - readHit++; - this.headChunk = cacheEntry; - } - } - } - - private void setValue(byte[] value, int valueoffset, int valuewidth, byte[] targetarray, int targetoffset) { - if (value == null) { - while (valuewidth-- > 0) targetarray[targetoffset++] = 0; - } else { - assert ((valueoffset >= 0) && (valueoffset < 
value.length)) : "valueoffset = " + valueoffset; - assert ((valueoffset + valuewidth <= value.length)) : "valueoffset = " + valueoffset + ", valuewidth = " + valuewidth + ", value.length = " + value.length; - assert ((targetoffset >= 0) && (targetoffset < targetarray.length)) : "targetoffset = " + targetoffset; - assert ((targetoffset + valuewidth <= targetarray.length)) : "targetoffset = " + targetoffset + ", valuewidth = " + valuewidth + ", targetarray.length = " + targetarray.length; - System.arraycopy(value, valueoffset, targetarray, targetoffset, Math.min(value.length, valuewidth)); // error? - while (valuewidth-- > value.length) targetarray[targetoffset + valuewidth] = 0; - } - } - - public kelondroHandle handle() { - // if this entry has an index, return it - if (this.handle.index == kelondroHandle.NUL) throw new kelondroException(filename, "the entry has no index assigned"); - return this.handle; - } - - public void setOHByte(int i, byte b) { - if (i >= OHBYTEC) throw new IllegalArgumentException("setOHByte: wrong index " + i); - if (this.handle.index == kelondroHandle.NUL) throw new kelondroException(filename, "setOHByte: no handle assigned"); - this.headChunk[i] = b; - this.headChanged = true; - } - - public void setOHHandle(int i, kelondroHandle otherhandle) { - assert (i < OHHANDLEC): "setOHHandle: wrong array size " + i; - assert (this.handle.index != kelondroHandle.NUL): "setOHHandle: no handle assigned ind file" + filename; - if (otherhandle == null) { - NUL2bytes(this.headChunk, OHBYTEC + 4 * i); - } else { - if (otherhandle.index >= USAGE.allCount()) throw new kelondroException(filename, "INTERNAL ERROR, setOHHandles: handle " + i + " exceeds file size (" + handle.index + " >= " + USAGE.allCount() + ")"); - int2bytes(otherhandle.index, this.headChunk, OHBYTEC + 4 * i); - } - this.headChanged = true; - } - - public byte getOHByte(int i) { - if (i >= OHBYTEC) throw new IllegalArgumentException("getOHByte: wrong index " + i); - if (this.handle.index == kelondroHandle.NUL) throw new kelondroException(filename, "Cannot load OH values"); - return this.headChunk[i]; - } - - public kelondroHandle getOHHandle(int i) { - if (this.handle.index == kelondroHandle.NUL) throw new kelondroException(filename, "Cannot load OH values"); - assert (i < OHHANDLEC): "handle index out of bounds: " + i + " in file " + filename; - int h = bytes2int(this.headChunk, OHBYTEC + 4 * i); - return (h == kelondroHandle.NUL) ? 
null : new kelondroHandle(h); - } - - public synchronized void setValueRow(byte[] row) throws IOException { - // if the index is defined, then write values directly to the file, else only to the object - if ((row != null) && (row.length != ROW.objectsize())) throw new IOException("setValueRow with wrong (" + row.length + ") row length instead correct: " + ROW.objectsize()); - - // set values - if (this.handle.index != kelondroHandle.NUL) { - setValue(row, 0, ROW.width(0), headChunk, overhead); - if (ROW.columns() > 1) setValue(row, ROW.width(0), tailchunksize, tailChunk, 0); - } - this.headChanged = true; - this.tailChanged = true; - } - - public synchronized boolean valid() { - // returns true if the key starts with non-zero byte - // this may help to detect deleted entries - return (headChunk[overhead] != 0) && ((headChunk[overhead] != -128) || (headChunk[overhead + 1] != 0)); - } - - public synchronized byte[] getKey() { - // read key - return trimCopy(headChunk, overhead, ROW.width(0)); - } - - public synchronized byte[] getValueRow() throws IOException { - - if (this.tailChunk == null) { - // load all values from the database file - this.tailChunk = new byte[tailchunksize]; - // read values - entryFile.readFully(seekpos(this.handle) + (long) headchunksize, this.tailChunk, 0, this.tailChunk.length); - } - - // create return value - byte[] row = new byte[ROW.objectsize()]; - - // read key - System.arraycopy(headChunk, overhead, row, 0, ROW.width(0)); - - // read remaining values - System.arraycopy(tailChunk, 0, row, ROW.width(0), tailchunksize); - - return row; - } - - public synchronized void commit() throws IOException { - // this must be called after all write operations to the node are - // finished - - // place the data to the file - - if (this.headChunk == null) { - // there is nothing to save - throw new kelondroException(filename, "no values to save (header missing)"); - } - - boolean doCommit = this.headChanged || this.tailChanged; - - // save head - synchronized (entryFile) { - if (this.headChanged) { - //System.out.println("WRITEH(" + filename + ", " + seekpos(this.handle) + ", " + this.headChunk.length + ")"); - assert (headChunk == null) || (headChunk.length == headchunksize); - entryFile.write(seekpos(this.handle), (this.headChunk == null) ? new byte[headchunksize] : this.headChunk); - updateNodeCache(); - this.headChanged = false; - } - - // save tail - if ((this.tailChunk != null) && (this.tailChanged)) { - //System.out.println("WRITET(" + filename + ", " + (seekpos(this.handle) + headchunksize) + ", " + this.tailChunk.length + ")"); - assert (tailChunk == null) || (tailChunk.length == tailchunksize); - entryFile.write(seekpos(this.handle) + headchunksize, (this.tailChunk == null) ? new byte[tailchunksize] : this.tailChunk); - this.tailChanged = false; - } - - if (doCommit) entryFile.commit(); - } - } - - public String toString() { - if (this.handle.index == kelondroHandle.NUL) return "NULL"; - String s = Integer.toHexString(this.handle.index); - kelondroHandle h; - while (s.length() < 4) s = "0" + s; - try { - for (int i = 0; i < OHBYTEC; i++) s = s + ":b" + getOHByte(i); - for (int i = 0; i < OHHANDLEC; i++) { - h = getOHHandle(i); - if (h == null) s = s + ":hNULL"; else s = s + ":h" + h.toString(); - } - kelondroRow.Entry content = row().newEntry(getValueRow()); - for (int i = 0; i < row().columns(); i++) s = s + ":" + ((content.empty(i)) ? 
"NULL" : content.getColString(i, "UTF-8").trim()); - } catch (IOException e) { - s = s + ":***LOAD ERROR***:" + e.getMessage(); - } - return s; - } - - private boolean cacheSpace() { - // check for space in cache - // should be only called within a synchronized(cacheHeaders) environment - // returns true if it is allowed to add another entry to the cache - // returns false if the cache is considered to be full - if (cacheHeaders == null) return false; // no caching - if (cacheHeaders.size() == 0) return true; // nothing there to flush - if (cacheGrowStatus() == 2) return true; // no need to flush cache space - - // just delete any of the entries - if (cacheGrowStatus() <= 1) synchronized (cacheHeaders) { - cacheHeaders.removeoneb(); - cacheFlush++; - } - return cacheGrowStatus() > 0; - } - - private void updateNodeCache() { - if (this.handle == null) return; // wrong access - if (this.headChunk == null) return; // nothing there to cache - if (cacheHeaders == null) return; // we do not use the cache - if (cacheSpace()) synchronized (cacheHeaders) { - // generate cache entry - //byte[] cacheEntry = new byte[headchunksize]; - //System.arraycopy(headChunk, 0, cacheEntry, 0, headchunksize); - - // store the cache entry - boolean upd = false; - upd = (cacheHeaders.putb(this.handle.index, headChunk) != null); - if (upd) writeDouble++; else writeUnique++; - - //System.out.println("kelondroRecords cache4" + filename + ": cache record size = " + (memBefore - Runtime.getRuntime().freeMemory()) + " bytes" + ((newentry) ? " new" : "")); - //printCache(); - } else { - // there shall be no entry in the cache. If one exists, we remove it - boolean rem = false; - rem = (cacheHeaders.removeb(this.handle.index) != null); - if (rem) cacheDelete++; - } - } - } - -} +// kelondroCachedRecords.java +// (C) 2003 - 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 2003 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +import de.anomic.server.serverMemory; + +public class kelondroCachedRecords extends kelondroAbstractRecords implements kelondroRecords { + + // memory calculation + private static final int element_in_cache = 4; // for kelondroCollectionObjectMap: 4; for HashMap: 52 + + // static supervision objects: recognize and coordinate all activites + private static TreeMap recordTracker = new TreeMap(); // a String/filename - kelondroTray mapping + private static long memStopGrow = 10000000; // a limit for the node cache to stop growing if less than this memory amount is available + private static long memStartShrink = 6000000; // a limit for the node cache to start with shrinking if less than this memory amount is available + + // caching buffer + private kelondroIntBytesMap cacheHeaders; // the cache; holds overhead values and key element + private int readHit, readMiss, writeUnique, writeDouble, cacheDelete, cacheFlush; + + + public kelondroCachedRecords( + File file, boolean useNodeCache, long preloadTime, + short ohbytec, short ohhandlec, + kelondroRow rowdef, int FHandles, int txtProps, int txtPropWidth) throws IOException { + super(file, useNodeCache, ohbytec, ohhandlec, rowdef, FHandles, txtProps, txtPropWidth); + initCache(useNodeCache, preloadTime); + if (useNodeCache) recordTracker.put(this.filename, this); + } + + public kelondroCachedRecords( + kelondroRA ra, String filename, boolean useNodeCache, long preloadTime, + short ohbytec, short ohhandlec, + kelondroRow rowdef, int FHandles, int txtProps, int txtPropWidth, + boolean exitOnFail) { + super(ra, filename, useNodeCache, ohbytec, ohhandlec, rowdef, FHandles, txtProps, txtPropWidth, exitOnFail); + initCache(useNodeCache, preloadTime); + if (useNodeCache) recordTracker.put(this.filename, this); + } + + public kelondroCachedRecords( + kelondroRA ra, String filename, boolean useNodeCache, long preloadTime) throws IOException{ + super(ra, filename, useNodeCache); + initCache(useNodeCache, preloadTime); + if (useNodeCache) recordTracker.put(this.filename, this); + } + + private void initCache(boolean useNodeCache, long preloadTime) { + if (useNodeCache) { + this.cacheHeaders = new kelondroIntBytesMap(this.headchunksize, 0); + } else { + this.cacheHeaders = null; + } + this.readHit = 0; + this.readMiss = 0; + this.writeUnique = 0; + this.writeDouble = 0; + this.cacheDelete = 0; + this.cacheFlush = 0; + // pre-load node cache + if ((preloadTime > 0) && (useNodeCache)) { + long stop = System.currentTimeMillis() + preloadTime; + int count = 0; + try { + Iterator i = contentNodes(preloadTime); + CacheNode n; + while ((System.currentTimeMillis() < stop) && (cacheGrowStatus() == 2) && (i.hasNext())) { + n = (CacheNode) i.next(); + cacheHeaders.addb(n.handle().index, n.headChunk); + count++; + } + cacheHeaders.flush(); + logFine("preloaded " + count + " records into cache"); + } catch (kelondroException e) { + // the contentNodes iterator had a time-out; we don't do a preload + logFine("could not preload records: " + e.getMessage()); + } + + } + } + + private int cacheGrowStatus() { + long available = serverMemory.available(); + if ((cacheHeaders != 
null) && (available < cacheHeaders.memoryNeededForGrow())) return 0; + return cacheGrowStatus(available, memStopGrow, memStartShrink); + } + + public static final int cacheGrowStatus(long available, long stopGrow, long startShrink) { + // returns either 0, 1 or 2: + // 0: cache is not allowed to grow, but shall shrink + // 1: cache is allowed to grow, but need not to shrink + // 2: cache is allowed to grow and must not shrink + if (available > stopGrow) return 2; + if (available > startShrink) { + serverMemory.gc(30000, "kelendroCacheRecords.cacheGrowStatus(...) 1"); // thq + return 1; + } + serverMemory.gc(3000, "kelendroCacheRecords.cacheGrowStatus(...) 0"); // thq + return 0; + } + + public static void setCacheGrowStati(long memStopGrowNew, long memStartShrinkNew) { + memStopGrow = memStopGrowNew; + memStartShrink = memStartShrinkNew; + } + + public static long getMemStopGrow() { + return memStopGrow ; + } + + public static long getMemStartShrink() { + return memStartShrink ; + } + + public static final Iterator filenames() { + // iterates string objects; all file names from record tracker + return recordTracker.keySet().iterator(); + } + + public static final Map memoryStats(String filename) { + // returns a map for each file in the tracker; + // the map represents properties for each record oobjects, + // i.e. for cache memory allocation + kelondroCachedRecords theRecord = (kelondroCachedRecords) recordTracker.get(filename); + return theRecord.memoryStats(); + } + + private final Map memoryStats() { + // returns statistical data about this object + if (cacheHeaders == null) return null; + HashMap map = new HashMap(); + map.put("nodeChunkSize", Integer.toString(this.headchunksize + element_in_cache)); + map.put("nodeCacheCount", Integer.toString(cacheHeaders.size())); + map.put("nodeCacheMem", Integer.toString(cacheHeaders.size() * (this.headchunksize + element_in_cache))); + map.put("nodeCacheReadHit", Integer.toString(readHit)); + map.put("nodeCacheReadMiss", Integer.toString(readMiss)); + map.put("nodeCacheWriteUnique", Integer.toString(writeUnique)); + map.put("nodeCacheWriteDouble", Integer.toString(writeDouble)); + map.put("nodeCacheDeletes", Integer.toString(cacheDelete)); + map.put("nodeCacheFlushes", Integer.toString(cacheFlush)); + return map; + } + + protected synchronized void deleteNode(kelondroHandle handle) throws IOException { + if (cacheHeaders == null) { + super.deleteNode(handle); + } else synchronized (cacheHeaders) { + if (cacheHeaders.size() == 0) { + super.deleteNode(handle); + } else { + cacheHeaders.removeb(handle.index); + cacheDelete++; + super.deleteNode(handle); + } + } + } + + protected void printCache() { + if (cacheHeaders == null) { + System.out.println("### file report: " + size() + " entries"); + for (int i = 0; i < USAGE.allCount(); i++) { + // print from file to compare + System.out.print("#F " + i + ": "); + try { + for (int j = 0; j < headchunksize; j++) + System.out.print(Integer.toHexString(0xff & entryFile.readByte(j + seekpos(new kelondroHandle(i)))) + " "); + } catch (IOException e) {} + + System.out.println(); + } + } else { + System.out.println("### cache report: " + cacheHeaders.size() + " entries"); + + Iterator i = cacheHeaders.rows(); + kelondroRow.Entry entry; + while (i.hasNext()) { + entry = (kelondroRow.Entry) i.next(); + + // print from cache + System.out.print("#C "); + printChunk(entry); + System.out.println(); + + // print from file to compare + /* + System.out.print("#F " + cp + " " + ((Handle) entry.getKey()).index + ": "); + 
try { + for (int j = 0; j < headchunksize; j++) + System.out.print(entryFile.readByte(j + seekpos((Handle) entry.getKey())) + ","); + } catch (IOException e) {} + */ + System.out.println(); + } + } + System.out.println("### end report"); + } + + public synchronized void close() { + if (cacheHeaders == null) { + if (recordTracker.get(this.filename) != null) { + theLogger.severe("close(): file '" + this.filename + "' was tracked with record tracker, but it should not."); + } + } else { + if (recordTracker.remove(this.filename) == null) { + theLogger.severe("close(): file '" + this.filename + "' was not tracked with record tracker."); + } + } + super.close(); + this.cacheHeaders = null; + } + + public kelondroProfile[] profiles() { + return new kelondroProfile[]{ + (cacheHeaders == null) ? new kelondroProfile() : + cacheHeaders.profile(), + entryFile.profile() + }; + } + + public kelondroProfile profile() { + return kelondroProfile.consolidate(profiles()); + } + + public void print() throws IOException { + super.print(); + + // print also all records + System.out.println("CACHE"); + printCache(); + System.out.println("--"); + System.out.println("NODES"); + Iterator i = new contentNodeIterator(-1); + kelondroNode n; + while (i.hasNext()) { + n = (kelondroNode) i.next(); + System.out.println("NODE: " + n.toString()); + } + } + + public kelondroNode newNode(kelondroHandle handle, byte[] bulk, int offset) throws IOException { + return new CacheNode(handle, bulk, offset); + } + + public final class CacheNode implements kelondroNode { + // an Node holds all information of one row of data. This includes the key to the entry + // which is stored as entry element at position 0 + // an Node object can be created in two ways: + // 1. instantiation with an index number. After creation the Object does not hold any + // value information until such is retrieved using the getValue() method + // 2. instantiation with a value array. the values are not directly written into the + // file. Expanding the tree structure is then done using the save() method. at any + // time it is possible to verify the save state using the saved() predicate. + // Therefore an entry object has three modes: + // a: holding an index information only (saved() = true) + // b: holding value information only (saved() = false) + // c: holding index and value information at the same time (saved() = true) + // which can be the result of one of the two processes as follow: + // (i) created with index and after using the getValue() method, or + // (ii) created with values and after calling the save() method + // the method will therefore throw an IllegalStateException when the following + // process step is performed: + // - create the Node with index and call then the save() method + // this case can be decided with + // ((index != NUL) && (values == null)) + // The save() method represents the insert function for the tree. Balancing functions + // are applied automatically. While balancing, the Node does never change its index key, + // but its parent/child keys. 
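// Illustration, not part of this patch: the fields just below implement the head/tail
// split used throughout CacheNode. headChunk holds the overhead bytes, overhead handles
// and the key (column 0) and is what the node cache stores; tailChunk holds all remaining
// columns and is loaded lazily in getValueRow(). Sizes here are assumed example values.
int overhead = 8;                                // OHBYTEC + 4 * OHHANDLEC in the real class
int keyWidth = 12;                               // ROW.width(0)
int valueWidth = 20;                             // sum of the non-key column widths
int headchunksize = overhead + keyWidth;         // 20 bytes cached per node
int tailchunksize = valueWidth;                  // 20 bytes read on demand
int recordsize = headchunksize + tailchunksize;  // 40 bytes per record on disk

byte[] record = new byte[recordsize];            // as filled by entryFile.readFully(...)
byte[] head = new byte[headchunksize];
byte[] tail = new byte[tailchunksize];
System.arraycopy(record, 0, head, 0, headchunksize);
System.arraycopy(record, headchunksize, tail, 0, tailchunksize);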
+ //private byte[] ohBytes = null; // the overhead bytes, OHBYTEC values + //private Handle[] ohHandle= null; // the overhead handles, OHHANDLEC values + //private byte[][] values = null; // an array of byte[] nodes is the value vector + private kelondroHandle handle = null; // index of the entry, by default NUL means undefined + private byte[] headChunk = null; // contains ohBytes, ohHandles and the key value + private byte[] tailChunk = null; // contains all values except the key value + private boolean headChanged = false; + private boolean tailChanged = false; + + public CacheNode(byte[] rowinstance) throws IOException { + // this initializer is used to create nodes from bulk-read byte arrays + assert ((rowinstance == null) || (rowinstance.length == ROW.objectsize)) : "bulkchunk.length = " + rowinstance.length + ", ROW.width(0) = " + ROW.width(0); + this.handle = new kelondroHandle(USAGE.allocatePayload(rowinstance)); + + // create empty chunks + this.headChunk = new byte[headchunksize]; + this.tailChunk = new byte[tailchunksize]; + + // write content to chunks + if (rowinstance == null) { + for (int i = headchunksize - 1; i >= 0; i--) this.headChunk[i] = (byte) 0xff; + for (int i = tailchunksize - 1; i >= 0; i--) this.tailChunk[i] = (byte) 0xff; + } else { + for (int i = overhead - 1; i >= 0; i--) this.headChunk[i] = (byte) 0xff; + System.arraycopy(rowinstance, 0, this.headChunk, overhead, ROW.width(0)); + System.arraycopy(rowinstance, ROW.width(0), this.tailChunk, 0, tailchunksize); + } + + if (cacheHeaders != null) synchronized (cacheHeaders) { + updateNodeCache(); + } + + // mark chunks as changed + // if the head/tail chunks come from a file system read, setChanged should be false + // if the chunks come from a overwrite attempt, it should be true + this.headChanged = false; // we wrote the head already during allocate + this.tailChanged = false; // we write the tail already during allocate + } + + public CacheNode(kelondroHandle handle, byte[] bulkchunk, int offset) throws IOException { + // this initializer is used to create nodes from bulk-read byte arrays + // if write is true, then the chunk in bulkchunk is written to the file + // othervise it is considered equal to what is stored in the file + // (that is ensured during pre-loaded enumeration) + this.handle = handle; + boolean changed; + if (handle.index >= USAGE.allCount()) { + // this causes only a write action if we create a node beyond the end of the file + USAGE.allocateRecord(handle.index, bulkchunk, offset); + changed = false; // we have already wrote the record, so it is considered as unchanged + } else { + changed = true; + } + assert ((bulkchunk == null) || (bulkchunk.length - offset >= recordsize)) : "bulkchunk.length = " + bulkchunk.length + ", offset = " + offset + ", recordsize = " + recordsize; + + // create empty chunks + this.headChunk = new byte[headchunksize]; + this.tailChunk = new byte[tailchunksize]; + + // write content to chunks + if (bulkchunk != null) { + System.arraycopy(bulkchunk, offset, this.headChunk, 0, headchunksize); + System.arraycopy(bulkchunk, offset + headchunksize, this.tailChunk, 0, tailchunksize); + } + + // mark chunks as changed + this.headChanged = changed; + this.tailChanged = changed; + } + + public CacheNode(kelondroHandle handle, boolean fillTail) throws IOException { + this(handle, null, 0, fillTail); + } + + public CacheNode(kelondroHandle handle, CacheNode parentNode, int referenceInParent, boolean fillTail) throws IOException { + // this creates an entry with an 
pre-reserved entry position. + // values can be written using the setValues() method, + // but we expect that values are already there in the file. + assert (handle != null): "node handle is null"; + assert (handle.index >= 0): "node handle too low: " + handle.index; + //assert (handle.index < USAGE.allCount()) : "node handle too high: " + handle.index + ", USEDC=" + USAGE.USEDC + ", FREEC=" + USAGE.FREEC; + + // the parentNode can be given if an auto-fix in the following case is wanted + if (handle == null) throw new kelondroException(filename, "INTERNAL ERROR: node handle is null."); + if (handle.index >= USAGE.allCount()) { + if (parentNode == null) throw new kelondroException(filename, "INTERNAL ERROR, Node/init: node handle index " + handle.index + " exceeds size. No auto-fix node was submitted. This is a serious failure."); + try { + parentNode.setOHHandle(referenceInParent, null); + parentNode.commit(); + logWarning("INTERNAL ERROR, Node/init in " + filename + ": node handle index " + handle.index + " exceeds size. The bad node has been auto-fixed"); + } catch (IOException ee) { + throw new kelondroException(filename, "INTERNAL ERROR, Node/init: node handle index " + handle.index + " exceeds size. It was tried to fix the bad node, but failed with an IOException: " + ee.getMessage()); + } + } + + // use given handle + this.handle = new kelondroHandle(handle.index); + + // check for memory availability when fillTail is requested + if ((fillTail) && (tailchunksize > 10000)) fillTail = false; // this is a fail-safe 'short version' of a memory check + + // init the content + // create chunks; read them from file or cache + this.tailChunk = null; + if (cacheHeaders == null) { + if (fillTail) { + // read complete record + byte[] chunkbuffer = new byte[recordsize]; + entryFile.readFully(seekpos(this.handle), chunkbuffer, 0, recordsize); + this.headChunk = new byte[headchunksize]; + this.tailChunk = new byte[tailchunksize]; + System.arraycopy(chunkbuffer, 0, this.headChunk, 0, headchunksize); + System.arraycopy(chunkbuffer, headchunksize, this.tailChunk, 0, tailchunksize); + chunkbuffer = null; + } else { + // read overhead and key + this.headChunk = new byte[headchunksize]; + this.tailChunk = null; + entryFile.readFully(seekpos(this.handle), this.headChunk, 0, headchunksize); + } + } else synchronized(cacheHeaders) { + byte[] cacheEntry = null; + cacheEntry = cacheHeaders.getb(this.handle.index); + if (cacheEntry == null) { + // cache miss, we read overhead and key from file + readMiss++; + if (fillTail) { + // read complete record + byte[] chunkbuffer = new byte[recordsize]; + entryFile.readFully(seekpos(this.handle), chunkbuffer, 0, recordsize); + this.headChunk = new byte[headchunksize]; + this.tailChunk = new byte[tailchunksize]; + System.arraycopy(chunkbuffer, 0, this.headChunk, 0, headchunksize); + System.arraycopy(chunkbuffer, headchunksize, this.tailChunk, 0, tailchunksize); + chunkbuffer = null; + } else { + // read overhead and key + this.headChunk = new byte[headchunksize]; + this.tailChunk = null; + entryFile.readFully(seekpos(this.handle), this.headChunk, 0, headchunksize); + } + + // if space left in cache, copy these value to the cache + updateNodeCache(); + } else { + readHit++; + this.headChunk = cacheEntry; + } + } + } + + private void setValue(byte[] value, int valueoffset, int valuewidth, byte[] targetarray, int targetoffset) { + if (value == null) { + while (valuewidth-- > 0) targetarray[targetoffset++] = 0; + } else { + assert ((valueoffset >= 0) && (valueoffset < 
value.length)) : "valueoffset = " + valueoffset; + assert ((valueoffset + valuewidth <= value.length)) : "valueoffset = " + valueoffset + ", valuewidth = " + valuewidth + ", value.length = " + value.length; + assert ((targetoffset >= 0) && (targetoffset < targetarray.length)) : "targetoffset = " + targetoffset; + assert ((targetoffset + valuewidth <= targetarray.length)) : "targetoffset = " + targetoffset + ", valuewidth = " + valuewidth + ", targetarray.length = " + targetarray.length; + System.arraycopy(value, valueoffset, targetarray, targetoffset, Math.min(value.length, valuewidth)); // error? + while (valuewidth-- > value.length) targetarray[targetoffset + valuewidth] = 0; + } + } + + public kelondroHandle handle() { + // if this entry has an index, return it + if (this.handle.index == kelondroHandle.NUL) throw new kelondroException(filename, "the entry has no index assigned"); + return this.handle; + } + + public void setOHByte(int i, byte b) { + if (i >= OHBYTEC) throw new IllegalArgumentException("setOHByte: wrong index " + i); + if (this.handle.index == kelondroHandle.NUL) throw new kelondroException(filename, "setOHByte: no handle assigned"); + this.headChunk[i] = b; + this.headChanged = true; + } + + public void setOHHandle(int i, kelondroHandle otherhandle) { + assert (i < OHHANDLEC): "setOHHandle: wrong array size " + i; + assert (this.handle.index != kelondroHandle.NUL): "setOHHandle: no handle assigned ind file" + filename; + if (otherhandle == null) { + NUL2bytes(this.headChunk, OHBYTEC + 4 * i); + } else { + if (otherhandle.index >= USAGE.allCount()) throw new kelondroException(filename, "INTERNAL ERROR, setOHHandles: handle " + i + " exceeds file size (" + handle.index + " >= " + USAGE.allCount() + ")"); + int2bytes(otherhandle.index, this.headChunk, OHBYTEC + 4 * i); + } + this.headChanged = true; + } + + public byte getOHByte(int i) { + if (i >= OHBYTEC) throw new IllegalArgumentException("getOHByte: wrong index " + i); + if (this.handle.index == kelondroHandle.NUL) throw new kelondroException(filename, "Cannot load OH values"); + return this.headChunk[i]; + } + + public kelondroHandle getOHHandle(int i) { + if (this.handle.index == kelondroHandle.NUL) throw new kelondroException(filename, "Cannot load OH values"); + assert (i < OHHANDLEC): "handle index out of bounds: " + i + " in file " + filename; + int h = bytes2int(this.headChunk, OHBYTEC + 4 * i); + return (h == kelondroHandle.NUL) ? 
null : new kelondroHandle(h); + } + + public synchronized void setValueRow(byte[] row) throws IOException { + // if the index is defined, then write values directly to the file, else only to the object + if ((row != null) && (row.length != ROW.objectsize())) throw new IOException("setValueRow with wrong (" + row.length + ") row length instead correct: " + ROW.objectsize()); + + // set values + if (this.handle.index != kelondroHandle.NUL) { + setValue(row, 0, ROW.width(0), headChunk, overhead); + if (ROW.columns() > 1) setValue(row, ROW.width(0), tailchunksize, tailChunk, 0); + } + this.headChanged = true; + this.tailChanged = true; + } + + public synchronized boolean valid() { + // returns true if the key starts with non-zero byte + // this may help to detect deleted entries + return (headChunk[overhead] != 0) && ((headChunk[overhead] != -128) || (headChunk[overhead + 1] != 0)); + } + + public synchronized byte[] getKey() { + // read key + return trimCopy(headChunk, overhead, ROW.width(0)); + } + + public synchronized byte[] getValueRow() throws IOException { + + if (this.tailChunk == null) { + // load all values from the database file + this.tailChunk = new byte[tailchunksize]; + // read values + entryFile.readFully(seekpos(this.handle) + (long) headchunksize, this.tailChunk, 0, this.tailChunk.length); + } + + // create return value + byte[] row = new byte[ROW.objectsize()]; + + // read key + System.arraycopy(headChunk, overhead, row, 0, ROW.width(0)); + + // read remaining values + System.arraycopy(tailChunk, 0, row, ROW.width(0), tailchunksize); + + return row; + } + + public synchronized void commit() throws IOException { + // this must be called after all write operations to the node are + // finished + + // place the data to the file + + if (this.headChunk == null) { + // there is nothing to save + throw new kelondroException(filename, "no values to save (header missing)"); + } + + boolean doCommit = this.headChanged || this.tailChanged; + + // save head + synchronized (entryFile) { + if (this.headChanged) { + //System.out.println("WRITEH(" + filename + ", " + seekpos(this.handle) + ", " + this.headChunk.length + ")"); + assert (headChunk == null) || (headChunk.length == headchunksize); + entryFile.write(seekpos(this.handle), (this.headChunk == null) ? new byte[headchunksize] : this.headChunk); + updateNodeCache(); + this.headChanged = false; + } + + // save tail + if ((this.tailChunk != null) && (this.tailChanged)) { + //System.out.println("WRITET(" + filename + ", " + (seekpos(this.handle) + headchunksize) + ", " + this.tailChunk.length + ")"); + assert (tailChunk == null) || (tailChunk.length == tailchunksize); + entryFile.write(seekpos(this.handle) + headchunksize, (this.tailChunk == null) ? new byte[tailchunksize] : this.tailChunk); + this.tailChanged = false; + } + + if (doCommit) entryFile.commit(); + } + } + + public String toString() { + if (this.handle.index == kelondroHandle.NUL) return "NULL"; + String s = Integer.toHexString(this.handle.index); + kelondroHandle h; + while (s.length() < 4) s = "0" + s; + try { + for (int i = 0; i < OHBYTEC; i++) s = s + ":b" + getOHByte(i); + for (int i = 0; i < OHHANDLEC; i++) { + h = getOHHandle(i); + if (h == null) s = s + ":hNULL"; else s = s + ":h" + h.toString(); + } + kelondroRow.Entry content = row().newEntry(getValueRow()); + for (int i = 0; i < row().columns(); i++) s = s + ":" + ((content.empty(i)) ? 
"NULL" : content.getColString(i, "UTF-8").trim()); + } catch (IOException e) { + s = s + ":***LOAD ERROR***:" + e.getMessage(); + } + return s; + } + + private boolean cacheSpace() { + // check for space in cache + // should be only called within a synchronized(cacheHeaders) environment + // returns true if it is allowed to add another entry to the cache + // returns false if the cache is considered to be full + if (cacheHeaders == null) return false; // no caching + if (cacheHeaders.size() == 0) return true; // nothing there to flush + if (cacheGrowStatus() == 2) return true; // no need to flush cache space + + // just delete any of the entries + if (cacheGrowStatus() <= 1) synchronized (cacheHeaders) { + cacheHeaders.removeoneb(); + cacheFlush++; + } + return cacheGrowStatus() > 0; + } + + private void updateNodeCache() { + if (this.handle == null) return; // wrong access + if (this.headChunk == null) return; // nothing there to cache + if (cacheHeaders == null) return; // we do not use the cache + if (cacheSpace()) synchronized (cacheHeaders) { + // generate cache entry + //byte[] cacheEntry = new byte[headchunksize]; + //System.arraycopy(headChunk, 0, cacheEntry, 0, headchunksize); + + // store the cache entry + boolean upd = false; + upd = (cacheHeaders.putb(this.handle.index, headChunk) != null); + if (upd) writeDouble++; else writeUnique++; + + //System.out.println("kelondroRecords cache4" + filename + ": cache record size = " + (memBefore - Runtime.getRuntime().freeMemory()) + " bytes" + ((newentry) ? " new" : "")); + //printCache(); + } else { + // there shall be no entry in the cache. If one exists, we remove it + boolean rem = false; + rem = (cacheHeaders.removeb(this.handle.index) != null); + if (rem) cacheDelete++; + } + } + } + +} diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index b404459f6..9bcfc3165 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -1,1059 +1,1063 @@ -package de.anomic.kelondro; - -// a collectionIndex is an index to kelondroRowCollection objects -// such a collection ist defined by the following parameters -// - chunksize -// - chunkcount -// each of such a collection is stored in a byte[] which may or may not have space for more chunks -// than already exists in such an array. To store these arrays, we reserve entries in kelondroArray -// database files. There will be a set of array files for different sizes of the collection arrays. -// the 1st file has space for chunks, the 2nd file for * chunks, -// the 3rd file for ^^3 chunks, and the n-th file for ^^n chunks. -// if the loadfactor is 4, then we have the following capacities: -// file 0: 4 -// file 1: 16 -// file 2: 64 -// file 3: 256 -// file 4: 1024 -// file 5: 4096 -// file 6:16384 -// file 7:65536 -// the maximum number of such files is called the partitions number. -// we don't want that these files grow too big, an kelondroOutOfLimitsException is throws if they -// are oversized. -// the collection arrays may be migration to another size during run-time, which means that not only the -// partitions as mentioned above are maintained, but also a set of "shadow-partitions", that represent old -// partitions and where data is read only and slowly migrated to the default partitions. 
- -import java.io.File; -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.GregorianCalendar; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.Set; -import java.util.TimeZone; -import java.util.TreeMap; - -import de.anomic.index.indexContainer; -import de.anomic.server.serverCodings; -import de.anomic.server.serverFileUtils; -import de.anomic.server.serverMemory; -import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacyURL; - -public class kelondroCollectionIndex { - - private static final int serialNumber = 0; - - private kelondroIndex index; - private int keylength; - private File path; - private String filenameStub; - private File commonsPath; - private int loadfactor; - private Map arrays; // Map of (partitionNumber"-"chunksize)/kelondroFixedWidthArray - Objects - private kelondroRow payloadrow; // definition of the payload (chunks inside the collections) - private int maxPartitions; // this is the maxmimum number of array files; yet not used - - private static final int idx_col_key = 0; // the index - private static final int idx_col_chunksize = 1; // chunksize (number of bytes in a single chunk, needed for migration option) - private static final int idx_col_chunkcount = 2; // chunkcount (number of chunks in this collection) - private static final int idx_col_clusteridx = 3; // selector for right cluster file, must be >= arrayIndex(chunkcount) - private static final int idx_col_flags = 4; // flags (for future use) - private static final int idx_col_indexpos = 5; // indexpos (position in array file) - private static final int idx_col_lastread = 6; // a time stamp, update time in days since 1.1.2000 - private static final int idx_col_lastwrote = 7; // a time stamp, update time in days since 1.1.2000 - - private static kelondroRow indexRow(int keylength, kelondroOrder payloadOrder) { - return new kelondroRow( - "byte[] key-" + keylength + "," + - "int chunksize-4 {b256}," + - "int chunkcount-4 {b256}," + - "byte clusteridx-1 {b256}," + - "byte flags-1 {b256}," + - "int indexpos-4 {b256}," + - "short lastread-2 {b256}, " + - "short lastwrote-2 {b256}", - payloadOrder, 0 - ); - } - - public kelondroRow payloadRow() { - return this.payloadrow; - } - - private static String fillZ(String s, int len) { - while (s.length() < len) s = "0" + s; - return s; - } - - private static File arrayFile(File path, String filenameStub, int loadfactor, int chunksize, int partitionNumber, int serialNumber) { - String lf = fillZ(Integer.toHexString(loadfactor).toUpperCase(), 2); - String cs = fillZ(Integer.toHexString(chunksize).toUpperCase(), 4); - String pn = fillZ(Integer.toHexString(partitionNumber).toUpperCase(), 2); - String sn = fillZ(Integer.toHexString(serialNumber).toUpperCase(), 2); - return new File(path, filenameStub + "." + lf + "." + cs + "." + pn + "." + sn + ".kca"); // kelondro collection array - } - - private static File propertyFile(File path, String filenameStub, int loadfactor, int chunksize) { - String lf = fillZ(Integer.toHexString(loadfactor).toUpperCase(), 2); - String cs = fillZ(Integer.toHexString(chunksize).toUpperCase(), 4); - return new File(path, filenameStub + "." + lf + "." 
+ cs + ".properties"); - } - - public kelondroCollectionIndex(File path, String filenameStub, int keyLength, kelondroOrder indexOrder, - long preloadTime, int loadfactor, int maxpartitions, kelondroRow rowdef) throws IOException { - // the buffersize is number of bytes that are only used if the kelondroFlexTable is backed up with a kelondroTree - this.path = path; - this.filenameStub = filenameStub; - this.keylength = keyLength; - this.payloadrow = rowdef; - this.loadfactor = loadfactor; - this.maxPartitions = maxpartitions; - this.commonsPath = new File(path, filenameStub + "." + fillZ(Integer.toHexString(rowdef.objectsize).toUpperCase(), 4) + ".commons"); - this.commonsPath.mkdirs(); - - boolean ramIndexGeneration = false; - boolean fileIndexGeneration = !(new File(path, filenameStub + ".index").exists()); - if (ramIndexGeneration) index = new kelondroRowSet(indexRow(keyLength, indexOrder), 0); - if (fileIndexGeneration) index = new kelondroFlexTable(path, filenameStub + ".index", preloadTime, indexRow(keyLength, indexOrder), true); - - // open array files - this.arrays = new HashMap(); // all entries will be dynamically created with getArray() - if (((fileIndexGeneration) || (ramIndexGeneration))) { - serverLog.logFine("STARTUP", "STARTED INITIALIZATION OF NEW COLLECTION INDEX. THIS WILL TAKE SOME TIME"); - openAllArrayFiles(((fileIndexGeneration) || (ramIndexGeneration)), indexOrder); - } - - // open/create index table - if (index == null) index = openIndexFile(path, filenameStub, indexOrder, preloadTime, loadfactor, rowdef); - } - - private void openAllArrayFiles(boolean indexGeneration, kelondroOrder indexOrder) throws IOException { - String[] list = this.path.list(); - kelondroFixedWidthArray array; - - kelondroRow irow = indexRow(keylength, indexOrder); - int t = kelondroRowCollection.daysSince2000(System.currentTimeMillis()); - for (int i = 0; i < list.length; i++) if (list[i].endsWith(".kca")) { - - // open array - int pos = list[i].indexOf('.'); - if (pos < 0) continue; - int chunksize = Integer.parseInt(list[i].substring(pos + 4, pos + 8), 16); - int partitionNumber = Integer.parseInt(list[i].substring(pos + 9, pos + 11), 16); - int serialNumber = Integer.parseInt(list[i].substring(pos + 12, pos + 14), 16); - try { - array = openArrayFile(partitionNumber, serialNumber, true); - } catch (IOException e) { - e.printStackTrace(); - continue; - } - - // remember that we opened the array - arrays.put(partitionNumber + "-" + chunksize, array); - - if ((index != null) && (indexGeneration)) { - // loop over all elements in array and create index entry for each row - kelondroRow.EntryIndex aentry; - kelondroRow.Entry ientry; - Iterator ei = array.contentRows(-1); - byte[] key; - long start = System.currentTimeMillis(); - long lastlog = start; - int count = 0; - while (ei.hasNext()) { - aentry = (kelondroRow.EntryIndex) ei.next(); - key = aentry.getColBytes(0); - assert (key != null); - if (key == null) continue; // skip deleted entries - ientry = irow.newEntry(); - ientry.setCol(idx_col_key, key); - ientry.setCol(idx_col_chunksize, chunksize); - ientry.setCol(idx_col_chunkcount, kelondroRowCollection.sizeOfExportedCollectionRows(aentry, 1)); - ientry.setCol(idx_col_clusteridx, (byte) partitionNumber); - ientry.setCol(idx_col_flags, (byte) 0); - ientry.setCol(idx_col_indexpos, aentry.index()); - ientry.setCol(idx_col_lastread, t); - ientry.setCol(idx_col_lastwrote, t); - index.addUnique(ientry); // FIXME: this should avoid doubles - count++; - - // write a log - if 
(System.currentTimeMillis() - lastlog > 30000) { - serverLog.logFine("STARTUP", "created " + count + " RWI index entries. " + (((System.currentTimeMillis() - start) * (array.size() + array.free() - count) / count) / 60000) + " minutes remaining for this array"); - lastlog = System.currentTimeMillis(); - } - } - } - } - } - - private kelondroIndex openIndexFile(File path, String filenameStub, kelondroOrder indexOrder, - long preloadTime, int loadfactor, kelondroRow rowdef) throws IOException { - // open/create index table - kelondroIndex theindex = new kelondroCache(new kelondroFlexTable(path, filenameStub + ".index", preloadTime, indexRow(keylength, indexOrder), true), true, false); - //kelondroIndex theindex = new kelondroFlexTable(path, filenameStub + ".index", preloadTime, indexRow(keylength, indexOrder), true); - - // save/check property file for this array - File propfile = propertyFile(path, filenameStub, loadfactor, rowdef.objectsize()); - Map props = new HashMap(); - if (propfile.exists()) { - props = serverFileUtils.loadHashMap(propfile); - String stored_rowdef = (String) props.get("rowdef"); - if ((stored_rowdef == null) || (!(rowdef.subsumes(new kelondroRow(stored_rowdef, rowdef.objectOrder, 0))))) { - System.out.println("FATAL ERROR: stored rowdef '" + stored_rowdef + "' does not match with new rowdef '" + - rowdef + "' for array cluster '" + path + "/" + filenameStub + "'"); - System.exit(-1); - } - } - props.put("rowdef", rowdef.toString()); - serverFileUtils.saveMap(propfile, props, "CollectionIndex properties"); - - return theindex; - } - - private kelondroFixedWidthArray openArrayFile(int partitionNumber, int serialNumber, boolean create) throws IOException { - File f = arrayFile(path, filenameStub, loadfactor, payloadrow.objectsize(), partitionNumber, serialNumber); - int load = arrayCapacity(partitionNumber); - kelondroRow rowdef = new kelondroRow( - "byte[] key-" + keylength + "," + - "byte[] collection-" + (kelondroRowCollection.exportOverheadSize + load * this.payloadrow.objectsize()), - index.row().objectOrder, - 0 - ); - if ((!(f.exists())) && (!create)) return null; - kelondroFixedWidthArray a = new kelondroFixedWidthArray(f, rowdef, 0); - serverLog.logFine("STARTUP", "opened array file " + f + " with " + a.size() + " RWIs"); - return a; - } - - private kelondroFixedWidthArray getArray(int partitionNumber, int serialNumber, int chunksize) { - String accessKey = partitionNumber + "-" + chunksize; - kelondroFixedWidthArray array = (kelondroFixedWidthArray) arrays.get(accessKey); - if (array != null) return array; - try { - array = openArrayFile(partitionNumber, serialNumber, true); - } catch (IOException e) { - return null; - } - arrays.put(accessKey, array); - return array; - } - - private int arrayCapacity(int arrayCounter) { - if (arrayCounter < 0) return 0; - int load = this.loadfactor; - for (int i = 0; i < arrayCounter; i++) load = load * this.loadfactor; - return load; - } - - private int arrayIndex(int requestedCapacity) throws kelondroOutOfLimitsException{ - // the requestedCapacity is the number of wanted chunks - int load = 1, i = 0; - while (true) { - load = load * this.loadfactor; - if (load >= requestedCapacity) return i; - i++; - } - } - - public int size() { - return index.size(); - } - - public int minMem() { - // calculate a minimum amount of memory that is necessary to use the collection - // during runtime (after the index was initialized) - - // caclculate an upper limit (not the correct size) of the maximum number of indexes for a wordHash - // 
this is computed by the size of the biggest used collection - // this must be multiplied with the payload size - // and doubled for necessary memory transformation during sort operation - return (int) (arrayCapacity(arrays.size() - 1) * this.payloadrow.objectsize * kelondroRowSet.growfactor); - } - - private void array_remove( - int oldPartitionNumber, int serialNumber, int chunkSize, - int oldRownumber) throws IOException { - // we need a new slot, that means we must first delete the old entry - // find array file - kelondroFixedWidthArray array = getArray(oldPartitionNumber, serialNumber, chunkSize); - - // delete old entry - array.remove(oldRownumber); - } - - private kelondroRow.Entry array_new( - byte[] key, kelondroRowCollection collection) throws IOException { - // the collection is new - int partitionNumber = arrayIndex(collection.size()); - kelondroRow.Entry indexrow = index.row().newEntry(); - kelondroFixedWidthArray array = getArray(partitionNumber, serialNumber, this.payloadrow.objectsize()); - - // define row - kelondroRow.Entry arrayEntry = array.row().newEntry(); - arrayEntry.setCol(0, key); - arrayEntry.setCol(1, collection.exportCollection()); - - // write a new entry in this array - int newRowNumber = array.add(arrayEntry); - - // store the new row number in the index - indexrow.setCol(idx_col_key, key); - indexrow.setCol(idx_col_chunksize, this.payloadrow.objectsize()); - indexrow.setCol(idx_col_chunkcount, collection.size()); - indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); - indexrow.setCol(idx_col_flags, (byte) 0); - indexrow.setCol(idx_col_indexpos, (long) newRowNumber); - indexrow.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); - indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); - - // after calling this method there must be an index.addUnique(indexrow); - return indexrow; - } - - private void array_add( - byte[] key, kelondroRowCollection collection, kelondroRow.Entry indexrow, - int partitionNumber, int serialNumber, int chunkSize) throws IOException { - - // write a new entry in the other array - kelondroFixedWidthArray array = getArray(partitionNumber, serialNumber, chunkSize); - - // define new row - kelondroRow.Entry arrayEntry = array.row().newEntry(); - arrayEntry.setCol(0, key); - arrayEntry.setCol(1, collection.exportCollection()); - - // write a new entry in this array - int rowNumber = array.add(arrayEntry); - - // store the new row number in the index - indexrow.setCol(idx_col_chunkcount, collection.size()); - indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); - indexrow.setCol(idx_col_indexpos, (long) rowNumber); - indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); - - // after calling this method there must be a index.put(indexrow); - } - - private ArrayList array_add_multiple(TreeMap array_add_map, int serialNumber, int chunkSize) throws IOException { - // returns a List of kelondroRow.Entry entries for indexrow storage - Map.Entry entry; - Iterator i = array_add_map.entrySet().iterator(); - Iterator j; - ArrayList actionList; - int partitionNumber; - kelondroFixedWidthArray array; - Object[] objs; - byte[] key; - kelondroRowCollection collection; - kelondroRow.Entry indexrow; - ArrayList indexrows = new ArrayList(); - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - actionList = (ArrayList) entry.getValue(); - partitionNumber = ((Integer) entry.getKey()).intValue(); - array = 
getArray(partitionNumber, serialNumber, chunkSize); - - j = actionList.iterator(); - while (j.hasNext()) { - objs = (Object[]) j.next(); - key = (byte[]) objs[0]; - collection = (kelondroRowCollection) objs[1]; - indexrow = (kelondroRow.Entry) objs[2]; - - // define new row - kelondroRow.Entry arrayEntry = array.row().newEntry(); - arrayEntry.setCol(0, key); - arrayEntry.setCol(1, collection.exportCollection()); - - // write a new entry in this array - int rowNumber = array.add(arrayEntry); - - // store the new row number in the index - indexrow.setCol(idx_col_chunkcount, collection.size()); - indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); - indexrow.setCol(idx_col_indexpos, (long) rowNumber); - indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); - indexrows.add(indexrow); - } - } - // after calling this method there must be a index.put(indexrow); - return indexrows; - } - - private void array_replace( - byte[] key, kelondroRowCollection collection, kelondroRow.Entry indexrow, - int partitionNumber, int serialNumber, int chunkSize, - int rowNumber) throws IOException { - // we don't need a new slot, just write collection into the old one - - // find array file - kelondroFixedWidthArray array = getArray(partitionNumber, serialNumber, chunkSize); - - // define new row - kelondroRow.Entry arrayEntry = array.row().newEntry(); - arrayEntry.setCol(0, key); - arrayEntry.setCol(1, collection.exportCollection()); - - // overwrite entry in this array - array.set(rowNumber, arrayEntry); - - // update the index entry - final int collectionsize = collection.size(); // extra variable for easier debugging - indexrow.setCol(idx_col_chunkcount, collectionsize); - indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); - indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); - - // after calling this method there must be a index.put(indexrow); - } - - private ArrayList array_replace_multiple(TreeMap array_replace_map, int serialNumber, int chunkSize) throws IOException { - Map.Entry entry, e; - Iterator i = array_replace_map.entrySet().iterator(); - Iterator j; - TreeMap actionMap; - int partitionNumber; - kelondroFixedWidthArray array; - ArrayList indexrows = new ArrayList(); - Object[] objs; - int rowNumber; - byte[] key; - kelondroRowCollection collection; - kelondroRow.Entry indexrow; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - actionMap = (TreeMap) entry.getValue(); - partitionNumber = ((Integer) entry.getKey()).intValue(); - array = getArray(partitionNumber, serialNumber, chunkSize); - - j = actionMap.entrySet().iterator(); - while (j.hasNext()) { - e = (Map.Entry) j.next(); - rowNumber = ((Integer) e.getKey()).intValue(); - objs = (Object[]) e.getValue(); - key = (byte[]) objs[0]; - collection = (kelondroRowCollection) objs[1]; - indexrow = (kelondroRow.Entry) objs[2]; - - // define new row - kelondroRow.Entry arrayEntry = array.row().newEntry(); - arrayEntry.setCol(0, key); - arrayEntry.setCol(1, collection.exportCollection()); - - // overwrite entry in this array - array.set(rowNumber, arrayEntry); - - // update the index entry - indexrow.setCol(idx_col_chunkcount, collection.size()); - indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); - indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); - indexrows.add(indexrow); - } - } - // after calling this method there mus be a index.put(indexrow); - return indexrows; - } - 
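// Editor's note: array_add_multiple and array_replace_multiple above implement a
// bulk-write pattern: pending writes are first grouped per partition number in a
// TreeMap and then flushed partition by partition, so each array file is visited
// only once per batch (shorter R/W head path). A simplified, hypothetical
// illustration (names invented, strings stand in for rows):
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

class BulkWriteSketch {
    public static void main(String[] args) {
        TreeMap<Integer, List<String>> pending = new TreeMap<Integer, List<String>>();
        queue(pending, 2, "row-a");
        queue(pending, 0, "row-b");
        queue(pending, 2, "row-c");
        // flush in ascending partition order: file 0 is written first, then both rows of file 2
        for (Map.Entry<Integer, List<String>> e : pending.entrySet()) {
            System.out.println("open array file for partition " + e.getKey());
            for (String row : e.getValue()) System.out.println("  write " + row);
        }
    }
    private static void queue(TreeMap<Integer, List<String>> pending, int partition, String row) {
        List<String> rows = pending.get(partition);
        if (rows == null) { rows = new ArrayList<String>(); pending.put(partition, rows); }
        rows.add(row);
    }
}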
- public synchronized void put(byte[] key, kelondroRowCollection collection) throws IOException, kelondroOutOfLimitsException { - assert (key != null); - assert (collection != null); - assert (collection.size() != 0); - - // first find an old entry, if one exists - kelondroRow.Entry indexrow = index.get(key); - - if (indexrow == null) { - // create new row and index entry - if ((collection != null) && (collection.size() > 0)) { - indexrow = array_new(key, collection); // modifies indexrow - index.addUnique(indexrow); - } - return; - } - - // overwrite the old collection - // read old information - //int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration - int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number if rows in the collection - int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array - int oldPartitionNumber = (int) indexrow.getColByte(idx_col_clusteridx); // points to array file - assert (oldPartitionNumber >= arrayIndex(oldchunkcount)); - - int newPartitionNumber = arrayIndex(collection.size()); - - // see if we need new space or if we can overwrite the old space - if (oldPartitionNumber == newPartitionNumber) { - array_replace( - key, collection, indexrow, - oldPartitionNumber, serialNumber, this.payloadrow.objectsize(), - oldrownumber); // modifies indexrow - } else { - array_remove( - oldPartitionNumber, serialNumber, this.payloadrow.objectsize(), - oldrownumber); - array_add( - key, collection, indexrow, - newPartitionNumber, serialNumber, this.payloadrow.objectsize()); // modifies indexrow - } - - if ((int) indexrow.getColLong(idx_col_chunkcount) != collection.size()) - serverLog.logSevere("kelondroCollectionIndex", "UPDATE (put) ERROR: array has different chunkcount than index after merge: index = " + (int) indexrow.getColLong(idx_col_chunkcount) + ", collection.size() = " + collection.size()); - - index.put(indexrow); // write modified indexrow - } - - public synchronized void mergeMultiple(List /* of indexContainer */ containerList) throws IOException, kelondroOutOfLimitsException { - // merge a bulk of index containers - // this method should be used to optimize the R/W head path length - - // separate the list in two halves: - // - containers that do not exist yet in the collection - // - containers that do exist in the collection and must be merged - Iterator i = containerList.iterator(); - indexContainer container; - byte[] key; - ArrayList newContainer = new ArrayList(); - TreeMap existingContainer = new TreeMap(); // a mapping from Integer (partition) to a TreeMap (mapping from index to object triple) - TreeMap containerMap; // temporary map; mapping from index position to object triple with {key, container, indexrow} - kelondroRow.Entry indexrow; - int oldrownumber1; // index of the entry in array - int oldPartitionNumber1; // points to array file - while (i.hasNext()) { - container = (indexContainer) i.next(); - - if ((container == null) || (container.size() == 0)) continue; - key = container.getWordHash().getBytes(); - - // first find an old entry, if one exists - indexrow = index.get(key); - if (indexrow == null) { - newContainer.add(new Object[]{key, container}); - } else { - oldrownumber1 = (int) indexrow.getColLong(idx_col_indexpos); - oldPartitionNumber1 = (int) indexrow.getColByte(idx_col_clusteridx); - containerMap = (TreeMap) existingContainer.get(new Integer(oldPartitionNumber1)); - if (containerMap == null) containerMap = new TreeMap(); - 
containerMap.put(new Integer(oldrownumber1), new Object[]{key, container, indexrow}); - existingContainer.put(new Integer(oldPartitionNumber1), containerMap); - } - } - - // now iterate through the container lists and execute merges - // this is done in such a way, that there is a optimized path for the R/W head - - // merge existing containers - Map.Entry tripleEntry; - Object[] record; - ArrayList indexrows_existing = new ArrayList(); - kelondroRowCollection collection; - TreeMap array_replace_map = new TreeMap(); - TreeMap array_add_map = new TreeMap(); - ArrayList actionList; - TreeMap actionMap; - boolean madegc = false; - //System.out.println("DEBUG existingContainer: " + existingContainer.toString()); - while (existingContainer.size() > 0) { - oldPartitionNumber1 = ((Integer) existingContainer.lastKey()).intValue(); - containerMap = (TreeMap) existingContainer.remove(new Integer(oldPartitionNumber1)); - Iterator j = containerMap.entrySet().iterator(); - while (j.hasNext()) { - tripleEntry = (Map.Entry) j.next(); - oldrownumber1 = ((Integer) tripleEntry.getKey()).intValue(); - record = (Object[]) tripleEntry.getValue(); // {byte[], indexContainer, kelondroRow.Entry} - - // merge with the old collection - key = (byte[]) record[0]; - collection = (kelondroRowCollection) record[1]; - indexrow = (kelondroRow.Entry) record[2]; - - // read old information - int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration - int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number if rows in the collection - int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array - int oldPartitionNumber = (int) indexrow.getColByte(idx_col_clusteridx); // points to array file - assert oldPartitionNumber1 == oldPartitionNumber : "oldPartitionNumber1 = " + oldPartitionNumber1 + ", oldPartitionNumber = " + oldPartitionNumber + ", containerMap = " + containerMap + ", existingContainer: " + existingContainer.toString(); - assert oldrownumber1 == oldrownumber : "oldrownumber1 = " + oldrownumber1 + ", oldrownumber = " + oldrownumber + ", containerMap = " + containerMap + ", existingContainer: " + existingContainer.toString(); - assert (oldPartitionNumber >= arrayIndex(oldchunkcount)); - int oldSerialNumber = 0; - - // load the old collection and join it - collection.addAllUnique(getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false)); - collection.sort(); - collection.uniq(-1); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries - collection.trim(false); - - // check for size of collection: - // if necessary shrink the collection and dump a part of that collection - // to avoid that this grows too big - if (arrayIndex(collection.size()) > maxPartitions) { - shrinkCollection(key, collection, arrayCapacity(maxPartitions)); - } - - // determine new partition position - int newPartitionNumber = arrayIndex(collection.size()); - - // see if we need new space or if we can overwrite the old space - if (oldPartitionNumber == newPartitionNumber) { - actionMap = (TreeMap) array_replace_map.get(new Integer(oldPartitionNumber)); - if (actionMap == null) actionMap = new TreeMap(); - actionMap.put(new Integer(oldrownumber), new Object[]{key, collection, indexrow}); - array_replace_map.put(new Integer(oldPartitionNumber), actionMap); - /* - array_replace( - key, collection, indexrow, - oldPartitionNumber, oldSerialNumber, 
this.payloadrow.objectsize(), - oldrownumber); // modifies indexrow - indexrows_existing.add(indexrow); // indexrows are collected and written later as block - */ - } else { - array_remove( - oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize(), - oldrownumber); - - actionList = (ArrayList) array_add_map.get(new Integer(newPartitionNumber)); - if (actionList == null) actionList = new ArrayList(); - actionList.add(new Object[]{key, collection, indexrow}); - array_add_map.put(new Integer(newPartitionNumber), actionList); - /* - array_add( - key, collection, indexrow, - newPartitionNumber, oldSerialNumber, this.payloadrow.objectsize()); // modifies indexrow - indexrows_existing.add(indexrow); // indexrows are collected and written later as block - */ - } - - // memory protection: flush collected collections - if (serverMemory.available() < minMem()) { - // emergency flush - indexrows_existing.addAll(array_replace_multiple(array_replace_map, 0, this.payloadrow.objectsize())); - array_replace_map = new TreeMap(); // delete references - indexrows_existing.addAll(array_add_multiple(array_add_map, 0, this.payloadrow.objectsize())); - array_add_map = new TreeMap(); // delete references - if (!madegc) { - // prevent that this flush is made again even when there is enough memory - System.gc(); - // prevent that this gc happens more than one time - madegc = true; - } - } - } - } - - // finallly flush the collected collections - indexrows_existing.addAll(array_replace_multiple(array_replace_map, 0, this.payloadrow.objectsize())); - array_replace_map = new TreeMap(); // delete references - indexrows_existing.addAll(array_add_multiple(array_add_map, 0, this.payloadrow.objectsize())); - array_add_map = new TreeMap(); // delete references - - // write new containers - i = newContainer.iterator(); - ArrayList indexrows_new = new ArrayList(); - while (i.hasNext()) { - record = (Object[]) i.next(); // {byte[], indexContainer} - key = (byte[]) record[0]; - collection = (indexContainer) record[1]; - indexrow = array_new(key, collection); // modifies indexrow - indexrows_new.add(indexrow); // collect new index rows - } - - // write index entries - index.putMultiple(indexrows_existing); // write modified indexrows in optimized manner - index.addUniqueMultiple(indexrows_new); // write new indexrows in optimized manner - } - - public synchronized void merge(indexContainer container) throws IOException, kelondroOutOfLimitsException { - if ((container == null) || (container.size() == 0)) return; - byte[] key = container.getWordHash().getBytes(); - - // first find an old entry, if one exists - kelondroRow.Entry indexrow = index.get(key); - if (indexrow == null) { - indexrow = array_new(key, container); // modifies indexrow - index.addUnique(indexrow); // write modified indexrow - } else { - // merge with the old collection - // attention! this modifies the indexrow entry which must be written with index.put(indexrow) afterwards! 
- kelondroRowCollection collection = (kelondroRowCollection) container; - - // read old information - int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration - int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number if rows in the collection - int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array - int oldPartitionNumber = (int) indexrow.getColByte(idx_col_clusteridx); // points to array file - assert (oldPartitionNumber >= arrayIndex(oldchunkcount)) : "oldPartitionNumber = " + oldPartitionNumber + ", arrayIndex(oldchunkcount) = " + arrayIndex(oldchunkcount); - int oldSerialNumber = 0; - - // load the old collection and join it - collection.addAllUnique(getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false)); - collection.sort(); - collection.uniq(-1); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries - collection.trim(false); - - // check for size of collection: - // if necessary shrink the collection and dump a part of that collection - // to avoid that this grows too big - if (arrayIndex(collection.size()) > maxPartitions) { - shrinkCollection(key, collection, arrayCapacity(maxPartitions)); - } - - // determine new partition location - int newPartitionNumber = arrayIndex(collection.size()); - - // see if we need new space or if we can overwrite the old space - if (oldPartitionNumber == newPartitionNumber) { - array_replace( - key, collection, indexrow, - oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize(), - oldrownumber); // modifies indexrow - } else { - array_remove( - oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize(), - oldrownumber); - array_add( - key, collection, indexrow, - newPartitionNumber, oldSerialNumber, this.payloadrow.objectsize()); // modifies indexrow - } - - final int collectionsize = collection.size(); // extra variable for easier debugging - final int indexrowcount = (int) indexrow.getColLong(idx_col_chunkcount); - if (indexrowcount != collectionsize) - serverLog.logSevere("kelondroCollectionIndex", "UPDATE (merge) ERROR: array has different chunkcount than index after merge: index = " + indexrowcount + ", collection.size() = " + collectionsize); - - index.put(indexrow); // write modified indexrow - } - } - - private void shrinkCollection(byte[] key, kelondroRowCollection collection, int targetSize) { - //TODO Remove timing before release - // removes entries from collection - // the removed entries are stored in a 'commons' dump file - - if (key.length != 12) return; - // check if the collection is already small enough - int oldsize = collection.size(); - if (oldsize <= targetSize) return; - kelondroRowSet newcommon = new kelondroRowSet(collection.rowdef, 0); - long sadd1 = 0, srem1 = 0, sadd2 = 0, srem2 = 0, tot1 = 0, tot2 = 0; - long t1 = 0, t2 = 0; - - // delete some entries, which are bad rated - Iterator i = collection.rows(); - kelondroRow.Entry entry; - byte[] ref; - t1 = System.currentTimeMillis(); - while (i.hasNext()) { - entry = (kelondroRow.Entry) i.next(); - ref = entry.getColBytes(0); - if ((ref.length != 12) || (!yacyURL.probablyRootURL(new String(ref)))) { - t2 = System.currentTimeMillis(); - newcommon.addUnique(entry); - sadd1 += System.currentTimeMillis() - t2; - t2 = System.currentTimeMillis(); - i.remove(); - srem1 += System.currentTimeMillis() - t2; - } - } - int firstnewcommon = newcommon.size(); - tot1 = 
System.currentTimeMillis() - t1; - - // check if we shrinked enough - Random rand = new Random(System.currentTimeMillis()); - t1 = System.currentTimeMillis(); - while (collection.size() > targetSize) { - // now delete randomly more entries from the survival collection - i = collection.rows(); - while (i.hasNext()) { - entry = (kelondroRow.Entry) i.next(); - ref = entry.getColBytes(0); - if (rand.nextInt() % 4 != 0) { - t2 = System.currentTimeMillis(); - newcommon.addUnique(entry); - sadd2 += System.currentTimeMillis() - t2; - t2 = System.currentTimeMillis(); - i.remove(); - srem2 += System.currentTimeMillis() - t2; - } - } - } - tot2 = System.currentTimeMillis() - t1; - collection.trim(false); - - serverLog.logFine("kelondroCollectionIndex", "tot= "+tot1+'/'+tot2+" # add/rem(1)= "+sadd1+'/'+srem1+" # add/rem(2)= "+sadd2+'/'+srem2); - serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon); - - // finally dump the removed entries to a file - newcommon.sort(); - TimeZone GMTTimeZone = TimeZone.getTimeZone("GMT"); - Calendar gregorian = new GregorianCalendar(GMTTimeZone); - SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss"); - String filename = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(new String(key))) + "_" + formatter.format(gregorian.getTime()) + ".collection"; - File storagePath = new File(commonsPath, filename.substring(0, 2)); // make a subpath - storagePath.mkdirs(); - File file = new File(storagePath, filename); - try { - newcommon.saveCollection(file); - serverLog.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size()); - } catch (IOException e) { - e.printStackTrace(); - serverLog.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size()); - } - - } - - public synchronized int remove(byte[] key, Set removekeys) throws IOException, kelondroOutOfLimitsException { - - if ((removekeys == null) || (removekeys.size() == 0)) return 0; - - // first find an old entry, if one exists - kelondroRow.Entry indexrow = index.get(key); - - if (indexrow == null) return 0; - - // overwrite the old collection - // read old information - int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration - int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number if rows in the collection - int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array - int oldPartitionNumber = (int) indexrow.getColByte(idx_col_clusteridx); // points to array file - assert (oldPartitionNumber >= arrayIndex(oldchunkcount)); - - int removed = 0; - assert (removekeys != null); - // load the old collection and remove keys - kelondroRowSet oldcollection = getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, serialNumber, false); - - // remove the keys from the set - Iterator i = removekeys.iterator(); - Object k; - while (i.hasNext()) { - k = i.next(); - if ((k instanceof byte[]) && (oldcollection.remove((byte[]) k, false) != null)) removed++; - if ((k instanceof String) && (oldcollection.remove(((String) k).getBytes(), false) != null)) removed++; - } - oldcollection.sort(); - 
oldcollection.trim(false); - - /* in case that the new array size is zero we dont delete the array, just allocate a minimal chunk - * - - if (oldcollection.size() == 0) { - // delete the index entry and the array - kelondroFixedWidthArray array = getArray(oldPartitionNumber, serialNumber, oldchunksize); - array.remove(oldrownumber, false); - index.remove(key); - return removed; - } - */ - int newPartitionNumber = arrayIndex(oldcollection.size()); - - // see if we need new space or if we can overwrite the old space - if (oldPartitionNumber == newPartitionNumber) { - array_replace( - key, oldcollection, indexrow, - oldPartitionNumber, serialNumber, this.payloadrow.objectsize(), - oldrownumber); // modifies indexrow - } else { - array_remove( - oldPartitionNumber, serialNumber, this.payloadrow.objectsize(), - oldrownumber); - array_add( - key, oldcollection, indexrow, - newPartitionNumber, serialNumber, this.payloadrow.objectsize()); // modifies indexrow - } - index.put(indexrow); // write modified indexrow - return removed; - } - - public synchronized int indexSize(byte[] key) throws IOException { - kelondroRow.Entry indexrow = index.get(key); - if (indexrow == null) return 0; - return (int) indexrow.getColLong(idx_col_chunkcount); - } - - public synchronized boolean has(byte[] key) throws IOException { - return index.has(key); - } - - public synchronized kelondroRowSet get(byte[] key) throws IOException { - // find an entry, if one exists - kelondroRow.Entry indexrow = index.get(key); - if (indexrow == null) return null; - kelondroRowSet col = getdelete(indexrow, false); - assert (col != null); - return col; - } - - public synchronized kelondroRowSet delete(byte[] key) throws IOException { - // find an entry, if one exists - kelondroRow.Entry indexrow = index.remove(key, false); - if (indexrow == null) return null; - kelondroRowSet removedCollection = getdelete(indexrow, true); - assert (removedCollection != null); - return removedCollection; - } - - protected kelondroRowSet getdelete(kelondroRow.Entry indexrow, boolean remove) throws IOException { - // call this only within a synchronized(index) environment - - // read values - int chunksize = (int) indexrow.getColLong(idx_col_chunksize); - int chunkcount = (int) indexrow.getColLong(idx_col_chunkcount); - int rownumber = (int) indexrow.getColLong(idx_col_indexpos); - int partitionnumber = (int) indexrow.getColByte(idx_col_clusteridx); - assert(partitionnumber >= arrayIndex(chunkcount)) : "partitionnumber = " + partitionnumber + ", arrayIndex(chunkcount) = " + arrayIndex(chunkcount); - int serialnumber = 0; - - return getwithparams(indexrow, chunksize, chunkcount, partitionnumber, rownumber, serialnumber, remove); - } - - private synchronized kelondroRowSet getwithparams(kelondroRow.Entry indexrow, int chunksize, int chunkcount, int clusteridx, int rownumber, int serialnumber, boolean remove) throws IOException { - // open array entry - kelondroFixedWidthArray array = getArray(clusteridx, serialnumber, chunksize); - kelondroRow.Entry arrayrow = array.get(rownumber); - if (arrayrow == null) throw new kelondroException(arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, clusteridx, serialnumber).toString(), "array does not contain expected row"); - - // read the row and define a collection - byte[] indexkey = indexrow.getColBytes(idx_col_key); - byte[] arraykey = arrayrow.getColBytes(0); - if (!(index.row().objectOrder.wellformed(arraykey))) { - // cleanup for a bad bug that corrupted the database - index.remove(indexkey, 
false); // the RowCollection must be considered lost - array.remove(rownumber); // loose the RowCollection (we don't know how much is lost) - serverLog.logSevere("kelondroCollectionIndex." + array.filename, "lost a RowCollection because of a bad arraykey"); - return new kelondroRowSet(this.payloadrow, 0); - } - kelondroRowSet collection = new kelondroRowSet(this.payloadrow, arrayrow, 1); // FIXME: this does not yet work with different rowdef in case of several rowdef.objectsize() - if ((!(index.row().objectOrder.wellformed(indexkey))) || (index.row().objectOrder.compare(arraykey, indexkey) != 0)) { - // check if we got the right row; this row is wrong. Fix it: - index.remove(indexkey, true); // the wrong row cannot be fixed - // store the row number in the index; this may be a double-entry, but better than nothing - kelondroRow.Entry indexEntry = index.row().newEntry(); - indexEntry.setCol(idx_col_key, arrayrow.getColBytes(0)); - indexEntry.setCol(idx_col_chunksize, this.payloadrow.objectsize()); - indexEntry.setCol(idx_col_chunkcount, collection.size()); - indexEntry.setCol(idx_col_clusteridx, (byte) clusteridx); - indexEntry.setCol(idx_col_flags, (byte) 0); - indexEntry.setCol(idx_col_indexpos, (long) rownumber); - indexEntry.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); - indexEntry.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); - index.put(indexEntry); - serverLog.logSevere("kelondroCollectionIndex." + array.filename, "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed"); - } - int chunkcountInArray = collection.size(); - if (chunkcountInArray != chunkcount) { - // fix the entry in index - indexrow.setCol(idx_col_chunkcount, chunkcountInArray); - index.put(indexrow); - array.logFailure("INCONSISTENCY (get) in " + arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, clusteridx, serialnumber).toString() + ": array has different chunkcount than index: index = " + chunkcount + ", array = " + chunkcountInArray + "; the index has been auto-fixed"); - } - if (remove) array.remove(rownumber); // index is removed in calling method - return collection; - } - - public synchronized Iterator keycollections(byte[] startKey, byte[] secondKey, boolean rot) { - // returns an iteration of {byte[], kelondroRowSet} Objects - try { - return new keycollectionIterator(startKey, secondKey, rot); - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } - - public class keycollectionIterator implements Iterator { - - Iterator indexRowIterator; - - public keycollectionIterator(byte[] startKey, byte[] secondKey, boolean rot) throws IOException { - // iterator of {byte[], kelondroRowSet} Objects - kelondroCloneableIterator i = index.rows(true, startKey); - indexRowIterator = (rot) ? 
new kelondroRotateIterator(i, secondKey) : i;
-        }
-
-        public boolean hasNext() {
-            return indexRowIterator.hasNext();
-        }
-
-        public Object next() {
-            kelondroRow.Entry indexrow = (kelondroRow.Entry) indexRowIterator.next();
-            assert (indexrow != null);
-            if (indexrow == null) return null;
-            try {
-                return new Object[]{indexrow.getColBytes(0), getdelete(indexrow, false)};
-            } catch (IOException e) {
-                e.printStackTrace();
-                return null;
-            }
-        }
-
-        public void remove() {
-            indexRowIterator.remove();
-        }
-
-    }
-
-    public synchronized void close() {
-        this.index.close();
-        Iterator i = arrays.values().iterator();
-        while (i.hasNext()) {
-            ((kelondroFixedWidthArray) i.next()).close();
-        }
-    }
-
-    public static void main(String[] args) {
-
-        // define payload structure
-        kelondroRow rowdef = new kelondroRow("byte[] a-10, byte[] b-80", kelondroNaturalOrder.naturalOrder, 0);
-
-        File path = new File(args[0]);
-        String filenameStub = args[1];
-        long preloadTime = 10000;
-        try {
-            // initialize collection index
-            kelondroCollectionIndex collectionIndex = new kelondroCollectionIndex(
-                        path, filenameStub, 9 /*keyLength*/,
-                        kelondroNaturalOrder.naturalOrder, preloadTime,
-                        4 /*loadfactor*/, 7, rowdef);
-
-            // fill index with values
-            kelondroRowSet collection = new kelondroRowSet(rowdef, 0);
-            collection.addUnique(rowdef.newEntry(new byte[][]{"abc".getBytes(), "efg".getBytes()}));
-            collectionIndex.put("erstes".getBytes(), collection);
-
-            for (int i = 1; i <= 170; i++) {
-                collection = new kelondroRowSet(rowdef, 0);
-                for (int j = 0; j < i; j++) {
-                    collection.addUnique(rowdef.newEntry(new byte[][]{("abc" + j).getBytes(), "xxx".getBytes()}));
-                }
-                System.out.println("put key-" + i + ": " + collection.toString());
-                collectionIndex.put(("key-" + i).getBytes(), collection);
-            }
-
-            // extend collections with more values
-            for (int i = 0; i <= 170; i++) {
-                collection = new kelondroRowSet(rowdef, 0);
-                for (int j = 0; j < i; j++) {
-                    collection.addUnique(rowdef.newEntry(new byte[][]{("def" + j).getBytes(), "xxx".getBytes()}));
-                }
-                collectionIndex.merge(new indexContainer("key-" + i, collection));
-            }
-
-            // printout of index
-            collectionIndex.close();
-            kelondroFlexTable index = new kelondroFlexTable(path, filenameStub + ".index", preloadTime, kelondroCollectionIndex.indexRow(9, kelondroNaturalOrder.naturalOrder), true);
-            index.print();
-            index.close();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-
-    }
-}
+package de.anomic.kelondro;
+
+// a collectionIndex is an index to kelondroRowCollection objects
+// such a collection is defined by the following parameters
+// - chunksize
+// - chunkcount
+// each such collection is stored in a byte[] which may or may not have space for more chunks
+// than already exist in such an array. To store these arrays, we reserve entries in kelondroArray
+// database files. There will be a set of array files for different sizes of the collection arrays.
+// the 1st file has space for <loadfactor> chunks, the 2nd file for <loadfactor> * <loadfactor> chunks,
+// the 3rd file for <loadfactor>^^3 chunks, and the n-th file for <loadfactor>^^n chunks.
+// if the loadfactor is 4, then we have the following capacities:
+// file 0:     4
+// file 1:    16
+// file 2:    64
+// file 3:   256
+// file 4:  1024
+// file 5:  4096
+// file 6: 16384
+// file 7: 65536
+// the maximum number of such files is called the partitions number.
+// we don't want these files to grow too big; a kelondroOutOfLimitsException is thrown if they
+// are oversized.
+// the collection arrays may be migrated to another size during run-time, which means that not only the
+// partitions as mentioned above are maintained, but also a set of "shadow-partitions" that represent old
+// partitions and where data is read-only and slowly migrated to the default partitions.
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+
+import java.io.File;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.GregorianCalendar;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.TimeZone;
+import java.util.TreeMap;
+
+import de.anomic.index.indexContainer;
+import de.anomic.server.serverCodings;
+import de.anomic.server.serverFileUtils;
+import de.anomic.server.serverMemory;
+import de.anomic.server.logging.serverLog;
+import de.anomic.yacy.yacyURL;
+
+public class kelondroCollectionIndex {
+
+    private static final int serialNumber = 0;
+
+    private kelondroIndex index;
+    private int           keylength;
+    private File          path;
+    private String        filenameStub;
+    private File          commonsPath;
+    private int           loadfactor;
+    private Map           arrays;     // Map of (partitionNumber"-"chunksize)/kelondroFixedWidthArray - Objects
+    private kelondroRow   payloadrow; // definition of the payload (chunks inside the collections)
+    private int           maxPartitions; // this is the maximum number of array files; not yet used
+
+    private static final int idx_col_key        = 0; // the index
+    private static final int idx_col_chunksize  = 1; // chunksize (number of bytes in a single chunk, needed for migration option)
+    private static final int idx_col_chunkcount = 2; // chunkcount (number of chunks in this collection)
+    private static final int idx_col_clusteridx = 3; // selector for right cluster file, must be >= arrayIndex(chunkcount)
+    private static final int idx_col_flags      = 4; // flags (for future use)
+    private static final int idx_col_indexpos   = 5; // indexpos (position in array file)
+    private static final int idx_col_lastread   = 6; // a time stamp, update time in days since 1.1.2000
+    private static final int idx_col_lastwrote  = 7; // a time stamp, update time in days since 1.1.2000
+
+    private static kelondroRow indexRow(int keylength, kelondroOrder payloadOrder) {
+        return new kelondroRow(
+            "byte[] key-" + keylength + "," +
+            "int chunksize-4 {b256}," +
+            "int chunkcount-4 {b256}," +
+            "byte clusteridx-1 {b256}," +
+            "byte flags-1 {b256}," +
+            "int indexpos-4 {b256}," +
+            "short lastread-2 {b256}, " +
+            "short lastwrote-2 {b256}",
+            payloadOrder, 0
+            );
+    }
+
+    public kelondroRow payloadRow() {
+        return this.payloadrow;
+    }
+
+    private static String fillZ(String s, int len) {
+        while (s.length() < len) s = "0" + s;
+        return s;
+    }
+
+    private static File arrayFile(File path, String filenameStub, int loadfactor, int chunksize, int partitionNumber, int serialNumber) {
+        String lf = fillZ(Integer.toHexString(loadfactor).toUpperCase(), 2);
+        String cs = fillZ(Integer.toHexString(chunksize).toUpperCase(), 4);
+        String pn = fillZ(Integer.toHexString(partitionNumber).toUpperCase(), 2);
+        String sn = fillZ(Integer.toHexString(serialNumber).toUpperCase(), 2);
+        return new File(path, filenameStub + "." + lf + "." + cs + "." + pn + "."
+ sn + ".kca"); // kelondro collection array + } + + private static File propertyFile(File path, String filenameStub, int loadfactor, int chunksize) { + String lf = fillZ(Integer.toHexString(loadfactor).toUpperCase(), 2); + String cs = fillZ(Integer.toHexString(chunksize).toUpperCase(), 4); + return new File(path, filenameStub + "." + lf + "." + cs + ".properties"); + } + + public kelondroCollectionIndex(File path, String filenameStub, int keyLength, kelondroOrder indexOrder, + long preloadTime, int loadfactor, int maxpartitions, kelondroRow rowdef) throws IOException { + // the buffersize is number of bytes that are only used if the kelondroFlexTable is backed up with a kelondroTree + this.path = path; + this.filenameStub = filenameStub; + this.keylength = keyLength; + this.payloadrow = rowdef; + this.loadfactor = loadfactor; + this.maxPartitions = maxpartitions; + this.commonsPath = new File(path, filenameStub + "." + fillZ(Integer.toHexString(rowdef.objectsize).toUpperCase(), 4) + ".commons"); + this.commonsPath.mkdirs(); + + boolean ramIndexGeneration = false; + boolean fileIndexGeneration = !(new File(path, filenameStub + ".index").exists()); + if (ramIndexGeneration) index = new kelondroRowSet(indexRow(keyLength, indexOrder), 0); + if (fileIndexGeneration) index = new kelondroFlexTable(path, filenameStub + ".index", preloadTime, indexRow(keyLength, indexOrder), true); + + // open array files + this.arrays = new HashMap(); // all entries will be dynamically created with getArray() + if (((fileIndexGeneration) || (ramIndexGeneration))) { + serverLog.logFine("STARTUP", "STARTED INITIALIZATION OF NEW COLLECTION INDEX. THIS WILL TAKE SOME TIME"); + openAllArrayFiles(((fileIndexGeneration) || (ramIndexGeneration)), indexOrder); + } + + // open/create index table + if (index == null) index = openIndexFile(path, filenameStub, indexOrder, preloadTime, loadfactor, rowdef); + } + + private void openAllArrayFiles(boolean indexGeneration, kelondroOrder indexOrder) throws IOException { + String[] list = this.path.list(); + kelondroFixedWidthArray array; + + kelondroRow irow = indexRow(keylength, indexOrder); + int t = kelondroRowCollection.daysSince2000(System.currentTimeMillis()); + for (int i = 0; i < list.length; i++) if (list[i].endsWith(".kca")) { + + // open array + int pos = list[i].indexOf('.'); + if (pos < 0) continue; + int chunksize = Integer.parseInt(list[i].substring(pos + 4, pos + 8), 16); + int partitionNumber = Integer.parseInt(list[i].substring(pos + 9, pos + 11), 16); + int serialNumber = Integer.parseInt(list[i].substring(pos + 12, pos + 14), 16); + try { + array = openArrayFile(partitionNumber, serialNumber, true); + } catch (IOException e) { + e.printStackTrace(); + continue; + } + + // remember that we opened the array + arrays.put(partitionNumber + "-" + chunksize, array); + + if ((index != null) && (indexGeneration)) { + // loop over all elements in array and create index entry for each row + kelondroRow.EntryIndex aentry; + kelondroRow.Entry ientry; + Iterator ei = array.contentRows(-1); + byte[] key; + long start = System.currentTimeMillis(); + long lastlog = start; + int count = 0; + while (ei.hasNext()) { + aentry = (kelondroRow.EntryIndex) ei.next(); + key = aentry.getColBytes(0); + assert (key != null); + if (key == null) continue; // skip deleted entries + ientry = irow.newEntry(); + ientry.setCol(idx_col_key, key); + ientry.setCol(idx_col_chunksize, chunksize); + ientry.setCol(idx_col_chunkcount, kelondroRowCollection.sizeOfExportedCollectionRows(aentry, 1)); + 
ientry.setCol(idx_col_clusteridx, (byte) partitionNumber); + ientry.setCol(idx_col_flags, (byte) 0); + ientry.setCol(idx_col_indexpos, aentry.index()); + ientry.setCol(idx_col_lastread, t); + ientry.setCol(idx_col_lastwrote, t); + index.addUnique(ientry); // FIXME: this should avoid doubles + count++; + + // write a log + if (System.currentTimeMillis() - lastlog > 30000) { + serverLog.logFine("STARTUP", "created " + count + " RWI index entries. " + (((System.currentTimeMillis() - start) * (array.size() + array.free() - count) / count) / 60000) + " minutes remaining for this array"); + lastlog = System.currentTimeMillis(); + } + } + } + } + } + + private kelondroIndex openIndexFile(File path, String filenameStub, kelondroOrder indexOrder, + long preloadTime, int loadfactor, kelondroRow rowdef) throws IOException { + // open/create index table + kelondroIndex theindex = new kelondroCache(new kelondroFlexTable(path, filenameStub + ".index", preloadTime, indexRow(keylength, indexOrder), true), true, false); + //kelondroIndex theindex = new kelondroFlexTable(path, filenameStub + ".index", preloadTime, indexRow(keylength, indexOrder), true); + + // save/check property file for this array + File propfile = propertyFile(path, filenameStub, loadfactor, rowdef.objectsize()); + Map props = new HashMap(); + if (propfile.exists()) { + props = serverFileUtils.loadHashMap(propfile); + String stored_rowdef = (String) props.get("rowdef"); + if ((stored_rowdef == null) || (!(rowdef.subsumes(new kelondroRow(stored_rowdef, rowdef.objectOrder, 0))))) { + System.out.println("FATAL ERROR: stored rowdef '" + stored_rowdef + "' does not match with new rowdef '" + + rowdef + "' for array cluster '" + path + "/" + filenameStub + "'"); + System.exit(-1); + } + } + props.put("rowdef", rowdef.toString()); + serverFileUtils.saveMap(propfile, props, "CollectionIndex properties"); + + return theindex; + } + + private kelondroFixedWidthArray openArrayFile(int partitionNumber, int serialNumber, boolean create) throws IOException { + File f = arrayFile(path, filenameStub, loadfactor, payloadrow.objectsize(), partitionNumber, serialNumber); + int load = arrayCapacity(partitionNumber); + kelondroRow rowdef = new kelondroRow( + "byte[] key-" + keylength + "," + + "byte[] collection-" + (kelondroRowCollection.exportOverheadSize + load * this.payloadrow.objectsize()), + index.row().objectOrder, + 0 + ); + if ((!(f.exists())) && (!create)) return null; + kelondroFixedWidthArray a = new kelondroFixedWidthArray(f, rowdef, 0); + serverLog.logFine("STARTUP", "opened array file " + f + " with " + a.size() + " RWIs"); + return a; + } + + private kelondroFixedWidthArray getArray(int partitionNumber, int serialNumber, int chunksize) { + String accessKey = partitionNumber + "-" + chunksize; + kelondroFixedWidthArray array = (kelondroFixedWidthArray) arrays.get(accessKey); + if (array != null) return array; + try { + array = openArrayFile(partitionNumber, serialNumber, true); + } catch (IOException e) { + return null; + } + arrays.put(accessKey, array); + return array; + } + + private int arrayCapacity(int arrayCounter) { + if (arrayCounter < 0) return 0; + int load = this.loadfactor; + for (int i = 0; i < arrayCounter; i++) load = load * this.loadfactor; + return load; + } + + private int arrayIndex(int requestedCapacity) throws kelondroOutOfLimitsException{ + // the requestedCapacity is the number of wanted chunks + int load = 1, i = 0; + while (true) { + load = load * this.loadfactor; + if (load >= requestedCapacity) return i; + i++; + } 
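+        // editor's note, worked example assuming loadfactor = 4: requestedCapacity 1..4
+        // selects partition 0, 5..16 partition 1, 17..64 partition 2; this is the
+        // inverse of arrayCapacity(i) = loadfactor^(i+1) above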
+    }
+
+    public int size() {
+        return index.size();
+    }
+
+    public int minMem() {
+        // calculate a minimum amount of memory that is necessary to use the collection
+        // during runtime (after the index was initialized)
+
+        // calculate an upper limit (not the correct size) of the maximum number of indexes for a wordHash
+        // this is computed by the size of the biggest used collection
+        // this must be multiplied with the payload size
+        // and doubled for necessary memory transformation during sort operation
+        return (int) (arrayCapacity(arrays.size() - 1) * this.payloadrow.objectsize * kelondroRowSet.growfactor);
+    }
+
+    private void array_remove(
+            int oldPartitionNumber, int serialNumber, int chunkSize,
+            int oldRownumber) throws IOException {
+        // we need a new slot, that means we must first delete the old entry
+        // find array file
+        kelondroFixedWidthArray array = getArray(oldPartitionNumber, serialNumber, chunkSize);
+
+        // delete old entry
+        array.remove(oldRownumber);
+    }
+
+    private kelondroRow.Entry array_new(
+            byte[] key, kelondroRowCollection collection) throws IOException {
+        // the collection is new
+        int partitionNumber = arrayIndex(collection.size());
+        kelondroRow.Entry indexrow = index.row().newEntry();
+        kelondroFixedWidthArray array = getArray(partitionNumber, serialNumber, this.payloadrow.objectsize());
+
+        // define row
+        kelondroRow.Entry arrayEntry = array.row().newEntry();
+        arrayEntry.setCol(0, key);
+        arrayEntry.setCol(1, collection.exportCollection());
+
+        // write a new entry in this array
+        int newRowNumber = array.add(arrayEntry);
+
+        // store the new row number in the index
+        indexrow.setCol(idx_col_key, key);
+        indexrow.setCol(idx_col_chunksize, this.payloadrow.objectsize());
+        indexrow.setCol(idx_col_chunkcount, collection.size());
+        indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber);
+        indexrow.setCol(idx_col_flags, (byte) 0);
+        indexrow.setCol(idx_col_indexpos, (long) newRowNumber);
+        indexrow.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
+        indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
+
+        // after calling this method there must be an index.addUnique(indexrow);
+        return indexrow;
+    }
+
+    private void array_add(
+            byte[] key, kelondroRowCollection collection, kelondroRow.Entry indexrow,
+            int partitionNumber, int serialNumber, int chunkSize) throws IOException {
+
+        // write a new entry in the other array
+        kelondroFixedWidthArray array = getArray(partitionNumber, serialNumber, chunkSize);
+
+        // define new row
+        kelondroRow.Entry arrayEntry = array.row().newEntry();
+        arrayEntry.setCol(0, key);
+        arrayEntry.setCol(1, collection.exportCollection());
+
+        // write a new entry in this array
+        int rowNumber = array.add(arrayEntry);
+
+        // store the new row number in the index
+        indexrow.setCol(idx_col_chunkcount, collection.size());
+        indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber);
+        indexrow.setCol(idx_col_indexpos, (long) rowNumber);
+        indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
+
+        // after calling this method there must be an index.put(indexrow);
+    }
+
+    private ArrayList array_add_multiple(TreeMap array_add_map, int serialNumber, int chunkSize) throws IOException {
+        // returns a List of kelondroRow.Entry entries for indexrow storage
+        Map.Entry entry;
+        Iterator i = array_add_map.entrySet().iterator();
+        Iterator j;
+        ArrayList actionList;
+        int partitionNumber;
+
kelondroFixedWidthArray array; + Object[] objs; + byte[] key; + kelondroRowCollection collection; + kelondroRow.Entry indexrow; + ArrayList indexrows = new ArrayList(); + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + actionList = (ArrayList) entry.getValue(); + partitionNumber = ((Integer) entry.getKey()).intValue(); + array = getArray(partitionNumber, serialNumber, chunkSize); + + j = actionList.iterator(); + while (j.hasNext()) { + objs = (Object[]) j.next(); + key = (byte[]) objs[0]; + collection = (kelondroRowCollection) objs[1]; + indexrow = (kelondroRow.Entry) objs[2]; + + // define new row + kelondroRow.Entry arrayEntry = array.row().newEntry(); + arrayEntry.setCol(0, key); + arrayEntry.setCol(1, collection.exportCollection()); + + // write a new entry in this array + int rowNumber = array.add(arrayEntry); + + // store the new row number in the index + indexrow.setCol(idx_col_chunkcount, collection.size()); + indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); + indexrow.setCol(idx_col_indexpos, (long) rowNumber); + indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); + indexrows.add(indexrow); + } + } + // after calling this method there must be a index.put(indexrow); + return indexrows; + } + + private void array_replace( + byte[] key, kelondroRowCollection collection, kelondroRow.Entry indexrow, + int partitionNumber, int serialNumber, int chunkSize, + int rowNumber) throws IOException { + // we don't need a new slot, just write collection into the old one + + // find array file + kelondroFixedWidthArray array = getArray(partitionNumber, serialNumber, chunkSize); + + // define new row + kelondroRow.Entry arrayEntry = array.row().newEntry(); + arrayEntry.setCol(0, key); + arrayEntry.setCol(1, collection.exportCollection()); + + // overwrite entry in this array + array.set(rowNumber, arrayEntry); + + // update the index entry + final int collectionsize = collection.size(); // extra variable for easier debugging + indexrow.setCol(idx_col_chunkcount, collectionsize); + indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); + indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); + + // after calling this method there must be a index.put(indexrow); + } + + private ArrayList array_replace_multiple(TreeMap array_replace_map, int serialNumber, int chunkSize) throws IOException { + Map.Entry entry, e; + Iterator i = array_replace_map.entrySet().iterator(); + Iterator j; + TreeMap actionMap; + int partitionNumber; + kelondroFixedWidthArray array; + ArrayList indexrows = new ArrayList(); + Object[] objs; + int rowNumber; + byte[] key; + kelondroRowCollection collection; + kelondroRow.Entry indexrow; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + actionMap = (TreeMap) entry.getValue(); + partitionNumber = ((Integer) entry.getKey()).intValue(); + array = getArray(partitionNumber, serialNumber, chunkSize); + + j = actionMap.entrySet().iterator(); + while (j.hasNext()) { + e = (Map.Entry) j.next(); + rowNumber = ((Integer) e.getKey()).intValue(); + objs = (Object[]) e.getValue(); + key = (byte[]) objs[0]; + collection = (kelondroRowCollection) objs[1]; + indexrow = (kelondroRow.Entry) objs[2]; + + // define new row + kelondroRow.Entry arrayEntry = array.row().newEntry(); + arrayEntry.setCol(0, key); + arrayEntry.setCol(1, collection.exportCollection()); + + // overwrite entry in this array + array.set(rowNumber, arrayEntry); + + // update the index entry + 
+    private ArrayList array_replace_multiple(TreeMap array_replace_map, int serialNumber, int chunkSize) throws IOException {
+        Map.Entry entry, e;
+        Iterator i = array_replace_map.entrySet().iterator();
+        Iterator j;
+        TreeMap actionMap;
+        int partitionNumber;
+        kelondroFixedWidthArray array;
+        ArrayList indexrows = new ArrayList();
+        Object[] objs;
+        int rowNumber;
+        byte[] key;
+        kelondroRowCollection collection;
+        kelondroRow.Entry indexrow;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            actionMap = (TreeMap) entry.getValue();
+            partitionNumber = ((Integer) entry.getKey()).intValue();
+            array = getArray(partitionNumber, serialNumber, chunkSize);
+
+            j = actionMap.entrySet().iterator();
+            while (j.hasNext()) {
+                e = (Map.Entry) j.next();
+                rowNumber = ((Integer) e.getKey()).intValue();
+                objs = (Object[]) e.getValue();
+                key = (byte[]) objs[0];
+                collection = (kelondroRowCollection) objs[1];
+                indexrow = (kelondroRow.Entry) objs[2];
+
+                // define new row
+                kelondroRow.Entry arrayEntry = array.row().newEntry();
+                arrayEntry.setCol(0, key);
+                arrayEntry.setCol(1, collection.exportCollection());
+
+                // overwrite entry in this array
+                array.set(rowNumber, arrayEntry);
+
+                // update the index entry
+                indexrow.setCol(idx_col_chunkcount, collection.size());
+                indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber);
+                indexrow.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
+                indexrows.add(indexrow);
+            }
+        }
+        // after calling this method there must be an index.put(indexrow);
+        return indexrows;
+    }
+
+    public synchronized void put(byte[] key, kelondroRowCollection collection) throws IOException, kelondroOutOfLimitsException {
+        assert (key != null);
+        assert (collection != null);
+        assert (collection.size() != 0);
+
+        // first find an old entry, if one exists
+        kelondroRow.Entry indexrow = index.get(key);
+
+        if (indexrow == null) {
+            // create new row and index entry
+            if ((collection != null) && (collection.size() > 0)) {
+                indexrow = array_new(key, collection); // modifies indexrow
+                index.addUnique(indexrow);
+            }
+            return;
+        }
+
+        // overwrite the old collection
+        // read old information
+        //int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration
+        int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number of rows in the collection
+        int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array
+        int oldPartitionNumber = (int) indexrow.getColByte(idx_col_clusteridx); // points to array file
+        assert (oldPartitionNumber >= arrayIndex(oldchunkcount));
+
+        int newPartitionNumber = arrayIndex(collection.size());
+
+        // see if we need new space or if we can overwrite the old space
+        if (oldPartitionNumber == newPartitionNumber) {
+            array_replace(
+                    key, collection, indexrow,
+                    oldPartitionNumber, serialNumber, this.payloadrow.objectsize(),
+                    oldrownumber); // modifies indexrow
+        } else {
+            array_remove(
+                    oldPartitionNumber, serialNumber, this.payloadrow.objectsize(),
+                    oldrownumber);
+            array_add(
+                    key, collection, indexrow,
+                    newPartitionNumber, serialNumber, this.payloadrow.objectsize()); // modifies indexrow
+        }
+
+        if ((int) indexrow.getColLong(idx_col_chunkcount) != collection.size())
+            serverLog.logSevere("kelondroCollectionIndex", "UPDATE (put) ERROR: array has different chunkcount than index after merge: index = " + (int) indexrow.getColLong(idx_col_chunkcount) + ", collection.size() = " + collection.size());
+
+        index.put(indexrow); // write modified indexrow
+    }
+
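+    // the index maps a word hash to: payload chunk size, chunk count, partition number
+    // (clusteridx), row position inside the partition's array file (indexpos), flags, and
+    // last read/write dates in days since 2000 (see the columns written in array_new)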
+    public synchronized void mergeMultiple(List /* of indexContainer */ containerList) throws IOException, kelondroOutOfLimitsException {
+        // merge a bulk of index containers
+        // this method should be used to optimize the R/W head path length
+
+        // separate the list into two halves:
+        // - containers that do not exist yet in the collection
+        // - containers that do exist in the collection and must be merged
+        Iterator i = containerList.iterator();
+        indexContainer container;
+        byte[] key;
+        ArrayList newContainer = new ArrayList();
+        TreeMap existingContainer = new TreeMap(); // a mapping from Integer (partition) to a TreeMap (mapping from index to object triple)
+        TreeMap containerMap; // temporary map; mapping from index position to object triple with {key, container, indexrow}
+        kelondroRow.Entry indexrow;
+        int oldrownumber1; // index of the entry in array
+        int oldPartitionNumber1; // points to array file
+        while (i.hasNext()) {
+            container = (indexContainer) i.next();
+
+            if ((container == null) || (container.size() == 0)) continue;
+            key = container.getWordHash().getBytes();
+
+            // first find an old entry, if one exists
+            indexrow = index.get(key);
+            if (indexrow == null) {
+                newContainer.add(new Object[]{key, container});
+            } else {
+                oldrownumber1 = (int) indexrow.getColLong(idx_col_indexpos);
+                oldPartitionNumber1 = (int) indexrow.getColByte(idx_col_clusteridx);
+                containerMap = (TreeMap) existingContainer.get(new Integer(oldPartitionNumber1));
+                if (containerMap == null) containerMap = new TreeMap();
+                containerMap.put(new Integer(oldrownumber1), new Object[]{key, container, indexrow});
+                existingContainer.put(new Integer(oldPartitionNumber1), containerMap);
+            }
+        }
+
+        // now iterate through the container lists and execute merges
+        // this is done in such a way that there is an optimized path for the R/W head
+
+        // merge existing containers
+        Map.Entry tripleEntry;
+        Object[] record;
+        ArrayList indexrows_existing = new ArrayList();
+        kelondroRowCollection collection;
+        TreeMap array_replace_map = new TreeMap();
+        TreeMap array_add_map = new TreeMap();
+        ArrayList actionList;
+        TreeMap actionMap;
+        //boolean madegc = false;
+        //System.out.println("DEBUG existingContainer: " + existingContainer.toString());
+        while (existingContainer.size() > 0) {
+            oldPartitionNumber1 = ((Integer) existingContainer.lastKey()).intValue();
+            containerMap = (TreeMap) existingContainer.remove(new Integer(oldPartitionNumber1));
+            Iterator j = containerMap.entrySet().iterator();
+            while (j.hasNext()) {
+                tripleEntry = (Map.Entry) j.next();
+                oldrownumber1 = ((Integer) tripleEntry.getKey()).intValue();
+                record = (Object[]) tripleEntry.getValue(); // {byte[], indexContainer, kelondroRow.Entry}
+
+                // merge with the old collection
+                key = (byte[]) record[0];
+                collection = (kelondroRowCollection) record[1];
+                indexrow = (kelondroRow.Entry) record[2];
+
+                // read old information
+                int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration
+                int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number of rows in the collection
+                int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array
+                int oldPartitionNumber = (int) indexrow.getColByte(idx_col_clusteridx); // points to array file
+                assert oldPartitionNumber1 == oldPartitionNumber : "oldPartitionNumber1 = " + oldPartitionNumber1 + ", oldPartitionNumber = " + oldPartitionNumber + ", containerMap = " + containerMap + ", existingContainer: " + existingContainer.toString();
+                assert oldrownumber1 == oldrownumber : "oldrownumber1 = " + oldrownumber1 + ", oldrownumber = " + oldrownumber + ", containerMap = " + containerMap + ", existingContainer: " + existingContainer.toString();
+                assert (oldPartitionNumber >= arrayIndex(oldchunkcount));
+                int oldSerialNumber = 0;
+
+                // load the old collection and join it
+                collection.addAllUnique(getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false));
+                collection.sort();
+                collection.uniq(-1); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries
+                collection.trim(false);
+
+                // check for size of collection:
+                // if necessary shrink the collection and dump a part of that collection
+                // to avoid that this grows too big
+                if (arrayIndex(collection.size()) > maxPartitions) {
+                    shrinkCollection(key, collection, arrayCapacity(maxPartitions));
+                }
+
+                // determine new partition position
+                int newPartitionNumber = arrayIndex(collection.size());
+
+                // see if we need new space or if we can overwrite the old space
+                if (oldPartitionNumber == newPartitionNumber) {
+                    actionMap = (TreeMap) array_replace_map.get(new Integer(oldPartitionNumber));
+                    if (actionMap == null) actionMap = new TreeMap();
+                    actionMap.put(new Integer(oldrownumber), new Object[]{key, collection, indexrow});
+                    array_replace_map.put(new Integer(oldPartitionNumber), actionMap);
+                    /*
+                    array_replace(
+                            key, collection, indexrow,
+                            oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize(),
+                            oldrownumber); // modifies indexrow
+                    indexrows_existing.add(indexrow); // indexrows are collected and written later as block
+                    */
+                } else {
+                    array_remove(
+                            oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize(),
+                            oldrownumber);
+
+                    actionList = (ArrayList) array_add_map.get(new Integer(newPartitionNumber));
+                    if (actionList == null) actionList = new ArrayList();
+                    actionList.add(new Object[]{key, collection, indexrow});
+                    array_add_map.put(new Integer(newPartitionNumber), actionList);
+                    /*
+                    array_add(
+                            key, collection, indexrow,
+                            newPartitionNumber, oldSerialNumber, this.payloadrow.objectsize()); // modifies indexrow
+                    indexrows_existing.add(indexrow); // indexrows are collected and written later as block
+                    */
+                }
+
+                // memory protection: flush collected collections
+                if (serverMemory.available() < minMem()) {
+                    // emergency flush
+                    indexrows_existing.addAll(array_replace_multiple(array_replace_map, 0, this.payloadrow.objectsize()));
+                    array_replace_map = new TreeMap(); // delete references
+                    indexrows_existing.addAll(array_add_multiple(array_add_map, 0, this.payloadrow.objectsize()));
+                    array_add_map = new TreeMap(); // delete references
+                    //if (!madegc) {
+                        // prevent that this flush is made again even when there is enough memory
+                        serverMemory.gc(10000, "kelondroCollectionIndex.mergeMultiple(...)"); // thq
+                        // prevent that this gc happens more than one time
+                    //    madegc = true;
+                    //}
+                }
+            }
+        }
+
+        // finally flush the collected collections
+        indexrows_existing.addAll(array_replace_multiple(array_replace_map, 0, this.payloadrow.objectsize()));
+        array_replace_map = new TreeMap(); // delete references
+        indexrows_existing.addAll(array_add_multiple(array_add_map, 0, this.payloadrow.objectsize()));
+        array_add_map = new TreeMap(); // delete references
+
+        // write new containers
+        i = newContainer.iterator();
+        ArrayList indexrows_new = new ArrayList();
+        while (i.hasNext()) {
+            record = (Object[]) i.next(); // {byte[], indexContainer}
+            key = (byte[]) record[0];
+            collection = (indexContainer) record[1];
+            indexrow = array_new(key, collection); // modifies indexrow
+            indexrows_new.add(indexrow); // collect new index rows
+        }
+
+        // write index entries
+        index.putMultiple(indexrows_existing); // write modified indexrows in optimized manner
+        index.addUniqueMultiple(indexrows_new); // write new indexrows in optimized manner
+    }
+
+    public synchronized void merge(indexContainer container) throws IOException, kelondroOutOfLimitsException {
+        if ((container == null) || (container.size() == 0)) return;
+        byte[] key = container.getWordHash().getBytes();
+
+        // first find an old entry, if one exists
+        kelondroRow.Entry indexrow = index.get(key);
+        if (indexrow == null) {
+            indexrow = array_new(key, container); // modifies indexrow
+            index.addUnique(indexrow); // write modified indexrow
+        } else {
+            // merge with the old collection
+            // attention! this modifies the indexrow entry which must be written with index.put(indexrow) afterwards!
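+            // the following steps repeat the read-join-sort-uniq-shrink path of mergeMultiple
+            // for a single container, but write the result back immediately instead of
+            // batching it per partition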
+            kelondroRowCollection collection = (kelondroRowCollection) container;
+
+            // read old information
+            int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration
+            int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number of rows in the collection
+            int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array
+            int oldPartitionNumber = (int) indexrow.getColByte(idx_col_clusteridx); // points to array file
+            assert (oldPartitionNumber >= arrayIndex(oldchunkcount)) : "oldPartitionNumber = " + oldPartitionNumber + ", arrayIndex(oldchunkcount) = " + arrayIndex(oldchunkcount);
+            int oldSerialNumber = 0;
+
+            // load the old collection and join it
+            collection.addAllUnique(getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false));
+            collection.sort();
+            collection.uniq(-1); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries
+            collection.trim(false);
+
+            // check for size of collection:
+            // if necessary shrink the collection and dump a part of that collection
+            // to avoid that this grows too big
+            if (arrayIndex(collection.size()) > maxPartitions) {
+                shrinkCollection(key, collection, arrayCapacity(maxPartitions));
+            }
+
+            // determine new partition location
+            int newPartitionNumber = arrayIndex(collection.size());
+
+            // see if we need new space or if we can overwrite the old space
+            if (oldPartitionNumber == newPartitionNumber) {
+                array_replace(
+                        key, collection, indexrow,
+                        oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize(),
+                        oldrownumber); // modifies indexrow
+            } else {
+                array_remove(
+                        oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize(),
+                        oldrownumber);
+                array_add(
+                        key, collection, indexrow,
+                        newPartitionNumber, oldSerialNumber, this.payloadrow.objectsize()); // modifies indexrow
+            }
+
+            final int collectionsize = collection.size(); // extra variable for easier debugging
+            final int indexrowcount = (int) indexrow.getColLong(idx_col_chunkcount);
+            if (indexrowcount != collectionsize)
+                serverLog.logSevere("kelondroCollectionIndex", "UPDATE (merge) ERROR: array has different chunkcount than index after merge: index = " + indexrowcount + ", collection.size() = " + collectionsize);
+
+            index.put(indexrow); // write modified indexrow
+        }
+    }
+
+    private void shrinkCollection(byte[] key, kelondroRowCollection collection, int targetSize) {
+        //TODO Remove timing before release
+        // removes entries from collection
+        // the removed entries are stored in a 'commons' dump file
+
+        if (key.length != 12) return;
+        // check if the collection is already small enough
+        int oldsize = collection.size();
+        if (oldsize <= targetSize) return;
+        kelondroRowSet newcommon = new kelondroRowSet(collection.rowdef, 0);
+        long sadd1 = 0, srem1 = 0, sadd2 = 0, srem2 = 0, tot1 = 0, tot2 = 0;
+        long t1 = 0, t2 = 0;
+
+        // delete some entries which are rated badly
+        Iterator i = collection.rows();
+        kelondroRow.Entry entry;
+        byte[] ref;
+        t1 = System.currentTimeMillis();
+        while (i.hasNext()) {
+            entry = (kelondroRow.Entry) i.next();
+            ref = entry.getColBytes(0);
+            if ((ref.length != 12) || (!yacyURL.probablyRootURL(new String(ref)))) {
+                t2 = System.currentTimeMillis();
+                newcommon.addUnique(entry);
+                sadd1 += System.currentTimeMillis() - t2;
+                t2 = System.currentTimeMillis();
+                i.remove();
+                srem1 += System.currentTimeMillis() - t2;
+            }
+        }
+        int firstnewcommon = newcommon.size();
+        tot1 = System.currentTimeMillis() - t1;
+
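+        // second pass (below): if dropping non-root references did not shrink the collection
+        // to targetSize, thin it out randomly; each sweep moves roughly three quarters of the
+        // remaining rows into the commons set until the target is reached
+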
+        // check if we shrunk enough
+        Random rand = new Random(System.currentTimeMillis());
+        t1 = System.currentTimeMillis();
+        while (collection.size() > targetSize) {
+            // now delete more entries randomly from the surviving collection
+            i = collection.rows();
+            while (i.hasNext()) {
+                entry = (kelondroRow.Entry) i.next();
+                ref = entry.getColBytes(0);
+                if (rand.nextInt() % 4 != 0) {
+                    t2 = System.currentTimeMillis();
+                    newcommon.addUnique(entry);
+                    sadd2 += System.currentTimeMillis() - t2;
+                    t2 = System.currentTimeMillis();
+                    i.remove();
+                    srem2 += System.currentTimeMillis() - t2;
+                }
+            }
+        }
+        tot2 = System.currentTimeMillis() - t1;
+        collection.trim(false);
+
+        serverLog.logFine("kelondroCollectionIndex", "tot= "+tot1+'/'+tot2+" # add/rem(1)= "+sadd1+'/'+srem1+" # add/rem(2)= "+sadd2+'/'+srem2);
+        serverLog.logInfo("kelondroCollectionIndex", "shrunk common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon);
+
+        // finally dump the removed entries to a file
+        newcommon.sort();
+        TimeZone GMTTimeZone = TimeZone.getTimeZone("GMT");
+        Calendar gregorian = new GregorianCalendar(GMTTimeZone);
+        SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss");
+        String filename = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(new String(key))) + "_" + formatter.format(gregorian.getTime()) + ".collection";
+        File storagePath = new File(commonsPath, filename.substring(0, 2)); // make a subpath
+        storagePath.mkdirs();
+        File file = new File(storagePath, filename);
+        try {
+            newcommon.saveCollection(file);
+            serverLog.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
+        } catch (IOException e) {
+            e.printStackTrace();
+            serverLog.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
+        }
+
+    }
+
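+    // remove() below deletes single entries from a stored collection; the surviving rows
+    // are written back in place, or moved when enough rows were removed that the collection
+    // fits into a smaller partition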
+    public synchronized int remove(byte[] key, Set removekeys) throws IOException, kelondroOutOfLimitsException {
+
+        if ((removekeys == null) || (removekeys.size() == 0)) return 0;
+
+        // first find an old entry, if one exists
+        kelondroRow.Entry indexrow = index.get(key);
+
+        if (indexrow == null) return 0;
+
+        // overwrite the old collection
+        // read old information
+        int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration
+        int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number of rows in the collection
+        int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array
+        int oldPartitionNumber = (int) indexrow.getColByte(idx_col_clusteridx); // points to array file
+        assert (oldPartitionNumber >= arrayIndex(oldchunkcount));
+
+        int removed = 0;
+        assert (removekeys != null);
+        // load the old collection and remove keys
+        kelondroRowSet oldcollection = getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, serialNumber, false);
+
+        // remove the keys from the set
+        Iterator i = removekeys.iterator();
+        Object k;
+        while (i.hasNext()) {
+            k = i.next();
+            if ((k instanceof byte[]) && (oldcollection.remove((byte[]) k, false) != null)) removed++;
+            if ((k instanceof String) && (oldcollection.remove(((String) k).getBytes(), false) != null)) removed++;
+        }
+        oldcollection.sort();
+        oldcollection.trim(false);
+
+        /* in case that the new array size is zero we don't delete the array, just allocate a minimal chunk
+         *
+
+        if (oldcollection.size() == 0) {
+            // delete the index entry and the array
+            kelondroFixedWidthArray array = getArray(oldPartitionNumber, serialNumber, oldchunksize);
+            array.remove(oldrownumber, false);
+            index.remove(key);
+            return removed;
+        }
+        */
+        int newPartitionNumber = arrayIndex(oldcollection.size());
+
+        // see if we need new space or if we can overwrite the old space
+        if (oldPartitionNumber == newPartitionNumber) {
+            array_replace(
+                    key, oldcollection, indexrow,
+                    oldPartitionNumber, serialNumber, this.payloadrow.objectsize(),
+                    oldrownumber); // modifies indexrow
+        } else {
+            array_remove(
+                    oldPartitionNumber, serialNumber, this.payloadrow.objectsize(),
+                    oldrownumber);
+            array_add(
+                    key, oldcollection, indexrow,
+                    newPartitionNumber, serialNumber, this.payloadrow.objectsize()); // modifies indexrow
+        }
+        index.put(indexrow); // write modified indexrow
+        return removed;
+    }
+
+    public synchronized int indexSize(byte[] key) throws IOException {
+        kelondroRow.Entry indexrow = index.get(key);
+        if (indexrow == null) return 0;
+        return (int) indexrow.getColLong(idx_col_chunkcount);
+    }
+
+    public synchronized boolean has(byte[] key) throws IOException {
+        return index.has(key);
+    }
+
+    public synchronized kelondroRowSet get(byte[] key) throws IOException {
+        // find an entry, if one exists
+        kelondroRow.Entry indexrow = index.get(key);
+        if (indexrow == null) return null;
+        kelondroRowSet col = getdelete(indexrow, false);
+        assert (col != null);
+        return col;
+    }
+
+    public synchronized kelondroRowSet delete(byte[] key) throws IOException {
+        // find an entry, if one exists
+        kelondroRow.Entry indexrow = index.remove(key, false);
+        if (indexrow == null) return null;
+        kelondroRowSet removedCollection = getdelete(indexrow, true);
+        assert (removedCollection != null);
+        return removedCollection;
+    }
+
+    protected kelondroRowSet getdelete(kelondroRow.Entry indexrow, boolean remove) throws IOException {
+        // call this only within a synchronized(index) environment
+
+        // read values
+        int chunksize = (int) indexrow.getColLong(idx_col_chunksize);
+        int chunkcount = (int) indexrow.getColLong(idx_col_chunkcount);
+        int rownumber = (int) indexrow.getColLong(idx_col_indexpos);
+        int partitionnumber = (int) indexrow.getColByte(idx_col_clusteridx);
+        assert(partitionnumber >= arrayIndex(chunkcount)) : "partitionnumber = " + partitionnumber + ", arrayIndex(chunkcount) = " + arrayIndex(chunkcount);
+        int serialnumber = 0;
+
+        return getwithparams(indexrow, chunksize, chunkcount, partitionnumber, rownumber, serialnumber, remove);
+    }
+
+    private synchronized kelondroRowSet getwithparams(kelondroRow.Entry indexrow, int chunksize, int chunkcount, int clusteridx, int rownumber, int serialnumber, boolean remove) throws IOException {
+        // open array entry
+        kelondroFixedWidthArray array = getArray(clusteridx, serialnumber, chunksize);
+        kelondroRow.Entry arrayrow = array.get(rownumber);
+        if (arrayrow == null) throw new kelondroException(arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, clusteridx, serialnumber).toString(), "array does not contain expected row");
+
+        // read the row and define a collection
+        byte[] indexkey = indexrow.getColBytes(idx_col_key);
+        byte[] arraykey = arrayrow.getColBytes(0);
+        if (!(index.row().objectOrder.wellformed(arraykey))) {
+            // cleanup for a bad bug that corrupted the database
+            index.remove(indexkey, false); // the RowCollection must be considered lost
+            array.remove(rownumber); // lose the RowCollection (we don't know how much is lost)
+            serverLog.logSevere("kelondroCollectionIndex." + array.filename, "lost a RowCollection because of a bad arraykey");
+            return new kelondroRowSet(this.payloadrow, 0);
+        }
+        kelondroRowSet collection = new kelondroRowSet(this.payloadrow, arrayrow, 1); // FIXME: this does not yet work with different rowdef in case of several rowdef.objectsize()
+        if ((!(index.row().objectOrder.wellformed(indexkey))) || (index.row().objectOrder.compare(arraykey, indexkey) != 0)) {
+            // check if we got the right row; this row is wrong. Fix it:
+            index.remove(indexkey, true); // the wrong row cannot be fixed
+            // store the row number in the index; this may be a double-entry, but better than nothing
+            kelondroRow.Entry indexEntry = index.row().newEntry();
+            indexEntry.setCol(idx_col_key, arrayrow.getColBytes(0));
+            indexEntry.setCol(idx_col_chunksize, this.payloadrow.objectsize());
+            indexEntry.setCol(idx_col_chunkcount, collection.size());
+            indexEntry.setCol(idx_col_clusteridx, (byte) clusteridx);
+            indexEntry.setCol(idx_col_flags, (byte) 0);
+            indexEntry.setCol(idx_col_indexpos, (long) rownumber);
+            indexEntry.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
+            indexEntry.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
+            index.put(indexEntry);
+            serverLog.logSevere("kelondroCollectionIndex." + array.filename, "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed");
+        }
+        int chunkcountInArray = collection.size();
+        if (chunkcountInArray != chunkcount) {
+            // fix the entry in index
+            indexrow.setCol(idx_col_chunkcount, chunkcountInArray);
+            index.put(indexrow);
+            array.logFailure("INCONSISTENCY (get) in " + arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, clusteridx, serialnumber).toString() + ": array has different chunkcount than index: index = " + chunkcount + ", array = " + chunkcountInArray + "; the index has been auto-fixed");
+        }
+        if (remove) array.remove(rownumber); // index is removed in calling method
+        return collection;
+    }
+
+    public synchronized Iterator keycollections(byte[] startKey, byte[] secondKey, boolean rot) {
+        // returns an iteration of {byte[], kelondroRowSet} Objects
+        try {
+            return new keycollectionIterator(startKey, secondKey, rot);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    public class keycollectionIterator implements Iterator {
+
+        Iterator indexRowIterator;
+
+        public keycollectionIterator(byte[] startKey, byte[] secondKey, boolean rot) throws IOException {
+            // iterator of {byte[], kelondroRowSet} Objects
+            kelondroCloneableIterator i = index.rows(true, startKey);
+            indexRowIterator = (rot) ? new kelondroRotateIterator(i, secondKey) : i;
+        }
+
+        public boolean hasNext() {
+            return indexRowIterator.hasNext();
+        }
+
+        public Object next() {
+            kelondroRow.Entry indexrow = (kelondroRow.Entry) indexRowIterator.next();
+            assert (indexrow != null);
+            if (indexrow == null) return null;
+            try {
+                return new Object[]{indexrow.getColBytes(0), getdelete(indexrow, false)};
+            } catch (IOException e) {
+                e.printStackTrace();
+                return null;
+            }
+        }
+
+        public void remove() {
+            indexRowIterator.remove();
+        }
+
+    }
+
+    public synchronized void close() {
+        this.index.close();
+        Iterator i = arrays.values().iterator();
+        while (i.hasNext()) {
+            ((kelondroFixedWidthArray) i.next()).close();
+        }
+    }
+
+    public static void main(String[] args) {
+
+        // define payload structure
+        kelondroRow rowdef = new kelondroRow("byte[] a-10, byte[] b-80", kelondroNaturalOrder.naturalOrder, 0);
+
+        File path = new File(args[0]);
+        String filenameStub = args[1];
+        long preloadTime = 10000;
+        try {
+            // initialize collection index
+            kelondroCollectionIndex collectionIndex = new kelondroCollectionIndex(
+                    path, filenameStub, 9 /*keyLength*/,
+                    kelondroNaturalOrder.naturalOrder, preloadTime,
+                    4 /*loadfactor*/, 7, rowdef);
+
+            // fill index with values
+            kelondroRowSet collection = new kelondroRowSet(rowdef, 0);
+            collection.addUnique(rowdef.newEntry(new byte[][]{"abc".getBytes(), "efg".getBytes()}));
+            collectionIndex.put("erstes".getBytes(), collection);
+
+            for (int i = 1; i <= 170; i++) {
+                collection = new kelondroRowSet(rowdef, 0);
+                for (int j = 0; j < i; j++) {
+                    collection.addUnique(rowdef.newEntry(new byte[][]{("abc" + j).getBytes(), "xxx".getBytes()}));
+                }
+                System.out.println("put key-" + i + ": " + collection.toString());
+                collectionIndex.put(("key-" + i).getBytes(), collection);
+            }
+
+            // extend collections with more values
+            for (int i = 0; i <= 170; i++) {
+                collection = new kelondroRowSet(rowdef, 0);
+                for (int j = 0; j < i; j++) {
+                    collection.addUnique(rowdef.newEntry(new byte[][]{("def" + j).getBytes(), "xxx".getBytes()}));
+                }
+                collectionIndex.merge(new indexContainer("key-" + i, collection));
+            }
+
+            // printout of index
+            collectionIndex.close();
+            kelondroFlexTable index = new kelondroFlexTable(path, filenameStub + ".index", preloadTime, kelondroCollectionIndex.indexRow(9, kelondroNaturalOrder.naturalOrder), true);
+            index.print();
+            index.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+    }
+}
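The emergency flush in mergeMultiple() above pairs a memory check (serverMemory.available() < minMem()) with a throttled collector call, serverMemory.gc(10000, reason). A minimal sketch of what such a throttled gc call can look like is given below; the class and field names are illustrative, and the assumption that the first parameter is the minimum gap in milliseconds between forced collections is not taken from this patch:

    import java.util.logging.Logger;

    public final class GcThrottle {
        private static final Logger log = Logger.getLogger("GcThrottle");
        private static long lastGC = 0; // time of the last forced collection

        // run System.gc() at most once per 'gap' milliseconds; 'reason' is logged for diagnosis
        public static synchronized void gc(long gap, String reason) {
            long now = System.currentTimeMillis();
            if (now - lastGC < gap) return; // collected recently: skip
            System.gc();
            lastGC = now;
            log.info("forced gc (" + reason + ")");
        }
    }
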
diff --git a/source/de/anomic/plasma/plasmaRankingCRProcess.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java
index 9a54739ab..a8996f36c 100644
--- a/source/de/anomic/plasma/plasmaRankingCRProcess.java
+++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java
@@ -1,562 +1,562 @@
-// plasmaCRProcess.java
-// -----------------------
-// part of YaCy
-// (C) by Michael Peter Christen; mc@anomic.de
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2005
-// Created 15.11.2005
-//
-// $LastChangedDate: 2005-10-22 15:28:04 +0200 (Sat, 22 Oct 2005) $
-// $LastChangedRevision: 968 $
-// $LastChangedBy: theli $
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this softare or this documentation. The usage of this software
-// is on your own risk. The installation and usage (starting/running) of this
-// software may allow other people or application to access your computer and
-// any attached devices and is highly dependent on the configuration of the
-// software which must be done by the user of the software; the author(s) is
-// (are) also not responsible for proper configuration and usage of the
-// software, even if provoked by documentation provided together with
-// the software.
-//
-// Any changes to this file according to the GPL as documented in the file
-// gpl.txt aside this file in the shipment you received can be done to the
-// lines that follows this copyright notice here, but changes must not be
-// done inside the copyright notive above. A re-distribution must contain
-// the intact and unchanged copyright notice.
-// Contributions and changes to the program code must be marked as such.
-
-package de.anomic.plasma;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.Map;
-
-import de.anomic.kelondro.kelondroAttrSeq;
-import de.anomic.kelondro.kelondroBase64Order;
-import de.anomic.kelondro.kelondroBitfield;
-import de.anomic.kelondro.kelondroCollectionIndex;
-import de.anomic.kelondro.kelondroFlexTable;
-import de.anomic.kelondro.kelondroIndex;
-import de.anomic.kelondro.kelondroRow;
-import de.anomic.kelondro.kelondroRowSet;
-import de.anomic.server.serverDate;
-import de.anomic.server.serverFileUtils;
-import de.anomic.server.serverMemory;
-
-public class plasmaRankingCRProcess {
-    
-    /*
-    header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
-    header.append("# Created=" + System.currentTimeMillis()); header.append((char) 13); header.append((char) 10);
-    header.append("# Structure=,'=',,,,,,,,,,,'|',*"); header.append((char) 13); header.append((char) 10);
-    header.append("# ---"); header.append((char) 13); header.append((char) 10);
-    */
-    
-    public static final kelondroRow CRG_accrow = new kelondroRow(
-            "byte[] Referee-12," +
-            "Cardinal UDate-3 {b64e}, Cardinal VDate-3 {b64e}, " +
-            "Cardinal LCount-2 {b64e}, Cardinal GCount-2 {b64e}, Cardinal ICount-2 {b64e}, Cardinal DCount-2 {b64e}, Cardinal TLength-3 {b64e}, " +
-            "Cardinal WACount-3 {b64e}, Cardinal WUCount-3 {b64e}, Cardinal Flags-1 {b64e}, " +
-            "Cardinal FUDate-3 {b64e}, Cardinal FDDate-3 {b64e}, Cardinal LUDate-3 {b64e}, " +
-            "Cardinal UCount-2 {b64e}, Cardinal PCount-2 {b64e}, Cardinal ACount-2 {b64e}, Cardinal VCount-2 {b64e}, Cardinal Vita-2 {b64e}",
-            kelondroBase64Order.enhancedCoder, 0);
-    public static final kelondroRow CRG_colrow = new kelondroRow("byte[] Anchor-12", kelondroBase64Order.enhancedCoder, 0);
-    public static final String CRG_accname = "CRG-a-attr";
-    public static final String CRG_seqname = "CRG-a-coli";
-    public static final kelondroRow RCI_coli = new kelondroRow("byte[] RefereeDom-6", kelondroBase64Order.enhancedCoder, 0);
-    public static final String RCI_colname = "RCI-a-coli";
-    
-    private static boolean accumulate_upd(File f, kelondroAttrSeq acc) {
-        // open file
-        kelondroAttrSeq source_cr = null;
-        try {
-            source_cr = new kelondroAttrSeq(f, false);
-        } catch (IOException e) {
-            return false;
-        }
-        
-        // put elements in accumulator file
-        Iterator el = source_cr.keys();
-        String key;
-        kelondroAttrSeq.Entry new_entry, acc_entry;
-        int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
-        kelondroBitfield acc_flags, new_flags;
-        while (el.hasNext()) {
-            key = (String) el.next();
-            new_entry = source_cr.getEntry(key);
-            new_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes());
-            // enrich information with additional values
-            if ((acc_entry = acc.getEntry(key)) != null) {
-                FUDate = (int) acc_entry.getAttr("FUDate", 0);
-                FDDate = (int) acc_entry.getAttr("FDDate", 0);
-                LUDate = (int) acc_entry.getAttr("LUDate", 0);
-                UCount = (int) acc_entry.getAttr("UCount", 0);
-                PCount = (int) acc_entry.getAttr("PCount", 0);
-                ACount = (int) acc_entry.getAttr("ACount", 0);
-                VCount = (int) acc_entry.getAttr("VCount", 0);
-                Vita = (int) acc_entry.getAttr("Vita", 0);
-                
-                // update counters and dates
-                acc_entry.setSeq(new_entry.getSeqSet()); // need to be checked
-                
-                UCount++; // increase update counter
-                PCount += (new_flags.get(1)) ? 1 : 0;
-                ACount += (new_flags.get(2)) ? 1 : 0;
-                VCount += (new_flags.get(3)) ? 1 : 0;
-                
-                // 'OR' the flags
-                acc_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) acc_entry.getAttr("Flags", 0), 1).getBytes());
-                for (int i = 0; i < 6; i++) {
-                    if (new_flags.get(i)) acc_flags.set(i, true);
-                }
-                acc_entry.setAttr("Flags", (int) kelondroBase64Order.enhancedCoder.decodeLong(acc_flags.exportB64()));
-            } else {
-                // initialize counters and dates
-                acc_entry = acc.newEntry(key, new_entry.getAttrs(), new_entry.getSeqSet());
-                FUDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // first update date
-                FDDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
-                LUDate = (int) new_entry.getAttr("VDate", 0);
-                UCount = 0;
-                PCount = (new_flags.get(1)) ? 1 : 0;
-                ACount = (new_flags.get(2)) ? 1 : 0;
-                VCount = (new_flags.get(3)) ? 1 : 0;
-                Vita = 0;
-            }
-            // make plausibility check?
-            
-            // insert into accumulator
-            acc_entry.setAttr("FUDate", (long) FUDate);
-            acc_entry.setAttr("FDDate", (long) FDDate);
-            acc_entry.setAttr("LUDate", (long) LUDate);
-            acc_entry.setAttr("UCount", (long) UCount);
-            acc_entry.setAttr("PCount", (long) PCount);
-            acc_entry.setAttr("ACount", (long) ACount);
-            acc_entry.setAttr("VCount", (long) VCount);
-            acc_entry.setAttr("Vita", (long) Vita);
-            acc.putEntrySmall(acc_entry);
-        }
-        
-        return true;
-    }
-    
-    private static boolean accumulate_upd(File f, kelondroIndex acc, kelondroCollectionIndex seq) throws IOException {
-        // open file
-        kelondroAttrSeq source_cr = null;
-        try {
-            source_cr = new kelondroAttrSeq(f, false);
-        } catch (IOException e) {
-            return false;
-        }
-        
-        // put elements in accumulator file
-        Iterator el = source_cr.keys();
-        String key;
-        kelondroAttrSeq.Entry new_entry;
-        kelondroRow.Entry acc_entry;
-        int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
-        kelondroBitfield acc_flags, new_flags;
-        while (el.hasNext()) {
-            key = (String) el.next();
-            new_entry = source_cr.getEntry(key);
-            new_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes());
-            // enrich information with additional values
-            if ((acc_entry = acc.get(key.getBytes())) != null) {
-                FUDate = (int) acc_entry.getColLong("FUDate", 0);
-                FDDate = (int) acc_entry.getColLong("FDDate", 0);
-                LUDate = (int) acc_entry.getColLong("LUDate", 0);
-                UCount = (int) acc_entry.getColLong("UCount", 0);
-                PCount = (int) acc_entry.getColLong("PCount", 0);
-                ACount = (int) acc_entry.getColLong("ACount", 0);
-                VCount = (int) acc_entry.getColLong("VCount", 0);
-                Vita = (int) acc_entry.getColLong("Vita", 0);
-                
-                // update counters and dates
-                seq.put(key.getBytes(), new_entry.getSeqCollection()); // FIXME: old and new collection must be joined
-                
-                UCount++; // increase update counter
-                PCount += (new_flags.get(1)) ? 1 : 0;
-                ACount += (new_flags.get(2)) ? 1 : 0;
-                VCount += (new_flags.get(3)) ? 1 : 0;
-                
-                // 'OR' the flags
-                acc_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong(acc_entry.getColLong("Flags", 0), 1).getBytes());
-                for (int i = 0; i < 6; i++) {
-                    if (new_flags.get(i)) acc_flags.set(i, true);
-                }
-                acc_entry.setCol("Flags", (int) kelondroBase64Order.enhancedCoder.decodeLong(acc_flags.exportB64()));
-            } else {
-                // initialize counters and dates
-                acc_entry = acc.row().newEntry();
-                acc_entry.setCol("Referee", key, null);
-                for (int i = 1; i < acc.row().columns(); i++) {
-                    acc_entry.setCol(i, new_entry.getAttr(acc.row().column(i).nickname(), 0));
-                }
-                seq.put(key.getBytes(), new_entry.getSeqCollection());
-                FUDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // first update date
-                FDDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
-                LUDate = (int) new_entry.getAttr("VDate", 0);
-                UCount = 0;
-                PCount = (new_flags.get(1)) ? 1 : 0;
-                ACount = (new_flags.get(2)) ? 1 : 0;
-                VCount = (new_flags.get(3)) ? 1 : 0;
-                Vita = 0;
-            }
-            // make plausibility check?
-            
-            // insert into accumulator
-            acc_entry.setCol("FUDate", (long) FUDate);
-            acc_entry.setCol("FDDate", (long) FDDate);
-            acc_entry.setCol("LUDate", (long) LUDate);
-            acc_entry.setCol("UCount", (long) UCount);
-            acc_entry.setCol("PCount", (long) PCount);
-            acc_entry.setCol("ACount", (long) ACount);
-            acc_entry.setCol("VCount", (long) VCount);
-            acc_entry.setCol("Vita", (long) Vita);
-            acc.put(acc_entry);
-        }
-        
-        return true;
-    }
-    
-    public static void accumulate(File from_dir, File tmp_dir, File err_dir, File bkp_dir, File to_file, int max_files, boolean newdb) throws IOException {
-        if (!(from_dir.isDirectory())) {
-            System.out.println("source path " + from_dir + " is not a directory.");
-            return;
-        }
-        if (!(tmp_dir.isDirectory())) {
-            System.out.println("temporary path " + tmp_dir + " is not a directory.");
-            return;
-        }
-        if (!(err_dir.isDirectory())) {
-            System.out.println("error path " + err_dir + " is not a directory.");
-            return;
-        }
-        if (!(bkp_dir.isDirectory())) {
-            System.out.println("back-up path " + bkp_dir + " is not a directory.");
-            return;
-        }
-        
-        // open target file
-        kelondroAttrSeq acc = null;
-        kelondroIndex newacc = null;
-        kelondroCollectionIndex newseq = null;
-        if (newdb) {
-            File path = to_file.getParentFile(); // path to storage place
-            newacc = new kelondroFlexTable(path, CRG_accname, -1, CRG_accrow, false);
-            newseq = new kelondroCollectionIndex(path, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, -1, 2, 9, CRG_colrow);
-        } else {
-            if (!(to_file.exists())) {
-                acc = new kelondroAttrSeq("Global Ranking Accumulator File",
-                        ",'='," +
-                        ",,,,,,,,,," +
-                        ",,,,,,,," +
-                        "'|',*", false);
-                acc.toFile(to_file);
-            }
-            acc = new kelondroAttrSeq(to_file, false);
-        }
-        // collect source files
-        File source_file = null;
-        String[] files = from_dir.list();
-        if (files.length < max_files) max_files = files.length;
-        for (int i = 0; i < max_files; i++) {
-            // open file
-            source_file = new File(from_dir, files[i]);
-            if (newdb) {
-                if (accumulate_upd(source_file, newacc, newseq)) {
-                    // move cr file to temporary folder
-                    source_file.renameTo(new File(tmp_dir, files[i]));
-                } else {
-                    // error case: the cr-file is not valid; move to error path
-                    source_file.renameTo(new File(err_dir, files[i]));
-                }
-            } else {
-                if (accumulate_upd(source_file, acc)) {
-                    // move cr file to temporary folder
-                    source_file.renameTo(new File(tmp_dir, files[i]));
-                } else {
-                    // error case: the cr-file is not valid; move to error path
-                    source_file.renameTo(new File(err_dir, files[i]));
-                }
-            }
-        }
-        
-        try {
-            if (newdb) {
-                newacc.close();
-                newseq.close();
-            } else {
-                // save accumulator to temporary file
-                File tmp_file;
-                if (to_file.toString().endsWith(".gz")) {
-                    tmp_file = new File(to_file.toString() + "." + (System.currentTimeMillis() % 1000) + ".tmp.gz");
-                } else {
-                    tmp_file = new File(to_file.toString() + "." + (System.currentTimeMillis() % 1000) + ".tmp");
-                }
-                // store the file
-                acc.toFile(tmp_file);
-                // since this was successful, we remove the old file and move the new file to it
-                to_file.delete();
-                tmp_file.renameTo(to_file);
-            }
-            serverFileUtils.moveAll(tmp_dir, bkp_dir);
-        } catch (IOException e) {
-            // move previously processed files back
-            e.printStackTrace();
-            serverFileUtils.moveAll(tmp_dir, from_dir);
-        }
-        
-    }
-    
-    public static int genrci(File cr_in, File rci_out) throws IOException {
-        if (!(cr_in.exists())) return 0;
-        kelondroAttrSeq cr = new kelondroAttrSeq(cr_in, false);
-        //if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing)
-        if (!(rci_out.exists())) {
-            kelondroAttrSeq rcix = new kelondroAttrSeq("Global Ranking Reverse Citation Index",
-                    ",'='," +
-                    "," +
-                    "'|',*", false);
-            rcix.toFile(rci_out);
-        }
-        final kelondroAttrSeq rci = new kelondroAttrSeq(rci_out, false);
-        
-        // loop over all referees
-        int count = 0;
-        int size = cr.size();
-        long start = System.currentTimeMillis();
-        long l;
-        final Iterator i = cr.keys();
-        String referee, anchor, anchorDom;
-        kelondroAttrSeq.Entry cr_entry, rci_entry;
-        long cr_UDate, rci_UDate;
-        while (i.hasNext()) {
-            referee = (String) i.next();
-            cr_entry = cr.getEntry(referee);
-            cr_UDate = cr_entry.getAttr("UDate", 0);
-            
-            // loop over all anchors
-            Iterator j = cr_entry.getSeqSet().iterator();
-            Map.Entry entry;
-            while (j.hasNext()) {
-                // get domain of anchors
-                entry = (Map.Entry) j.next();
-                anchor = (String) entry.getKey();
-                if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
-                
-                // update domain-specific entry
-                rci_entry = rci.getEntry(anchorDom);
-                if (rci_entry == null) rci_entry = rci.newEntry(anchorDom, false);
-                rci_entry.addSeq(referee);
-                
-                // update Update-Date
-                rci_UDate = rci_entry.getAttr("UDate", 0);
-                if (cr_UDate > rci_UDate) rci_entry.setAttr("UDate", cr_UDate);
-                
-                // insert entry
-                rci.putEntry(rci_entry);
-            }
-            count++;
-            if ((count % 1000) == 0) {
-                l = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
-                System.out.println("processed " + count + " citations, " + (count / l) + " per second, rci.size = " + rci.size() + ", " + ((size - count) / (count / l)) + " seconds remaining; mem = " + serverMemory.available());
-            }
-            i.remove();
-        }
-        
-        // finished. write to file
-        cr = null;
-        cr_in = null;
-        System.gc();
-        rci.toFile(rci_out);
-        return count;
-    }
-    
-    public static int genrcix(File cr_path_in, File rci_path_out) throws IOException {
-        //kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true);
-        kelondroCollectionIndex seq = new kelondroCollectionIndex(cr_path_in, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, -1, 2, 9, CRG_colrow);
-        kelondroCollectionIndex rci = new kelondroCollectionIndex(rci_path_out, RCI_colname, 6, kelondroBase64Order.enhancedCoder, -1, 2, 9, RCI_coli);
-        
-        // loop over all referees
-        int count = 0;
-        int size = seq.size();
-        long start = System.currentTimeMillis();
-        long l;
-        final Iterator i = seq.keycollections(null, null, false);
-        Object[] keycollection;
-        String referee, refereeDom, anchor, anchorDom;
-        kelondroRowSet cr_entry, rci_entry;
-        while (i.hasNext()) {
-            keycollection = (Object[]) i.next();
-            referee = new String((byte[]) keycollection[0]);
-            if (referee.length() == 6) refereeDom = referee; else refereeDom = referee.substring(6);
-            cr_entry = (kelondroRowSet) keycollection[1];
-            
-            // loop over all anchors
-            Iterator j = cr_entry.rows();
-            kelondroRow.Entry entry;
-            while (j.hasNext()) {
-                // get domain of anchors
-                entry = (kelondroRow.Entry) j.next();
-                anchor = (String) entry.getColString(0, null);
-                if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
-                
-                // update domain-specific entry
-                rci_entry = rci.get(anchorDom.getBytes());
-                if (rci_entry == null) rci_entry = new kelondroRowSet(RCI_coli, 0);
-                rci_entry.add(refereeDom.getBytes());
-                
-                // insert entry
-                rci.put(anchorDom.getBytes(), rci_entry);
-            }
-            count++;
-            if ((count % 1000) == 0) {
-                l = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
-                System.out.println("processed " + count + " citations, " + (count / l) + " per second, rci.size = " + rci.size() + ", " + ((size - count) / (count / l) / 60) + " minutes remaining; mem = " + Runtime.getRuntime().freeMemory());
-            }
-        }
-        
-        // finished. write to file
-        seq.close();
-        rci.close();
-        return count;
-    }
-    
-    public static void main(String[] args) {
-        // java -classpath source de.anomic.plasma.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr
-        try {
-            if ((args.length == 5) && (args[0].equals("-accumulate"))) {
-                accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5]), Integer.parseInt(args[6]), true);
-            }
-            if ((args.length == 2) && (args[0].equals("-accumulate"))) {
-                File root_path = new File(args[1]);
-                File from_dir = new File(root_path, "DATA/RANKING/GLOBAL/014_othercr");
-                File ready_dir = new File(root_path, "DATA/RANKING/GLOBAL/015_ready");
-                File tmp_dir = new File(root_path, "DATA/RANKING/GLOBAL/016_tmp");
-                File err_dir = new File(root_path, "DATA/RANKING/GLOBAL/017_err");
-                File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc");
-                String filename = "CRG-a-" + new serverDate().toShortString(true) + ".cr.gz";
-                File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/" + filename);
-                if (!(ready_dir.exists())) ready_dir.mkdirs();
-                if (!(tmp_dir.exists())) tmp_dir.mkdirs();
-                if (!(err_dir.exists())) err_dir.mkdirs();
-                if (!(acc_dir.exists())) acc_dir.mkdirs();
-                if (!(to_file.getParentFile().exists())) to_file.getParentFile().mkdirs();
-                serverFileUtils.moveAll(from_dir, ready_dir);
-                long start = System.currentTimeMillis();
-                int files = ready_dir.list().length;
-                accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file, 1000, true);
-                long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
-                System.out.println("Finished accumulate for " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
-            }
-            if ((args.length == 3) && (args[0].equals("-recycle"))) {
-                File root_path = new File(args[1]);
-                int max_age_hours = Integer.parseInt(args[2]);
-                File own_dir = new File(root_path, "DATA/RANKING/GLOBAL/010_owncr");
-                File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc");
-                File bkp_dir = new File(root_path, "DATA/RANKING/GLOBAL/019_bkp");
-                if (!(own_dir.exists())) return;
-                if (!(acc_dir.exists())) return;
-                if (!(bkp_dir.exists())) bkp_dir.mkdirs();
-                String[] list = acc_dir.list();
-                long start = System.currentTimeMillis();
-                int files = list.length;
-                long d;
-                File f;
-                for (int i = 0; i < list.length; i++) {
-                    f = new File(acc_dir, list[i]);
-                    try {
-                        d = (System.currentTimeMillis() - (new kelondroAttrSeq(f, false)).created()) / 3600000;
-                        if (d > max_age_hours) {
-                            // file is considered to be too old, it is not recycled
-                            System.out.println("file " + f.getName() + " is old (" + d + " hours) and not recycled, only moved to backup");
-                            f.renameTo(new File(bkp_dir, list[i]));
-                        } else {
-                            // file is fresh, it is duplicated and moved to be transferred to other peers again
-                            System.out.println("file " + f.getName() + " is fresh (" + d + " hours old), recycled and moved to backup");
-                            serverFileUtils.copy(f, new File(own_dir, list[i]));
-                            f.renameTo(new File(bkp_dir, list[i]));
-                        }
-                    } catch (IOException e) {
-                        // there is something wrong with this file; delete it
-                        System.out.println("file " + f.getName() + " is corrupted and deleted");
-                        f.delete();
-                    }
-                }
-                long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
-                System.out.println("Finished recycling of " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
-            }
-            if ((args.length == 2) && (args[0].equals("-genrci"))) {
-                File root_path = new File(args[1]);
-                File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0");
-                File rci_filedir = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0");
-                rci_filedir.mkdirs();
-                long start = System.currentTimeMillis();
-                int count = genrcix(cr_filedir, rci_filedir);
-                long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
-                System.out.println("Completed RCI generation: " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
-            }
-            /*
-            if ((args.length == 2) && (args[0].equals("-genrci"))) {
-                File root_path = new File(args[1]);
-                File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0");
-                File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
-                rci_file.getParentFile().mkdirs();
-                String[] cr_filenames = cr_filedir.list();
-                for (int i = 0; i < cr_filenames.length; i++) {
-                    long start = System.currentTimeMillis();
-                    int count = genrci(new File(cr_filedir, cr_filenames[i]), rci_file);
-                    long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
-                    System.out.println("Completed RCI generation for input file " + cr_filenames[i] + ": " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
-                }
-            }
-            */
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-    
-    /*
-     Class-A File format:
-     
-     UDate  : latest update timestamp of the URL (as virtual date, hours since epoch)
-     VDate  : last visit timestamp of the URL (as virtual date, hours since epoch)
-     LCount : count of links to local resources
-     GCount : count of links to global resources
-     ICount : count of links to images (in document)
-     DCount : count of links to other documents
-     TLength: length of the plain text content (bytes)
-     WACount: total number of all words in content
-     WUCount: number of unique words in content (removed doubles)
-     Flags  : Flags (0=update, 1=popularity, 2=attention, 3=vote)
-     
-     Class-a File format is an extension of Class-A plus the following attributes
-     FUDate : first update timestamp of the URL
-     FDDate : first update timestamp of the domain
-     LUDate : latest update timestamp of the URL
-     UCount : Update Counter (of 'latest update timestamp')
-     PCount : Popularity Counter (proxy clicks)
-     ACount : Attention Counter (search result clicks)
-     VCount : Votes
-     Vita   : Vitality (normed number of updates per time)
-     */
-}
+// plasmaCRProcess.java
+// -----------------------
+// part of YaCy
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2005
+// Created 15.11.2005
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this software or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notice above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.plasma;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+
+import de.anomic.kelondro.kelondroAttrSeq;
+import de.anomic.kelondro.kelondroBase64Order;
+import de.anomic.kelondro.kelondroBitfield;
+import de.anomic.kelondro.kelondroCollectionIndex;
+import de.anomic.kelondro.kelondroFlexTable;
+import de.anomic.kelondro.kelondroIndex;
+import de.anomic.kelondro.kelondroRow;
+import de.anomic.kelondro.kelondroRowSet;
+import de.anomic.server.serverDate;
+import de.anomic.server.serverFileUtils;
+import de.anomic.server.serverMemory;
+
+public class plasmaRankingCRProcess {
+    
+    /*
+    header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
+    header.append("# Created=" + System.currentTimeMillis()); header.append((char) 13); header.append((char) 10);
+    header.append("# Structure=,'=',,,,,,,,,,,'|',*"); header.append((char) 13); header.append((char) 10);
+    header.append("# ---"); header.append((char) 13); header.append((char) 10);
+    */
+    
+    public static final kelondroRow CRG_accrow = new kelondroRow(
+            "byte[] Referee-12," +
+            "Cardinal UDate-3 {b64e}, Cardinal VDate-3 {b64e}, " +
+            "Cardinal LCount-2 {b64e}, Cardinal GCount-2 {b64e}, Cardinal ICount-2 {b64e}, Cardinal DCount-2 {b64e}, Cardinal TLength-3 {b64e}, " +
+            "Cardinal WACount-3 {b64e}, Cardinal WUCount-3 {b64e}, Cardinal Flags-1 {b64e}, " +
+            "Cardinal FUDate-3 {b64e}, Cardinal FDDate-3 {b64e}, Cardinal LUDate-3 {b64e}, " +
+            "Cardinal UCount-2 {b64e}, Cardinal PCount-2 {b64e}, Cardinal ACount-2 {b64e}, Cardinal VCount-2 {b64e}, Cardinal Vita-2 {b64e}",
+            kelondroBase64Order.enhancedCoder, 0);
+    public static final kelondroRow CRG_colrow = new kelondroRow("byte[] Anchor-12", kelondroBase64Order.enhancedCoder, 0);
+    public static final String CRG_accname = "CRG-a-attr";
+    public static final String CRG_seqname = "CRG-a-coli";
+    public static final kelondroRow RCI_coli = new kelondroRow("byte[] RefereeDom-6", kelondroBase64Order.enhancedCoder, 0);
+    public static final String RCI_colname = "RCI-a-coli";
+    
+    private static boolean accumulate_upd(File f, kelondroAttrSeq acc) {
+        // open file
+        kelondroAttrSeq source_cr = null;
+        try {
+            source_cr = new kelondroAttrSeq(f, false);
+        } catch (IOException e) {
+            return false;
+        }
+        
+        // put elements in accumulator file
+        Iterator el = source_cr.keys();
+        String key;
+        kelondroAttrSeq.Entry new_entry, acc_entry;
+        int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
+        kelondroBitfield acc_flags, new_flags;
+        while (el.hasNext()) {
+            key = (String) el.next();
+            new_entry = source_cr.getEntry(key);
+            new_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes());
+            // enrich information with additional values
+            if ((acc_entry = acc.getEntry(key)) != null) {
+                FUDate = (int) acc_entry.getAttr("FUDate", 0);
+                FDDate = (int) acc_entry.getAttr("FDDate", 0);
+                LUDate = (int) acc_entry.getAttr("LUDate", 0);
+                UCount = (int) acc_entry.getAttr("UCount", 0);
+                PCount = (int) acc_entry.getAttr("PCount", 0);
+                ACount = (int) acc_entry.getAttr("ACount", 0);
+                VCount = (int) acc_entry.getAttr("VCount", 0);
+                Vita = (int) acc_entry.getAttr("Vita", 0);
+                
+                // update counters and dates
+                acc_entry.setSeq(new_entry.getSeqSet()); // need to be checked
+                
+                UCount++; // increase update counter
+                PCount += (new_flags.get(1)) ? 1 : 0;
+                ACount += (new_flags.get(2)) ? 1 : 0;
+                VCount += (new_flags.get(3)) ? 1 : 0;
1 : 0; + + // 'OR' the flags + acc_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) acc_entry.getAttr("Flags", 0), 1).getBytes()); + for (int i = 0; i < 6; i++) { + if (new_flags.get(i)) acc_flags.set(i, true); + } + acc_entry.setAttr("Flags", (int) kelondroBase64Order.enhancedCoder.decodeLong(acc_flags.exportB64())); + } else { + // initialize counters and dates + acc_entry = acc.newEntry(key, new_entry.getAttrs(), new_entry.getSeqSet()); + FUDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // first update date + FDDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack + LUDate = (int) new_entry.getAttr("VDate", 0); + UCount = 0; + PCount = (new_flags.get(1)) ? 1 : 0; + ACount = (new_flags.get(2)) ? 1 : 0; + VCount = (new_flags.get(3)) ? 1 : 0; + Vita = 0; + } + // make plausibility check? + + // insert into accumulator + acc_entry.setAttr("FUDate", (long) FUDate); + acc_entry.setAttr("FDDate", (long) FDDate); + acc_entry.setAttr("LUDate", (long) LUDate); + acc_entry.setAttr("UCount", (long) UCount); + acc_entry.setAttr("PCount", (long) PCount); + acc_entry.setAttr("ACount", (long) ACount); + acc_entry.setAttr("VCount", (long) VCount); + acc_entry.setAttr("Vita", (long) Vita); + acc.putEntrySmall(acc_entry); + } + + return true; + } + + private static boolean accumulate_upd(File f, kelondroIndex acc, kelondroCollectionIndex seq) throws IOException { + // open file + kelondroAttrSeq source_cr = null; + try { + source_cr = new kelondroAttrSeq(f, false); + } catch (IOException e) { + return false; + } + + // put elements in accumulator file + Iterator el = source_cr.keys(); + String key; + kelondroAttrSeq.Entry new_entry; + kelondroRow.Entry acc_entry; + int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita; + kelondroBitfield acc_flags, new_flags; + while (el.hasNext()) { + key = (String) el.next(); + new_entry = source_cr.getEntry(key); + new_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes()); + // enrich information with additional values + if ((acc_entry = acc.get(key.getBytes())) != null) { + FUDate = (int) acc_entry.getColLong("FUDate", 0); + FDDate = (int) acc_entry.getColLong("FDDate", 0); + LUDate = (int) acc_entry.getColLong("LUDate", 0); + UCount = (int) acc_entry.getColLong("UCount", 0); + PCount = (int) acc_entry.getColLong("PCount", 0); + ACount = (int) acc_entry.getColLong("ACount", 0); + VCount = (int) acc_entry.getColLong("VCount", 0); + Vita = (int) acc_entry.getColLong("Vita", 0); + + // update counters and dates + seq.put(key.getBytes(), new_entry.getSeqCollection()); // FIXME: old and new collection must be joined + + UCount++; // increase update counter + PCount += (new_flags.get(1)) ? 1 : 0; + ACount += (new_flags.get(2)) ? 1 : 0; + VCount += (new_flags.get(3)) ? 
+    private static boolean accumulate_upd(File f, kelondroIndex acc, kelondroCollectionIndex seq) throws IOException {
+        // open file
+        kelondroAttrSeq source_cr = null;
+        try {
+            source_cr = new kelondroAttrSeq(f, false);
+        } catch (IOException e) {
+            return false;
+        }
+
+        // put elements in accumulator file
+        Iterator el = source_cr.keys();
+        String key;
+        kelondroAttrSeq.Entry new_entry;
+        kelondroRow.Entry acc_entry;
+        int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
+        kelondroBitfield acc_flags, new_flags;
+        while (el.hasNext()) {
+            key = (String) el.next();
+            new_entry = source_cr.getEntry(key);
+            new_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes());
+            // enrich information with additional values
+            if ((acc_entry = acc.get(key.getBytes())) != null) {
+                FUDate = (int) acc_entry.getColLong("FUDate", 0);
+                FDDate = (int) acc_entry.getColLong("FDDate", 0);
+                LUDate = (int) acc_entry.getColLong("LUDate", 0);
+                UCount = (int) acc_entry.getColLong("UCount", 0);
+                PCount = (int) acc_entry.getColLong("PCount", 0);
+                ACount = (int) acc_entry.getColLong("ACount", 0);
+                VCount = (int) acc_entry.getColLong("VCount", 0);
+                Vita = (int) acc_entry.getColLong("Vita", 0);
+
+                // update counters and dates
+                seq.put(key.getBytes(), new_entry.getSeqCollection()); // FIXME: old and new collection must be joined
+
+                UCount++; // increase update counter
+                PCount += (new_flags.get(1)) ? 1 : 0;
+                ACount += (new_flags.get(2)) ? 1 : 0;
+                VCount += (new_flags.get(3)) ? 1 : 0;
+
+                // 'OR' the flags
+                acc_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong(acc_entry.getColLong("Flags", 0), 1).getBytes());
+                for (int i = 0; i < 6; i++) {
+                    if (new_flags.get(i)) acc_flags.set(i, true);
+                }
+                acc_entry.setCol("Flags", (int) kelondroBase64Order.enhancedCoder.decodeLong(acc_flags.exportB64()));
+            } else {
+                // initialize counters and dates
+                acc_entry = acc.row().newEntry();
+                acc_entry.setCol("Referee", key, null);
+                for (int i = 1; i < acc.row().columns(); i++) {
+                    acc_entry.setCol(i, new_entry.getAttr(acc.row().column(i).nickname(), 0));
+                }
+                seq.put(key.getBytes(), new_entry.getSeqCollection());
+                FUDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // first update date
+                FDDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
+                LUDate = (int) new_entry.getAttr("VDate", 0);
+                UCount = 0;
+                PCount = (new_flags.get(1)) ? 1 : 0;
+                ACount = (new_flags.get(2)) ? 1 : 0;
+                VCount = (new_flags.get(3)) ? 1 : 0;
+                Vita = 0;
+            }
+            // make plausibility check?
+
+            // insert into accumulator
+            acc_entry.setCol("FUDate", (long) FUDate);
+            acc_entry.setCol("FDDate", (long) FDDate);
+            acc_entry.setCol("LUDate", (long) LUDate);
+            acc_entry.setCol("UCount", (long) UCount);
+            acc_entry.setCol("PCount", (long) PCount);
+            acc_entry.setCol("ACount", (long) ACount);
+            acc_entry.setCol("VCount", (long) VCount);
+            acc_entry.setCol("Vita", (long) Vita);
+            acc.put(acc_entry);
+        }
+
+        return true;
+    }
+
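+    // accumulate all CR files from from_dir into the target database/file;
+    // processed files are moved to tmp_dir (on success further to bkp_dir), invalid files to err_dir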
+    public static void accumulate(File from_dir, File tmp_dir, File err_dir, File bkp_dir, File to_file, int max_files, boolean newdb) throws IOException {
+        if (!(from_dir.isDirectory())) {
+            System.out.println("source path " + from_dir + " is not a directory.");
+            return;
+        }
+        if (!(tmp_dir.isDirectory())) {
+            System.out.println("temporary path " + tmp_dir + " is not a directory.");
+            return;
+        }
+        if (!(err_dir.isDirectory())) {
+            System.out.println("error path " + err_dir + " is not a directory.");
+            return;
+        }
+        if (!(bkp_dir.isDirectory())) {
+            System.out.println("back-up path " + bkp_dir + " is not a directory.");
+            return;
+        }
+
+        // open target file
+        kelondroAttrSeq acc = null;
+        kelondroIndex newacc = null;
+        kelondroCollectionIndex newseq = null;
+        if (newdb) {
+            File path = to_file.getParentFile(); // path to storage place
+            newacc = new kelondroFlexTable(path, CRG_accname, -1, CRG_accrow, false);
+            newseq = new kelondroCollectionIndex(path, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, -1, 2, 9, CRG_colrow);
+        } else {
+            if (!(to_file.exists())) {
+                acc = new kelondroAttrSeq("Global Ranking Accumulator File",
+                        ",'='," +
+                        ",,,,,,,,,," +
+                        ",,,,,,,," +
+                        "'|',*", false);
+                acc.toFile(to_file);
+            }
+            acc = new kelondroAttrSeq(to_file, false);
+        }
+        // collect source files
+        File source_file = null;
+        String[] files = from_dir.list();
+        if (files.length < max_files) max_files = files.length;
+        for (int i = 0; i < max_files; i++) {
+            // open file
+            source_file = new File(from_dir, files[i]);
+            if (newdb) {
+                if (accumulate_upd(source_file, newacc, newseq)) {
+                    // move cr file to temporary folder
+                    source_file.renameTo(new File(tmp_dir, files[i]));
+                } else {
+                    // error case: the cr-file is not valid; move to error path
+                    source_file.renameTo(new File(err_dir, files[i]));
+                }
+            } else {
+                if (accumulate_upd(source_file, acc)) {
+                    // move cr file to temporary folder
+                    source_file.renameTo(new File(tmp_dir, files[i]));
+                } else {
+                    // error case: the cr-file is not valid; move to error path
+                    source_file.renameTo(new File(err_dir, files[i]));
+                }
+            }
+        }
+
+        try {
+            if (newdb) {
+                newacc.close();
+                newseq.close();
+            } else {
+                // save accumulator to temporary file
+                File tmp_file;
+                if (to_file.toString().endsWith(".gz")) {
+                    tmp_file = new File(to_file.toString() + "." + (System.currentTimeMillis() % 1000) + ".tmp.gz");
+                } else {
+                    tmp_file = new File(to_file.toString() + "." + (System.currentTimeMillis() % 1000) + ".tmp");
+                }
+                // store the file
+                acc.toFile(tmp_file);
+                // since this was successful, we remove the old file and move the new file to it
+                to_file.delete();
+                tmp_file.renameTo(to_file);
+            }
+            serverFileUtils.moveAll(tmp_dir, bkp_dir);
+        } catch (IOException e) {
+            // move previously processed files back
+            e.printStackTrace();
+            serverFileUtils.moveAll(tmp_dir, from_dir);
+        }
+
+    }
+
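+    // generate the reverse citation index (RCI) from a CR file: for every referee -> anchor
+    // relation, the referee is registered under the domain entry of the anchor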
+    public static int genrci(File cr_in, File rci_out) throws IOException {
+        if (!(cr_in.exists())) return 0;
+        kelondroAttrSeq cr = new kelondroAttrSeq(cr_in, false);
+        //if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing)
+        if (!(rci_out.exists())) {
+            kelondroAttrSeq rcix = new kelondroAttrSeq("Global Ranking Reverse Citation Index",
+                    ",'='," +
+                    "," +
+                    "'|',*", false);
+            rcix.toFile(rci_out);
+        }
+        final kelondroAttrSeq rci = new kelondroAttrSeq(rci_out, false);
+
+        // loop over all referees
+        int count = 0;
+        int size = cr.size();
+        long start = System.currentTimeMillis();
+        long l;
+        final Iterator i = cr.keys();
+        String referee, anchor, anchorDom;
+        kelondroAttrSeq.Entry cr_entry, rci_entry;
+        long cr_UDate, rci_UDate;
+        while (i.hasNext()) {
+            referee = (String) i.next();
+            cr_entry = cr.getEntry(referee);
+            cr_UDate = cr_entry.getAttr("UDate", 0);
+
+            // loop over all anchors
+            Iterator j = cr_entry.getSeqSet().iterator();
+            Map.Entry entry;
+            while (j.hasNext()) {
+                // get domain of anchors
+                entry = (Map.Entry) j.next();
+                anchor = (String) entry.getKey();
+                if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
+
+                // update domain-specific entry
+                rci_entry = rci.getEntry(anchorDom);
+                if (rci_entry == null) rci_entry = rci.newEntry(anchorDom, false);
+                rci_entry.addSeq(referee);
+
+                // update Update-Date
+                rci_UDate = rci_entry.getAttr("UDate", 0);
+                if (cr_UDate > rci_UDate) rci_entry.setAttr("UDate", cr_UDate);
+
+                // insert entry
+                rci.putEntry(rci_entry);
+            }
+            count++;
+            if ((count % 1000) == 0) {
+                l = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
+                System.out.println("processed " + count + " citations, " + (count / l) + " per second, rci.size = " + rci.size() + ", " + ((size - count) / (count / l)) + " seconds remaining; mem = " + serverMemory.available());
+            }
+            i.remove();
+        }
+
+        // finished. write to file
+        cr = null;
+        cr_in = null;
+        serverMemory.gc(1000, "plasmaRankingCRProcess.genrci(...)"); // thq
+        rci.toFile(rci_out);
+        return count;
+    }
+
+    public static int genrcix(File cr_path_in, File rci_path_out) throws IOException {
+        //kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true);
+        kelondroCollectionIndex seq = new kelondroCollectionIndex(cr_path_in, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, -1, 2, 9, CRG_colrow);
+        kelondroCollectionIndex rci = new kelondroCollectionIndex(rci_path_out, RCI_colname, 6, kelondroBase64Order.enhancedCoder, -1, 2, 9, RCI_coli);
+
+        // loop over all referees
+        int count = 0;
+        int size = seq.size();
+        long start = System.currentTimeMillis();
+        long l;
+        final Iterator i = seq.keycollections(null, null, false);
+        Object[] keycollection;
+        String referee, refereeDom, anchor, anchorDom;
+        kelondroRowSet cr_entry, rci_entry;
+        while (i.hasNext()) {
+            keycollection = (Object[]) i.next();
+            referee = new String((byte[]) keycollection[0]);
+            if (referee.length() == 6) refereeDom = referee; else refereeDom = referee.substring(6);
+            cr_entry = (kelondroRowSet) keycollection[1];
+
+            // loop over all anchors
+            Iterator j = cr_entry.rows();
+            kelondroRow.Entry entry;
+            while (j.hasNext()) {
+                // get domain of anchors
+                entry = (kelondroRow.Entry) j.next();
+                anchor = (String) entry.getColString(0, null);
+                if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
+
+                // update domain-specific entry
+                rci_entry = rci.get(anchorDom.getBytes());
+                if (rci_entry == null) rci_entry = new kelondroRowSet(RCI_coli, 0);
+                rci_entry.add(refereeDom.getBytes());
+
+                // insert entry
+                rci.put(anchorDom.getBytes(), rci_entry);
+            }
+            count++;
+            if ((count % 1000) == 0) {
+                l = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
+                System.out.println("processed " + count + " citations, " + (count / l) + " per second, rci.size = " + rci.size() + ", " + ((size - count) / (count / l) / 60) + " minutes remaining; mem = " + Runtime.getRuntime().freeMemory());
+            }
+        }
+
+        // finished. write to file
+        seq.close();
+        rci.close();
+        return count;
+    }
+
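+    // command line entry point; supported invocations (see branches below):
+    //   -accumulate <from_dir> <tmp_dir> <err_dir> <bkp_dir> <to_file> <max_files>
+    //   -accumulate <yacy_root>
+    //   -recycle <yacy_root> <max_age_hours>
+    //   -genrci <yacy_root>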
+    public static void main(String[] args) {
+        // java -classpath source de.anomic.plasma.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr
+        try {
+            if ((args.length == 7) && (args[0].equals("-accumulate"))) {
+                accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5]), Integer.parseInt(args[6]), true);
+            }
+            if ((args.length == 2) && (args[0].equals("-accumulate"))) {
+                File root_path = new File(args[1]);
+                File from_dir = new File(root_path, "DATA/RANKING/GLOBAL/014_othercr");
+                File ready_dir = new File(root_path, "DATA/RANKING/GLOBAL/015_ready");
+                File tmp_dir = new File(root_path, "DATA/RANKING/GLOBAL/016_tmp");
+                File err_dir = new File(root_path, "DATA/RANKING/GLOBAL/017_err");
+                File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc");
+                String filename = "CRG-a-" + new serverDate().toShortString(true) + ".cr.gz";
+                File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/" + filename);
+                if (!(ready_dir.exists())) ready_dir.mkdirs();
+                if (!(tmp_dir.exists())) tmp_dir.mkdirs();
+                if (!(err_dir.exists())) err_dir.mkdirs();
+                if (!(acc_dir.exists())) acc_dir.mkdirs();
+                if (!(to_file.getParentFile().exists())) to_file.getParentFile().mkdirs();
+                serverFileUtils.moveAll(from_dir, ready_dir);
+                long start = System.currentTimeMillis();
+                int files = ready_dir.list().length;
+                accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file, 1000, true);
+                long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
+                System.out.println("Finished accumulate for " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
+            }
+            if ((args.length == 3) && (args[0].equals("-recycle"))) {
+                File root_path = new File(args[1]);
+                int max_age_hours = Integer.parseInt(args[2]);
+                File own_dir = new File(root_path, "DATA/RANKING/GLOBAL/010_owncr");
+                File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc");
+                File bkp_dir = new File(root_path, "DATA/RANKING/GLOBAL/019_bkp");
+                if (!(own_dir.exists())) return;
+                if (!(acc_dir.exists())) return;
+                if (!(bkp_dir.exists())) bkp_dir.mkdirs();
+                String[] list = acc_dir.list();
+                long start = System.currentTimeMillis();
+                int files = list.length;
+                long d;
+                File f;
+                for (int i = 0; i < list.length; i++) {
+                    f = new File(acc_dir, list[i]);
+                    try {
+                        d = (System.currentTimeMillis() - (new kelondroAttrSeq(f, false)).created()) / 3600000;
+                        if (d > max_age_hours) {
+                            // file is considered to be too old, it is not recycled
+                            System.out.println("file " + f.getName() + " is old (" + d + " hours) and not recycled, only moved to backup");
+                            f.renameTo(new File(bkp_dir, list[i]));
+                        } else {
+                            // file is fresh, it is duplicated and moved to be transferred to other peers again
+                            System.out.println("file " + f.getName() + " is fresh (" + d + " hours old), recycled and moved to backup");
+                            serverFileUtils.copy(f, new File(own_dir, list[i]));
+                            f.renameTo(new File(bkp_dir, list[i]));
+                        }
+                    } catch (IOException e) {
+                        // there is something wrong with this file; delete it
+                        System.out.println("file " + f.getName() + " is corrupted and deleted");
+                        f.delete();
+                    }
+                }
+                long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
+                System.out.println("Finished recycling of " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
+            }
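+            // -genrci: convert the accumulated CR database into the reverse citation index (uses genrcix above)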
(args[0].equals("-genrci"))) { + File root_path = new File(args[1]); + File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0"); + File rci_filedir = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0"); + rci_filedir.mkdirs(); + long start = System.currentTimeMillis(); + int count = genrcix(cr_filedir, rci_filedir); + long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); + System.out.println("Completed RCI generation: " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)"); + } + /* + if ((args.length == 2) && (args[0].equals("-genrci"))) { + File root_path = new File(args[1]); + File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0"); + File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz"); + rci_file.getParentFile().mkdirs(); + String[] cr_filenames = cr_filedir.list(); + for (int i = 0; i < cr_filenames.length; i++) { + long start = System.currentTimeMillis(); + int count = genrci(new File(cr_filedir, cr_filenames[i]), rci_file); + long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); + System.out.println("Completed RCI generation for input file " + cr_filenames[i] + ": " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)"); + } + } + */ + } catch (IOException e) { + e.printStackTrace(); + } + } + + /* + Class-A File format: + + UDate : latest update timestamp of the URL (as virtual date, hours since epoch) + VDate : last visit timestamp of the URL (as virtual date, hours since epoch) + LCount : count of links to local resources + GCount : count of links to global resources + ICount : count of links to images (in document) + DCount : count of links to other documents + TLength: length of the plain text content (bytes) + WACount: total number of all words in content + WUCount: number of unique words in content (removed doubles) + Flags : Flags (0=update, 1=popularity, 2=attention, 3=vote) + + Class-a File format is an extension of Class-A plus the following attributes + FUDate : first update timestamp of the URL + FDDate : first update timestamp of the domain + LUDate : latest update timestamp of the URL + UCount : Update Counter (of 'latest update timestamp') + PCount : Popularity Counter (proxy clicks) + ACount : Attention Counter (search result clicks) + VCount : Votes + Vita : Vitality (normed number of updates per time) + */ +} diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index e90266b61..dea91a33e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -121,6 +121,8 @@ import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.Set; +import java.util.Timer; +import java.util.TimerTask; import java.util.TreeMap; import java.util.TreeSet; @@ -159,6 +161,7 @@ import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverDomains; import de.anomic.server.serverFileUtils; import de.anomic.server.serverInstantThread; +import de.anomic.server.serverMemory; import de.anomic.server.serverObjects; import de.anomic.server.serverSemaphore; import de.anomic.server.serverSwitch; @@ -253,7 +256,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public TreeMap clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in 
+     */
+}
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index e90266b61..dea91a33e 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -121,6 +121,8 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
+import java.util.Timer;
+import java.util.TimerTask;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
@@ -159,6 +161,7 @@ import de.anomic.server.serverAbstractSwitch;
 import de.anomic.server.serverDomains;
 import de.anomic.server.serverFileUtils;
 import de.anomic.server.serverInstantThread;
+import de.anomic.server.serverMemory;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSemaphore;
 import de.anomic.server.serverSwitch;
@@ -253,7 +256,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     public TreeMap clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used
     public boolean acceptLocalURLs, acceptGlobalURLs;
     public URLLicense licensedURLs;
-    
+    public Timer moreMemory;
+    
     /*
      * Remote Proxy configuration
      */
@@ -1153,11 +1157,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         wordIndex.setWordFlushSize((int) getConfigLong("wordFlushSize", 10000));
 
         // set a maximum amount of memory for the caches
-        long memprereq = Math.max(getConfigLong(INDEXER_MEMPREREQ, 0), wordIndex.minMem());
+        // long memprereq = Math.max(getConfigLong(INDEXER_MEMPREREQ, 0), wordIndex.minMem());
         // setConfig(INDEXER_MEMPREREQ, memprereq);
-        //setThreadPerformance(INDEXER, getConfigLong(INDEXER_IDLESLEEP, 0), getConfigLong(INDEXER_BUSYSLEEP, 0), memprereq);
-        kelondroCachedRecords.setCacheGrowStati(memprereq + 4 * 1024 * 1024, memprereq + 2 * 1024 * 1024);
-        kelondroCache.setCacheGrowStati(memprereq + 4 * 1024 * 1024, memprereq + 2 * 1024 * 1024);
+        // setThreadPerformance(INDEXER, getConfigLong(INDEXER_IDLESLEEP, 0), getConfigLong(INDEXER_BUSYSLEEP, 0), memprereq);
+        kelondroCachedRecords.setCacheGrowStati(40 * 1024 * 1024, 20 * 1024 * 1024);
+        kelondroCache.setCacheGrowStati(40 * 1024 * 1024, 20 * 1024 * 1024);
 
         // make parser
         log.logConfig("Starting Parser");
@@ -1318,7 +1322,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 
         // deploy threads
         log.logConfig("Starting Threads");
-        // System.gc(); // help for profiler
+        serverMemory.gc(1000, "plasmaSwitchboard, help for profiler"); // help for profiler - thq
+
+        moreMemory = new Timer(); // init GC Thread - thq
+        moreMemory.schedule(new MoreMemory(), 300000, 600000);
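+        // first run after 5 minutes (300000 ms), then every 10 minutes (600000 ms);
+        // the MoreMemory task (defined at the end of this file) calls a throttled serverMemory.gc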
+
 
         int indexing_cluster = Integer.parseInt(getConfig(INDEXER_CLUSTER, "1"));
         if (indexing_cluster < 1) indexing_cluster = 1;
         deployThread(CLEANUP, "Cleanup", "simple cleaning process for monitoring information", null,
@@ -1743,6 +1751,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 
     public void close() {
         log.logConfig("SWITCHBOARD SHUTDOWN STEP 1: sending termination signal to managed threads:");
+        moreMemory.cancel();
         terminateAllThreads(true);
         if (transferIdxThread != null) stopTransferWholeIndex(false);
         log.logConfig("SWITCHBOARD SHUTDOWN STEP 2: sending termination signal to threaded indexing");
@@ -1792,9 +1801,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // flush some entries from the RAM cache
         wordIndex.flushCacheSome();
         // adopt maximum cache size to current size to prevent that further OutOfMemoryErrors occur
-        int newMaxCount = Math.max(1200, Math.min((int) getConfigLong(WORDCACHE_MAX_COUNT, 1200), wordIndex.dhtOutCacheSize()));
+/*      int newMaxCount = Math.max(1200, Math.min((int) getConfigLong(WORDCACHE_MAX_COUNT, 1200), wordIndex.dhtOutCacheSize()));
         setConfig(WORDCACHE_MAX_COUNT, Integer.toString(newMaxCount));
-        wordIndex.setMaxWordCount(newMaxCount);
+        wordIndex.setMaxWordCount(newMaxCount); */
     }
 
     public boolean deQueue() {
@@ -2040,14 +2049,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 yacyCore.newsPool.publishMyNews(yacyNewsRecord.newRecord(yacyNewsPool.CATEGORY_PROFILE_BROADCAST, news));
             }
         }
-
+/*
         // set a maximum amount of memory for the caches
-        long memprereq = Math.max(getConfigLong(INDEXER_MEMPREREQ, 0), wordIndex.minMem());
+        // long memprereq = Math.max(getConfigLong(INDEXER_MEMPREREQ, 0), wordIndex.minMem());
         // setConfig(INDEXER_MEMPREREQ, memprereq);
-        //setThreadPerformance(INDEXER, getConfigLong(INDEXER_IDLESLEEP, 0), getConfigLong(INDEXER_BUSYSLEEP, 0), memprereq);
-        kelondroCachedRecords.setCacheGrowStati(memprereq + 4 * 1024 * 1024, memprereq + 2 * 1024 * 1024);
-        kelondroCache.setCacheGrowStati(memprereq + 4 * 1024 * 1024, memprereq + 2 * 1024 * 1024);
-
+        // setThreadPerformance(INDEXER, getConfigLong(INDEXER_IDLESLEEP, 0), getConfigLong(INDEXER_BUSYSLEEP, 0), memprereq);
+        kelondroCachedRecords.setCacheGrowStati(40 * 1024 * 1024, 20 * 1024 * 1024);
+        kelondroCache.setCacheGrowStati(40 * 1024 * 1024, 20 * 1024 * 1024);
+*/
         // update the cluster set
         this.clusterhashes = yacyCore.seedDB.clusterHashes(getConfig("cluster.peers.yacydomain", ""));
@@ -2887,7 +2896,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             if ((curThread instanceof serverThread) && ((serverThread)curThread).shutdownInProgress()) throw new InterruptedException("Shutdown in progress ...");
             else if (this.terminate || curThread.isInterrupted()) throw new InterruptedException("Shutdown in progress ...");
         }
-    
+
     public void terminate(long delay) {
         if (delay <= 0) throw new IllegalArgumentException("The shutdown delay must be greater than 0.");
         (new delayedShutdown(this,delay)).start();
@@ -2908,6 +2917,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     }
 }
 
+class MoreMemory extends TimerTask {
+    public final void run() {
+        serverMemory.gc(10000, "MoreMemory()");
+    }
+}
+
 class delayedShutdown extends Thread {
     private plasmaSwitchboard sb;
     private long delay;
diff --git a/source/de/anomic/server/serverCore.java b/source/de/anomic/server/serverCore.java
index f950ace6c..eda42b926 100644
--- a/source/de/anomic/server/serverCore.java
+++ b/source/de/anomic/server/serverCore.java
@@ -6,7 +6,7 @@
 //
 // $LastChangedDate$
 // $LastChangedRevision$
-// $LastChangedBy: $
+// $LastChangedBy$
 //
 // ThreadPool
 //
@@ -414,7 +414,7 @@ public final class serverCore extends serverAbstractThread implements serverThre
 
     public void freemem() {
        // FIXME: can we do something here to flush memory? Idea: Reduce the size of some of our various caches.
-       System.gc();
+       serverMemory.gc(2000, "serverCore.freemem()"); // thq
     }
 
     // class body
diff --git a/source/de/anomic/server/serverMemory.java b/source/de/anomic/server/serverMemory.java
index 6db843e23..d2f82a5dc 100644
--- a/source/de/anomic/server/serverMemory.java
+++ b/source/de/anomic/server/serverMemory.java
@@ -3,9 +3,9 @@
 // (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 22.09.2005 on http://yacy.net
 //
-// $LastChangedDate: 2005-09-21 16:21:45 +0200 (Wed, 21 Sep 2005) $
-// $LastChangedRevision: 763 $
-// $LastChangedBy: orbiter $
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
 //
 // LICENSE
 //
@@ -23,7 +23,6 @@
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
-
 package de.anomic.server;
 
 import de.anomic.server.logging.serverLog;
@@ -38,11 +37,25 @@ public class serverMemory {
 
     private static final long[] gcs = new long[5];
     private static int gcs_pos = 0;
-    
+
+    private static long lastGC;
+
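+    // forced GC, throttled: System.gc() only runs if the last forced GC lies more
+    // than 'last' milliseconds in the past; otherwise the request is just logged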
+    public final synchronized static void gc(int last, String info) { // thq
+        long elapsed = System.currentTimeMillis() - lastGC;
+        if (elapsed > last) {
+            long free = free();
+            System.gc();
+            lastGC = System.currentTimeMillis();
+            log.logInfo("[gc] before: " + bytesToString(free) + ", after: " + bytesToString(free()) + ", call: " + info);
+        } else if (log.isFine()) {
+            log.logFine("[gc] not executed, last run: " + (elapsed / 1000) + " seconds ago, call: " + info);
+        }
+    }
+
     /** @return the number of bytes freed by the forced GC this method performs */
     private static long runGC(final boolean count) {
         final long memnow = available();
-        System.gc();
+        gc(1000, "serverMemory.runGC(...)");
         final long freed = available() - memnow;
         if (count) {
             gcs[gcs_pos] = freed;
@@ -89,7 +102,7 @@ public class serverMemory {
     public static boolean available(long memory, boolean gciffail) {
         if (available() >= memory) return true;
         if (!gciffail) return false;
-        System.gc();
+        gc(4000, "serverMemory.available(...)");
         return (available() >= memory);
     }