From a9cea419ef76dc18d8ca46b919933193290235cf Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 17 Mar 2009 13:03:27 +0000 Subject: [PATCH] Integration of the new index data structure IndexCell This is the start of a testing phase for the IndexCell data structure, which will replace the collections and caching strategy. IndexCell creation and maintenance is fast, has no caching overhead and very low IO load, and is the basis for the next data structure, index segments. IndexCell files are stored at DATA//TEXT/RICELL. With this commit the old data structures are still used, until a flag in yacy.conf is set. To switch to the new data structure, set useCell = true in yacy.conf. You will then no longer have access to TEXT/RICACHE and TEXT/RICOLLECTION. This code is still bleeding-edge development; please do not use the new data structure in production yet. Future versions may change data types or storage locations. The next main release will have a migration feature for old data structures. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5724 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- defaults/yacy.init | 5 +- .../anomic/kelondro/text/BufferedIndex.java | 23 ++ .../text/BufferedIndexCollection.java | 74 +++--- source/de/anomic/kelondro/text/IndexCell.java | 230 +++++++++++------- .../kelondro/text/ReferenceContainer.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 8 +- source/de/anomic/plasma/plasmaWordIndex.java | 18 +- source/yacy.java | 4 +- 9 files changed, 228 insertions(+), 138 deletions(-) diff --git a/build.properties b/build.properties index 707145f4d..1e90a7963 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.72 +releaseVersion=0.73 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/defaults/yacy.init b/defaults/yacy.init index e4b5b9634..65be3fbda 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -899,4 +899,7 @@ cgi.allow = false cgi.suffixes = cgi,pl # whether this is a version for a web browser -browserintegration = false \ No newline at end of file +browserintegration = false + +# next index data structure +useCell = false \ No newline at end of file diff --git a/source/de/anomic/kelondro/text/BufferedIndex.java b/source/de/anomic/kelondro/text/BufferedIndex.java index 71fd754f4..c75d5d987 100644 --- a/source/de/anomic/kelondro/text/BufferedIndex.java +++ b/source/de/anomic/kelondro/text/BufferedIndex.java @@ -107,6 +107,17 @@ public interface BufferedIndex extends Index { */ public int getBufferSize(); + /** + * iterate over entries in index. this method differs from the iterator in an Index + * object in such a way that it has the additional 'buffer' flag. When using this method, + * the iteration goes only over the buffer content, or over the backend-content, but + * not over a merged content. + * @param startHash + * @param rot + * @param buffer + * @return + * @throws IOException + */ public CloneableIterator references( String startHash, boolean rot, @@ -114,6 +125,18 @@ public interface BufferedIndex extends Index { ) throws IOException; + /** + * collect reference container in index. this method differs from the collector in an Index + * object in such a way that it has the additional 'buffer' flag.
When using this method, + * the collection goes only over the buffer content, or over the backend-content, but + * not over a merged content. + * @param startHash + * @param rot + * @param count + * @param buffer + * @return + * @throws IOException + */ public TreeSet references( String startHash, boolean rot, diff --git a/source/de/anomic/kelondro/text/BufferedIndexCollection.java b/source/de/anomic/kelondro/text/BufferedIndexCollection.java index 16289151c..3e7bd3f1e 100644 --- a/source/de/anomic/kelondro/text/BufferedIndexCollection.java +++ b/source/de/anomic/kelondro/text/BufferedIndexCollection.java @@ -57,8 +57,8 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme public static final int lowcachedivisor = 900; public static final int maxCollectionPartition = 7; // should be 7 - private final IndexBuffer indexCache; - private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster + private final IndexBuffer buffer; + private final IndexCollection collections; public BufferedIndexCollection ( File indexPrimaryTextLocation, @@ -73,15 +73,15 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme if (!(textindexcache.exists())) textindexcache.mkdirs(); if (new File(textindexcache, "index.dhtin.blob").exists()) { // migration of the both caches into one - this.indexCache = new IndexBuffer(textindexcache, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); + this.buffer = new IndexBuffer(textindexcache, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); IndexBuffer dhtInCache = new IndexBuffer(textindexcache, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log); for (ReferenceContainer c: dhtInCache) { - this.indexCache.add(c); + this.buffer.add(c); } new File(textindexcache, "index.dhtin.blob").delete(); } else { // read in new BLOB - this.indexCache = new IndexBuffer(textindexcache, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); + this.buffer = new IndexBuffer(textindexcache, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); } // create collections storage path @@ -103,24 +103,24 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize); // add the entry - indexCache.add(entries); + buffer.add(entries); cacheFlushControl(); } public void add(final String wordHash, final ReferenceRow entry) throws IOException { // add the entry - indexCache.add(wordHash, entry); + buffer.add(wordHash, entry); cacheFlushControl(); } public boolean has(final String wordHash) { - if (indexCache.has(wordHash)) return true; + if (buffer.has(wordHash)) return true; if (collections.has(wordHash)) return true; return false; } public int count(String key) { - return indexCache.count(key) + collections.count(key); + return buffer.count(key) + collections.count(key); } public ReferenceContainer get(final String wordHash, final Set urlselection) { @@ -131,7 +131,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme // get from cache ReferenceContainer container; - container = indexCache.get(wordHash, urlselection); + container = buffer.get(wordHash, urlselection); // get from collection index if (container == null) { @@ 
-172,22 +172,22 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme final ReferenceContainer c = new ReferenceContainer( wordHash, ReferenceRow.urlEntryRow, - indexCache.count(wordHash)); - c.addAllUnique(indexCache.delete(wordHash)); + buffer.count(wordHash)); + c.addAllUnique(buffer.delete(wordHash)); c.addAllUnique(collections.delete(wordHash)); return c; } public boolean remove(final String wordHash, final String urlHash) { boolean removed = false; - removed = removed | (indexCache.remove(wordHash, urlHash)); + removed = removed | (buffer.remove(wordHash, urlHash)); removed = removed | (collections.remove(wordHash, urlHash)); return removed; } public int remove(final String wordHash, final Set urlHashes) { int removed = 0; - removed += indexCache.remove(wordHash, urlHashes); + removed += buffer.remove(wordHash, urlHashes); removed += collections.remove(wordHash, urlHashes); return removed; } @@ -195,16 +195,16 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme public synchronized CloneableIterator references(final String startHash, final boolean rot, final boolean ram) throws IOException { final CloneableIterator i = wordContainers(startHash, ram); if (rot) { - return new RotateIterator(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ? 0 : collections.size())); + return new RotateIterator(i, new String(Base64Order.zero(startHash.length())), buffer.size() + ((ram) ? 0 : collections.size())); } return i; } private synchronized CloneableIterator wordContainers(final String startWordHash, final boolean ram) throws IOException { - final Order containerOrder = new ReferenceContainerOrder(indexCache.ordering().clone()); + final Order containerOrder = new ReferenceContainerOrder(buffer.ordering().clone()); containerOrder.rotate(ReferenceContainer.emptyContainer(startWordHash, 0)); if (ram) { - return indexCache.references(startWordHash, false); + return buffer.references(startWordHash, false); } return collections.references(startWordHash, false); /* @@ -218,7 +218,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme } public void clear() { - indexCache.clear(); + buffer.clear(); try { collections.clear(); } catch (IOException e) { @@ -227,16 +227,16 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme } public void close() { - indexCache.close(); + buffer.close(); collections.close(); } public int size() { - return java.lang.Math.max(collections.size(), indexCache.size()); + return java.lang.Math.max(collections.size(), buffer.size()); } public int minMem() { - return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem(); + return 1024*1024 /* indexing overhead */ + buffer.minMem() + collections.minMem(); } @@ -245,23 +245,23 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme */ public int getBufferMaxReferences() { - return indexCache.getBufferMaxReferences(); + return buffer.getBufferMaxReferences(); } public long getBufferMinAge() { - return indexCache.getBufferMinAge(); + return buffer.getBufferMinAge(); } public long getBufferMaxAge() { - return indexCache.getBufferMaxAge(); + return buffer.getBufferMaxAge(); } public long getBufferSizeBytes() { - return indexCache.getBufferSizeBytes(); + return buffer.getBufferSizeBytes(); } public void setBufferMaxWordCount(final int maxWords) { - indexCache.setMaxWordCount(maxWords); + buffer.setMaxWordCount(maxWords); } private void 
cacheFlushControl() { @@ -274,14 +274,14 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme serverProfiling.update("wordcache", Long.valueOf(cs), true); // To ensure termination an additional counter is used int l = 0; - while (this.indexCache.size() > 0 && (l++ < 100) && (this.indexCache.getBufferMaxReferences() > wCacheMaxChunk)) { - flushCacheOne(this.indexCache); + while (this.buffer.size() > 0 && (l++ < 100) && (this.buffer.getBufferMaxReferences() > wCacheMaxChunk)) { + flushCacheOne(this.buffer); } // next flush more entries if the size exceeds the maximum size of the cache - while (this.indexCache.size() > 0 && - ((this.indexCache.size() > this.indexCache.getMaxWordCount()) || + while (this.buffer.size() > 0 && + ((this.buffer.size() > this.buffer.getMaxWordCount()) || (MemoryControl.available() < collections.minMem()))) { - flushCacheOne(this.indexCache); + flushCacheOne(this.buffer); } if (getBufferSize() != cs) serverProfiling.update("wordcache", Long.valueOf(getBufferSize()), true); } @@ -292,8 +292,8 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme } private synchronized void flushCacheUntil(long timeout) { - while (System.currentTimeMillis() < timeout && indexCache.size() > 0) { - flushCacheOne(indexCache); + while (System.currentTimeMillis() < timeout && buffer.size() > 0) { + flushCacheOne(buffer); } } @@ -318,7 +318,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme } public int getBufferSize() { - return indexCache.size(); + return buffer.size(); } public ByteOrder ordering() { @@ -326,11 +326,11 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme } public CloneableIterator references(String startWordHash, boolean rot) { - final Order containerOrder = new ReferenceContainerOrder(this.indexCache.ordering().clone()); + final Order containerOrder = new ReferenceContainerOrder(this.buffer.ordering().clone()); return new MergeIterator( - this.indexCache.references(startWordHash, rot), + this.buffer.references(startWordHash, rot), new MergeIterator( - this.indexCache.references(startWordHash, false), + this.buffer.references(startWordHash, false), this.collections.references(startWordHash, false), containerOrder, ReferenceContainer.containerMergeMethod, diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index ed88a5e3a..0d9c9477c 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -36,6 +36,7 @@ import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.MergeIterator; import de.anomic.kelondro.order.Order; +import de.anomic.server.serverProfiling; /* * an index cell is a part of the horizontal index in the new segment-oriented index @@ -48,7 +49,7 @@ import de.anomic.kelondro.order.Order; * another BLOB file in the index array. 
*/ -public final class IndexCell extends AbstractIndex implements Index { +public final class IndexCell extends AbstractBufferedIndex implements BufferedIndex { // class variables private ReferenceContainerArray array; @@ -63,22 +64,14 @@ public final class IndexCell extends AbstractIndex implements Index { ) throws IOException { this.array = new ReferenceContainerArray(cellPath, wordOrder, payloadrow); this.ram = new ReferenceContainerCache(payloadrow, wordOrder); + this.ram.initWriteMode(); this.maxRamEntries = maxRamEntries; } - - private void cacheDump() throws IOException { - // dump the ram - File dumpFile = this.array.newContainerBLOBFile(); - this.ram.dump(dumpFile); - // get a fresh ram cache - this.ram = new ReferenceContainerCache(this.array.rowdef(), this.array.ordering()); - // add the dumped indexContainerBLOB to the array - this.array.mountBLOBContainer(dumpFile); - } - public ByteOrder ordering() { - return this.array.ordering(); - } + + /* + * methods to implement Index + */ /** * add entries to the cell: this adds the new entries always to the RAM part, never to BLOBs @@ -87,108 +80,78 @@ public final class IndexCell extends AbstractIndex implements Index { */ public synchronized void add(ReferenceContainer newEntries) throws IOException { this.ram.add(newEntries); + serverProfiling.update("wordcache", Long.valueOf(this.ram.size()), true); if (this.ram.size() > this.maxRamEntries) cacheDump(); } public synchronized void add(String hash, ReferenceRow entry) throws IOException { this.ram.add(hash, entry); + serverProfiling.update("wordcache", Long.valueOf(this.ram.size()), true); if (this.ram.size() > this.maxRamEntries) cacheDump(); } /** - * clear the RAM and BLOB part, deletes everything in the cell - * @throws IOException + * checks if there is any container for this wordHash, either in RAM or any BLOB */ - public synchronized void clear() throws IOException { - this.ram.clear(); - this.array.clear(); + public boolean has(String wordHash) { + if (this.ram.has(wordHash)) return true; + return this.array.has(wordHash); } - /** - * when a cell is closed, the current RAM is dumped to a file which will be opened as - * BLOB file the next time a cell is opened. 
A name for the dump is automatically generated - * and is composed of the current date and the cell salt - */ - public synchronized void close() { - // dump the ram + public int count(String wordHash) { + ReferenceContainer c0 = this.ram.get(wordHash, null); + ReferenceContainer c1; try { - this.ram.dump(this.array.newContainerBLOBFile()); + c1 = this.array.get(wordHash); } catch (IOException e) { - e.printStackTrace(); + c1 = null; } - // close all - this.ram.close(); - this.array.close(); + if (c1 == null) { + if (c0 == null) return 0; + return c0.size(); + } + if (c0 == null) return c1.size(); + return c1.size() + c0.size(); } - + /** - * deleting a container affects the containers in RAM and all the BLOB files - * the deleted containers are merged and returned as result of the method + * all containers in the BLOBs and the RAM are merged and returned * @throws IOException */ - public ReferenceContainer delete(String wordHash) throws IOException { - ReferenceContainer c0 = this.ram.delete(wordHash); + public ReferenceContainer get(String wordHash, Set urlselection) throws IOException { + ReferenceContainer c0 = this.ram.get(wordHash, null); ReferenceContainer c1 = this.array.get(wordHash); if (c1 == null) { if (c0 == null) return null; return c0; } - this.array.delete(wordHash); if (c0 == null) return c1; return c1.merge(c0); } /** - * all containers in the BLOBs and the RAM are merged and returned + * deleting a container affects the containers in RAM and all the BLOB files + * the deleted containers are merged and returned as result of the method * @throws IOException */ - public ReferenceContainer get(String wordHash, Set urlselection) throws IOException { - ReferenceContainer c0 = this.ram.get(wordHash, null); + public ReferenceContainer delete(String wordHash) throws IOException { + ReferenceContainer c0 = this.ram.delete(wordHash); ReferenceContainer c1 = this.array.get(wordHash); if (c1 == null) { if (c0 == null) return null; return c0; } + this.array.delete(wordHash); if (c0 == null) return c1; return c1.merge(c0); } - public int count(String wordHash) { - ReferenceContainer c0 = this.ram.get(wordHash, null); - ReferenceContainer c1; - try { - c1 = this.array.get(wordHash); - } catch (IOException e) { - c1 = null; - } - if (c1 == null) { - if (c0 == null) return 0; - return c0.size(); - } - if (c0 == null) return c1.size(); - return c1.size() + c0.size(); - } - - /** - * checks if there is any container for this wordHash, either in RAM or any BLOB - */ - public boolean has(String wordHash) { - if (this.ram.has(wordHash)) return true; - return this.array.has(wordHash); - } - - public int minMem() { - return 10 * 1024 * 1024; - } - /** * remove url references from a selected word hash. this deletes also in the BLOB * files, which means that there exists new gap entries after the deletion * The gaps are never merged in place, but can be eliminated when BLOBs are merged into * new BLOBs. 
This returns the sum of all url references that have been removed * @throws IOException - * @throws IOException - * @throws IOException */ public int remove(String wordHash, Set urlHashes) throws IOException { int reduced = this.array.replace(wordHash, new RemoveRewriter(urlHashes)); @@ -200,8 +163,24 @@ public final class IndexCell extends AbstractIndex implements Index { return reduced > 0; } - public int size() { - return this.ram.size() + this.array.size(); + private static class RemoveRewriter implements ReferenceContainerArray.ContainerRewriter { + + Set urlHashes; + + public RemoveRewriter(Set urlHashes) { + this.urlHashes = urlHashes; + } + + public RemoveRewriter(String urlHash) { + this.urlHashes = new HashSet(); + this.urlHashes.add(urlHash); + } + + public ReferenceContainer rewrite(ReferenceContainer container) { + container.removeEntries(urlHashes); + return container; + } + } public CloneableIterator references(String startWordHash, boolean rot) { @@ -234,24 +213,97 @@ public final class IndexCell extends AbstractIndex implements Index { true); } - private static class RemoveRewriter implements ReferenceContainerArray.ContainerRewriter { - - Set urlHashes; - - public RemoveRewriter(Set urlHashes) { - this.urlHashes = urlHashes; - } - - public RemoveRewriter(String urlHash) { - this.urlHashes = new HashSet(); - this.urlHashes.add(urlHash); - } - - public ReferenceContainer rewrite(ReferenceContainer container) { - container.removeEntries(urlHashes); - return container; + /** + * clear the RAM and BLOB part, deletes everything in the cell + * @throws IOException + */ + public synchronized void clear() throws IOException { + this.ram.clear(); + this.array.clear(); + } + + /** + * when a cell is closed, the current RAM is dumped to a file which will be opened as + * BLOB file the next time a cell is opened. 
A name for the dump is automatically generated + * and is composed of the current date and the cell salt + */ + public synchronized void close() { + // dump the ram + try { + this.ram.dump(this.array.newContainerBLOBFile()); + } catch (IOException e) { + e.printStackTrace(); } - + // close all + this.ram.close(); + this.array.close(); + } + + public int size() { + return this.ram.size() + this.array.size(); + } + + public int minMem() { + return 10 * 1024 * 1024; + } + + public ByteOrder ordering() { + return this.array.ordering(); + } + + + /* + * cache control methods + */ + + private void cacheDump() throws IOException { + // dump the ram + File dumpFile = this.array.newContainerBLOBFile(); + this.ram.dump(dumpFile); + // get a fresh ram cache + this.ram = new ReferenceContainerCache(this.array.rowdef(), this.array.ordering()); + this.ram.initWriteMode(); + // add the dumped indexContainerBLOB to the array + this.array.mountBLOBContainer(dumpFile); } + + public void cleanupBuffer(int time) { + // do nothing + } + + + public int getBackendSize() { + return this.array.size(); + } + + + public long getBufferMaxAge() { + return System.currentTimeMillis(); + } + + + public int getBufferMaxReferences() { + return this.ram.maxReferences(); + } + + + public long getBufferMinAge() { + return System.currentTimeMillis(); + } + + + public int getBufferSize() { + return this.ram.size(); + } + + + public long getBufferSizeBytes() { + return 10000 * this.ram.size(); // guessed; we don't know that exactly because there is no statistics here (expensive, not necessary) + } + + + public void setBufferMaxWordCount(int maxWords) { + this.maxRamEntries = maxWords; + } } diff --git a/source/de/anomic/kelondro/text/ReferenceContainer.java b/source/de/anomic/kelondro/text/ReferenceContainer.java index 319aef078..f5e796b08 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainer.java +++ b/source/de/anomic/kelondro/text/ReferenceContainer.java @@ -120,7 +120,7 @@ public class ReferenceContainer extends RowSet { } public ReferenceContainer merge(final ReferenceContainer c) { - return new ReferenceContainer(this.wordHash, this.merge(c)); + return new ReferenceContainer(this.wordHash, super.merge(c)); } public Reference put(final ReferenceRow entry) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 9b732fec1..3ad99c491 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -306,6 +306,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch indexContainerIterator = wordIndex.index().references("AAAAAAAAAAAA", false, false); long urlCounter = 0, wordCounter = 0; @@ -866,7 +866,7 @@ public final class yacy { try { Iterator indexContainerIterator = null; if (resource.equals("all")) { - WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0); + WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0, false); indexContainerIterator = WordIndex.index().references(wordChunkStartHash, false, false); } int counter = 0;