From abdd4aa41482d7b1ebfdc03a9673ad6d007a99ee Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 29 Dec 2008 21:36:27 +0000 Subject: [PATCH] added an index dump for blob heaps: this will increase the shutdown time by at most a few seconds, but will speed up the start-up git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5419 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- .../de/anomic/kelondro/kelondroBLOBHeap.java | 168 ++++++++++++++---- .../anomic/kelondro/kelondroBytesLongMap.java | 59 ++++++ .../de/anomic/kelondro/kelondroRAMIndex.java | 2 - 4 files changed, 191 insertions(+), 40 deletions(-) diff --git a/build.properties b/build.properties index dc6f4aa30..ca66a9402 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.616 +releaseVersion=0.617 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/source/de/anomic/kelondro/kelondroBLOBHeap.java b/source/de/anomic/kelondro/kelondroBLOBHeap.java index d82293da4..73c550fbc 100755 --- a/source/de/anomic/kelondro/kelondroBLOBHeap.java +++ b/source/de/anomic/kelondro/kelondroBLOBHeap.java @@ -40,12 +40,12 @@ import de.anomic.server.logging.serverLog; public final class kelondroBLOBHeap implements kelondroBLOB { - + private int keylength; // the length of the primary key private kelondroBytesLongMap index; // key/seek relation for used records private TreeMap free; // list of {size, seek} pairs denoting space and position of free records private final File heapFile; // the file of the heap private final kelondroByteOrder ordering; // the ordering on keys - private kelondroCachedFileRA file; // a random access to the file + private kelondroCachedFileRA file; // a random access to the file private HashMap buffer; // a write buffer to 
limit IO to the file; attention: Maps cannot use byte[] as key private int buffersize; // bytes that are buffered in buffer private int buffermax; // maximum size of the buffer @@ -84,17 +84,105 @@ public final class kelondroBLOBHeap implements kelondroBLOB { this.ordering = ordering; this.heapFile = heapFile; this.buffermax = buffermax; - + this.keylength = keylength; this.index = null; // will be created as result of initialization process this.free = new TreeMap(); this.buffer = new HashMap(); this.buffersize = 0; this.file = new kelondroCachedFileRA(heapFile); + + // read or initialize the index + if (initIndexReadDump(heapFile)) { + // verify that everything worked just fine + // pick some elements of the index + Iterator i = this.index.keys(true, null); + int c = 3; + byte[] b, b1 = new byte[index.row().primaryKeyLength]; + long pos; + boolean ok = true; + while (i.hasNext() && c-- > 0) { + b = i.next(); + pos = this.index.getl(b); + file.seek(pos + 4); + file.readFully(b1, 0, b1.length); + if (this.ordering.compare(b, b1) != 0) { + ok = false; + break; + } + } + if (!ok) { + serverLog.logWarning("kelondroBLOBHeap", "verification of idx file for " + heapFile.toString() + " failed, re-building index"); + initIndexReadFromHeap(); + } else { + serverLog.logInfo("kelondroBLOBHeap", "using a dump of the index of " + heapFile.toString() + "."); + } + } else { + // if we did not have a dump, create a new index + initIndexReadFromHeap(); + } + + /* + // DEBUG + Iterator i = index.keys(true, null); + //byte[] b; + int c = 0; + while (i.hasNext()) { + key = i.next(); + System.out.println("*** DEBUG BLOBHeap " + this.name() + " KEY=" + new String(key)); + //b = get(key); + //System.out.println("BLOB=" + new String(b)); + //System.out.println(); + c++; + if (c >= 20) break; + } + System.out.println("*** DEBUG - counted " + c + " BLOBs"); + */ + } + + private boolean initIndexReadDump(File f) { + // look for an index dump and read it if it exist + // if this is 
successful, return true; otherwise false + File ff = fingerprintFile(f); + if (!ff.exists()) { + deleteAllFingerprints(f); + return false; + } + + // there is a file: read it: + try { + this.index = new kelondroBytesLongMap(this.keylength, this.ordering, ff); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + // an index file is a one-time throw-away object, so just delete it now + ff.delete(); + + // everything is fine now + return this.index.size() > 0; + } + + private File fingerprintFile(File f) { + String fingerprint = kelondroDigest.fastFingerprintB64(f, false).substring(0, 12); + return new File(f.getParentFile(), f.getName() + "." + fingerprint + ".idx"); + } + + private void deleteAllFingerprints(File f) { + File d = f.getParentFile(); + String n = f.getName(); + String[] l = d.list(); + for (int i = 0; i < l.length; i++) { + if (l[i].startsWith(n) && l[i].endsWith(".idx")) new File(d, l[i]).delete(); + } + } + + private void initIndexReadFromHeap() throws IOException { + // this initializes the this.index object by reading positions from the heap file + + kelondroBytesLongMap.initDataConsumer indexready = kelondroBytesLongMap.asynchronusInitializer(keylength, this.ordering, 0, Math.max(10, (int) (Runtime.getRuntime().freeMemory() / (10 * 1024 * 1024)))); byte[] key = new byte[keylength]; int reclen; long seek = 0; - kelondroBytesLongMap.initDataConsumer indexready = kelondroBytesLongMap.asynchronusInitializer(keylength, this.ordering, 0, Math.max(10, (int) (Runtime.getRuntime().freeMemory() / (10 * 1024 * 1024)))); - loop: while (true) { // don't test available() here because this does not work for files > 2GB try { @@ -136,6 +224,21 @@ public final class kelondroBLOBHeap implements kelondroBLOB { } indexready.finish(); + // do something useful in between + mergeFreeEntries(); + + // finish the index generation + try { + this.index = indexready.result(); + } catch (InterruptedException e) { + e.printStackTrace(); + } catch 
(ExecutionException e) { + e.printStackTrace(); + } + } + + private void mergeFreeEntries() throws IOException { + // try to merge free entries if (this.free.size() > 1) { int merged = 0; @@ -148,11 +251,11 @@ public final class kelondroBLOBHeap implements kelondroBLOB { // check if they follow directly if (lastFree.getKey() + lastFree.getValue() + 4 == nextFree.getKey()) { // merge those records - file.seek(lastFree.getKey()); + this.file.seek(lastFree.getKey()); lastFree.setValue(lastFree.getValue() + nextFree.getValue() + 4); // this updates also the free map - file.writeInt(lastFree.getValue()); - file.seek(nextFree.getKey()); - file.writeInt(0); + this.file.writeInt(lastFree.getValue()); + this.file.seek(nextFree.getKey()); + this.file.writeInt(0); i.remove(); merged++; } else { @@ -162,30 +265,6 @@ public final class kelondroBLOBHeap implements kelondroBLOB { serverLog.logInfo("kelondroBLOBHeap", "BLOB " + heapFile.getName() + ": merged " + merged + " free records"); } - try { - this.index = indexready.result(); - } catch (InterruptedException e) { - e.printStackTrace(); - } catch (ExecutionException e) { - e.printStackTrace(); - } - - /* - // DEBUG - Iterator i = index.keys(true, null); - //byte[] b; - int c = 0; - while (i.hasNext()) { - key = i.next(); - System.out.println("*** DEBUG BLOBHeap " + this.name() + " KEY=" + new String(key)); - //b = get(key); - //System.out.println("BLOB=" + new String(b)); - //System.out.println(); - c++; - if (c >= 20) break; - } - System.out.println("*** DEBUG - counted " + c + " BLOBs"); - */ } public String name() { @@ -311,7 +390,14 @@ public final class kelondroBLOBHeap implements kelondroBLOB { // read the key final byte[] keyf = new byte[index.row().primaryKeyLength]; file.readFully(keyf, 0, keyf.length); - assert this.ordering.compare(key, keyf) == 0; + if (this.ordering.compare(key, keyf) != 0) { + // verification of the indexed access failed. 
we must re-read the index + serverLog.logWarning("kelondroBLOBHeap", "verification indexed access for " + heapFile.toString() + " failed, re-building index"); + // this is a severe operation, it should never happen. + // but if the process ends in this state, it would completely fail + // if the index is not rebuilt now at once + initIndexReadFromHeap(); + } // read the blob blob = new byte[len]; @@ -371,16 +457,24 @@ public final class kelondroBLOBHeap implements kelondroBLOB { } catch (IOException e) { e.printStackTrace(); } - index.close(); - free.clear(); try { file.close(); } catch (final IOException e) { e.printStackTrace(); } + file = null; + // now we can create a dump of the index, to speed up the next start + try { + long start = System.currentTimeMillis(); + index.dump(fingerprintFile(this.heapFile)); + serverLog.logInfo("kelondroBLOBHeap", "wrote a dump for the " + this.index.size() + " index entries of " + heapFile.getName()+ " in " + (System.currentTimeMillis() - start) + " milliseconds."); + } catch (IOException e) { + e.printStackTrace(); + } + index.close(); + free.clear(); index = null; free = null; - file = null; } /** diff --git a/source/de/anomic/kelondro/kelondroBytesLongMap.java b/source/de/anomic/kelondro/kelondroBytesLongMap.java index 3c90867f6..2c1e2d8fc 100644 --- a/source/de/anomic/kelondro/kelondroBytesLongMap.java +++ b/source/de/anomic/kelondro/kelondroBytesLongMap.java @@ -24,7 +24,14 @@ package de.anomic.kelondro; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.concurrent.ArrayBlockingQueue; @@ -40,10 +47,62 @@ public class kelondroBytesLongMap { private final kelondroRow rowdef; private kelondroRAMIndex index; + /** + * initialize a BytesLongMap + * This may 
store a key and a long value for each key. + * The class is used as index for database files + * @param keylength + * @param objectOrder + * @param space + */ public kelondroBytesLongMap(final int keylength, final kelondroByteOrder objectOrder, final int space) { this.rowdef = new kelondroRow(new kelondroColumn[]{new kelondroColumn("key", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, keylength, "key"), new kelondroColumn("long c-8 {b256}")}, objectOrder, 0); this.index = new kelondroRAMIndex(rowdef, space); } + + /** + * initialize a BytesLongMap with the content of a dumped index + * @param keylength + * @param objectOrder + * @param file + * @throws IOException + */ + public kelondroBytesLongMap(final int keylength, final kelondroByteOrder objectOrder, final File file) throws IOException { + this(keylength, objectOrder, (int) (file.length() / (keylength + 8))); + // read the index dump and fill the index + InputStream is = new BufferedInputStream(new FileInputStream(file), 1024 * 1024); + byte[] a = new byte[keylength + 8]; + int c; + while (true) { + c = is.read(a); + if (c <= 0) break; + this.index.addUnique(this.rowdef.newEntry(a)); + } + assert this.index.size() == file.length() / (keylength + 8); + } + + /** + * write a dump of the index to a file. All entries are written in order + * which makes it possible to read them again in a fast way + * @param file + * @return the number of written entries + * @throws IOException + */ + public int dump(File file) throws IOException { + // we must use an iterator from the combined index, because we need the entries sorted + // otherwise we could just write the byte[] from the in kelondroRowSet which would make + // everything much faster, but this is not an option here. 
+ Iterator i = this.index.rows(true, null); + OutputStream os = new BufferedOutputStream(new FileOutputStream(file), 1024 * 1024); + int c = 0; + while (i.hasNext()) { + os.write(i.next().bytes()); + c++; + } + os.flush(); + os.close(); + return c; + } public kelondroRow row() { return index.row(); diff --git a/source/de/anomic/kelondro/kelondroRAMIndex.java b/source/de/anomic/kelondro/kelondroRAMIndex.java index 8a8b1a570..cbb0b4cad 100644 --- a/source/de/anomic/kelondro/kelondroRAMIndex.java +++ b/source/de/anomic/kelondro/kelondroRAMIndex.java @@ -43,8 +43,6 @@ public class kelondroRAMIndex implements kelondroIndex { reset(initialspace); } - // TODO: import and export method to write index completely as-is to file and restore it again - public void clear() { reset(0); }