From 117ae78001e9783d57bad4f5c7290151b0ea3d99 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 6 Apr 2008 11:50:15 +0000 Subject: [PATCH] speed enhancement for reading of eco-table indexes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4647 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/yacy/ui/result.java | 2 +- .../kelondro/kelondroBufferedEcoFS.java | 19 ----- source/de/anomic/kelondro/kelondroEcoFS.java | 62 +++++++++++++- .../de/anomic/kelondro/kelondroEcoTable.java | 85 +++++++++++++------ .../kelondro/kelondroRowCollection.java | 2 +- .../de/anomic/kelondro/kelondroSortStack.java | 4 +- .../de/anomic/plasma/plasmaSwitchboard.java | 2 +- 8 files changed, 124 insertions(+), 54 deletions(-) diff --git a/build.properties b/build.properties index 54aabe816..26dac34b5 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.577 +releaseVersion=0.578 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/htroot/yacy/ui/result.java b/htroot/yacy/ui/result.java index bf916aff6..38601b547 100644 --- a/htroot/yacy/ui/result.java +++ b/htroot/yacy/ui/result.java @@ -259,7 +259,7 @@ public class result { prop.put("excluded", "0"); } - if (prop == null || prop.size() == 0) { + if (prop == null || prop.isEmpty()) { if (post.get("search", "").length() < 3) { prop.put("num-results", "2"); // no results - at least 3 chars } else { diff --git a/source/de/anomic/kelondro/kelondroBufferedEcoFS.java b/source/de/anomic/kelondro/kelondroBufferedEcoFS.java index b54fc67ab..7b3dd7711 100644 --- a/source/de/anomic/kelondro/kelondroBufferedEcoFS.java +++ b/source/de/anomic/kelondro/kelondroBufferedEcoFS.java @@ -107,26 +107,7 @@ public class kelondroBufferedEcoFS { public synchronized void add(byte[] b, int start) throws IOException { put(size(), b, start); } -/* - public synchronized void clean(long index, byte[] b, int start) throws IOException { - assert b.length - start >= efs.recordsize; - if (index >= size()) throw new IndexOutOfBoundsException("kelondroBufferedEcoFS.clean(" + index + ") outside bounds (" + this.size() + ")"); - byte[] bb = buffer.get(new Long(index)); - if (bb == null) { - efs.clean(index, b, start); - } else { - System.arraycopy(bb, 0, b, start, efs.recordsize); - buffer.remove(new Long(index)); - efs.clean(index); - } - } - public synchronized void clean(long index) throws IOException { - if (index >= size()) throw new IndexOutOfBoundsException("kelondroBufferedEcoFS.clean(" + index + ") outside bounds (" + this.size() + ")"); - buffer.remove(new Long(index)); - efs.clean(index); - } -*/ public synchronized void cleanLast(byte[] b, int start) throws IOException { assert b.length - start >= efs.recordsize; Long i = new Long(size() - 1); diff --git a/source/de/anomic/kelondro/kelondroEcoFS.java b/source/de/anomic/kelondro/kelondroEcoFS.java index 7f35fc695..3a762b901 100644 --- a/source/de/anomic/kelondro/kelondroEcoFS.java +++ b/source/de/anomic/kelondro/kelondroEcoFS.java @@ -24,11 +24,15 @@ package de.anomic.kelondro; +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.RandomAccessFile; +import java.util.Iterator; /** * The EcoFS is a flat file with records of fixed length. The file does not contain @@ -117,7 +121,7 @@ public class kelondroEcoFS { assert size % recordsize == 0; return size / (long) recordsize; } - + /** * @return the number of records in file plus number of records in buffer * @throws IOException @@ -522,6 +526,62 @@ public class kelondroEcoFS { this.raf.setLength((long) (this.size() - 1) * (long) this.recordsize); } + public static class ChunkIterator implements Iterator { + + private int recordsize, chunksize; + private InputStream stream; + + /** + * create a ChunkIterator + * a ChunkIterator uses a BufferedInputStream to iterate through the file + * and is therefore a fast option to get all elements in the file as a sequence + * @param file: the eco-file + * @param recordsize: the size of the elements in the file + * @param chunksize: the size of the chunks that are returned by next(). remaining bytes until the lenght of recordsize are skipped + * @throws FileNotFoundException + */ + public ChunkIterator(File file, int recordsize, int chunksize) throws FileNotFoundException { + assert (file.exists()); + assert file.length() % recordsize == 0; + this.recordsize = recordsize; + this.chunksize = chunksize; + this.stream = new BufferedInputStream(new FileInputStream(file), 64 * 1024); + } + + public boolean hasNext() { + try { + return stream != null && stream.available() > 0; + } catch (IOException e) { + return false; + } + } + + public byte[] next() { + byte[] chunk = new byte[chunksize]; + int r; + try { + // read the chunk + r = this.stream.read(chunk); + while (r < chunksize) { + r += this.stream.read(chunk, r, chunksize - r); + } + // skip remaining bytes + while (r < recordsize) { + r += this.stream.skip(recordsize - r); + } + return chunk; + } catch (IOException e) { + this.stream = null; + return null; + } + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + } + /** * main - writes some data and checks the tables size (with time measureing) * @param args diff --git a/source/de/anomic/kelondro/kelondroEcoTable.java b/source/de/anomic/kelondro/kelondroEcoTable.java index 0e091a344..af456c24f 100644 --- a/source/de/anomic/kelondro/kelondroEcoTable.java +++ b/source/de/anomic/kelondro/kelondroEcoTable.java @@ -95,10 +95,10 @@ public class kelondroEcoTable implements kelondroIndex { try { // open an existing table file - this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize); - + int fileSize = (int) tableSize(tablefile, rowdef.objectsize); + // initialize index and copy table - int records = (int) Math.max(file.size(), initialSpace); + int records = (int) Math.max(fileSize, initialSpace); long neededRAM4table = ((long) records) * (((long) rowdef.objectsize) + 4L) * 3L; table = ((neededRAM4table < maxarraylength) && ((useTailCache == tailCacheForceUsage) || @@ -118,52 +118,69 @@ public class kelondroEcoTable implements kelondroIndex { System.out.println("*** DEBUG " + tablefile + ": EcoTable " + tablefile.toString() + " has table copy " + ((table == null) ? "DISABLED" : "ENABLED")); // read all elements from the file into the copy table - byte[] record = new byte[rowdef.objectsize]; - byte[] key = new byte[rowdef.primaryKeyLength]; - int fs = (int) file.size(); System.out.print("*** initializing RAM index for EcoTable " + tablefile.getName() + ":"); - for (int i = 0; i < fs; i++) { - // read entry - file.get(i, record, 0); + int i = 0; + byte[] key; + if (table == null) { + Iterator ki = keyIterator(tablefile, rowdef); + while (ki.hasNext()) { + key = ki.next(); + + // write the key into the index table + assert key != null; + if (key == null) {i++; continue;} + index.addi(key, i++); - // write the key into the index table - System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength); - index.addi(key, i); + if ((i % 10000) == 0) { + System.out.print('.'); + System.out.flush(); + } + } + } else { + byte[] record; + key = new byte[rowdef.primaryKeyLength]; + Iterator ri = new kelondroEcoFS.ChunkIterator(tablefile, rowdef.objectsize, rowdef.objectsize); + while (ri.hasNext()) { + record = ri.next(); + assert record != null; + if (record == null) {i++; continue;} + System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength); + + // write the key into the index table + index.addi(key, i++); - // write the tail into the table - if (table != null) table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true)); + // write the tail into the table + table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true)); - if ((i % 10000) == 0) { - System.out.print('.'); - System.out.flush(); + if ((i % 10000) == 0) { + System.out.print('.'); + System.out.flush(); + } } } + + // check consistency System.out.print(" -ordering- .."); System.out.flush(); - // check consistency + this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize); ArrayList doubles = index.removeDoubles(); System.out.println(" -removed " + doubles.size() + " doubles- done."); if (doubles.size() > 0) { System.out.println("DEBUG " + tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles"); // from all the doubles take one, put it back to the index and remove the others from the file - Iterator i = doubles.iterator(); - Integer[] ds; // first put back one element each - while (i.hasNext()) { - ds = i.next(); + byte[] record = new byte[rowdef.objectsize]; + key = new byte[rowdef.primaryKeyLength]; + for (Integer[] ds: doubles) { file.get(ds[0].longValue(), record, 0); System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength); index.addi(key, ds[0].intValue()); } // then remove the other doubles by removing them from the table, but do a re-indexing while doing that // first aggregate all the delete positions because the elements from the top positions must be removed first - i = doubles.iterator(); TreeSet delpos = new TreeSet(); - while (i.hasNext()) { - ds = i.next(); - for (int j = 1; j < ds.length; j++) { - delpos.add(ds[j]); - } + for (Integer[] ds: doubles) { + for (int j = 1; j < ds.length; j++) delpos.add(ds[j]); } // now remove the entries in a sorted way (top-down) Integer top; @@ -191,6 +208,18 @@ public class kelondroEcoTable implements kelondroIndex { tableTracker.put(tablefile.toString(), this); } + /** + * a KeyIterator + * @param file: the eco-file + * @param rowdef: the row definition + * @throws FileNotFoundException + * @return an iterator for all keys in the file + */ + public Iterator keyIterator(File file, kelondroRow rowdef) throws FileNotFoundException { + assert rowdef.primaryKeyIndex == 0; + return new kelondroEcoFS.ChunkIterator(file, rowdef.objectsize, rowdef.primaryKeyLength); + } + public static long tableSize(File tablefile, int recordsize) { // returns number of records in table return kelondroEcoFS.tableSize(tablefile, recordsize); diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index 824da2bdf..2a84a0cc7 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -477,7 +477,7 @@ public class kelondroRowCollection { public synchronized void select(Set keys) { // removes all entries but the ones given by urlselection - if ((keys == null) || (keys.size() == 0)) return; + if ((keys == null) || (keys.isEmpty())) return; Iterator i = rows(); kelondroRow.Entry row; while (i.hasNext()) { diff --git a/source/de/anomic/kelondro/kelondroSortStack.java b/source/de/anomic/kelondro/kelondroSortStack.java index f3da6d989..8cefffe0c 100644 --- a/source/de/anomic/kelondro/kelondroSortStack.java +++ b/source/de/anomic/kelondro/kelondroSortStack.java @@ -78,7 +78,7 @@ public class kelondroSortStack { public synchronized stackElement top() { // returns the element that is currently on top of the stack - if (this.onstack.size() == 0) return null; + if (this.onstack.isEmpty()) return null; Long w = this.onstack.firstKey(); E element = this.onstack.get(w); return new stackElement(element, w); @@ -88,7 +88,7 @@ public class kelondroSortStack { // returns the element that is currently on top of the stack // it is removed and added to the offstack list // this is exactly the same as element(offstack.size()) - if (this.onstack.size() == 0) return null; + if (this.onstack.isEmpty()) return null; Long w = this.onstack.firstKey(); E element = this.onstack.remove(w); stackElement se = new stackElement(element, w); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 58b2ead28..896fc1a3e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1643,7 +1643,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch