@@ -95,10 +95,10 @@ public class kelondroEcoTable implements kelondroIndex {

try {
// open an existing table file
this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize);

int fileSize = (int) tableSize(tablefile, rowdef.objectsize);

// initialize index and copy table
int records = (int) Math.max(file.size(), initialSpace);
int records = (int) Math.max(fileSize, initialSpace);
long neededRAM4table = ((long) records) * (((long) rowdef.objectsize) + 4L) * 3L;
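// i.e. roughly (objectsize + 4) bytes per record, times a factor of 3, as the RAM needed for a full table copy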
table = ((neededRAM4table < maxarraylength) &&
((useTailCache == tailCacheForceUsage) ||
@@ -118,52 +118,69 @@ public class kelondroEcoTable implements kelondroIndex {
System.out.println("*** DEBUG " + tablefile + ": EcoTable " + tablefile.toString() + " has table copy " + ((table == null) ? "DISABLED" : "ENABLED"));

// read all elements from the file into the copy table
byte[] record = new byte[rowdef.objectsize];
byte[] key = new byte[rowdef.primaryKeyLength];
int fs = (int) file.size();
System.out.print("*** initializing RAM index for EcoTable " + tablefile.getName() + ":");
for (int i = 0; i < fs; i++) {
// read entry
file.get(i, record, 0);
int i = 0;
byte[] key;
if (table == null) {
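// no table copy in RAM: build the index by scanning only the key of each record (via keyIterator below)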
Iterator<byte[]> ki = keyIterator(tablefile, rowdef);
while (ki.hasNext()) {
key = ki.next();

// write the key into the index table
assert key != null;
if (key == null) {i++; continue;}
index.addi(key, i++);

// write the key into the index table
System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
index.addi(key, i);
if ((i % 10000) == 0) {
System.out.print('.');
System.out.flush();
}
}
} else {
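// table copy enabled: read every full record, index its key and keep the tail in the RAM table copy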
byte[] record;
key = new byte[rowdef.primaryKeyLength];
Iterator<byte[]> ri = new kelondroEcoFS.ChunkIterator(tablefile, rowdef.objectsize, rowdef.objectsize);
while (ri.hasNext()) {
record = ri.next();
assert record != null;
if (record == null) {i++; continue;}
System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);

// write the key into the index table
index.addi(key, i++);

// write the tail into the table
if (table != null) table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));
// write the tail into the table
table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));

if ((i % 10000) == 0) {
System.out.print('.');
System.out.flush();
if ((i % 10000) == 0) {
System.out.print('.');
System.out.flush();
}
}
}

// check consistency
System.out.print(" -ordering- ..");
System.out.flush();
// check consistency
this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize);
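// note: the buffered file is opened only at this point, after the scan above, which reads tablefile directly through the chunk iterators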
ArrayList<Integer[]> doubles = index.removeDoubles();
System.out.println(" -removed " + doubles.size() + " doubles- done.");
if (doubles.size() > 0) {
System.out.println("DEBUG " + tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
// from all the doubles take one, put it back to the index and remove the others from the file
Iterator<Integer[]> i = doubles.iterator();
Integer[] ds;
// first put back one element each
while (i.hasNext()) {
ds = i.next();
byte[] record = new byte[rowdef.objectsize];
key = new byte[rowdef.primaryKeyLength];
for (Integer[] ds: doubles) {
file.get(ds[0].longValue(), record, 0);
System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
index.addi(key, ds[0].intValue());
}
// then remove the other doubles by removing them from the table, but do a re-indexing while doing that
// first aggregate all the delete positions because the elements from the top positions must be removed first
i = doubles.iterator();
TreeSet<Integer> delpos = new TreeSet<Integer>();
while (i.hasNext()) {
ds = i.next();
for (int j = 1; j < ds.length; j++) {
delpos.add(ds[j]);
}
for (Integer[] ds: doubles) {
for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
}
// now remove the entries in a sorted way (top-down)
Integer top;
@@ -191,6 +208,18 @@ public class kelondroEcoTable implements kelondroIndex {
tableTracker.put(tablefile.toString(), this);
}

/**
* a KeyIterator
* @param file: the eco-file
* @param rowdef: the row definition
* @throws FileNotFoundException
* @return an iterator for all keys in the file
*/
public Iterator<byte[]> keyIterator(File file, kelondroRow rowdef) throws FileNotFoundException {
assert rowdef.primaryKeyIndex == 0;
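// the chunk iterator below steps through the file one record (objectsize) at a time and returns only the leading primaryKeyLength bytes, i.e. the key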
return new kelondroEcoFS.ChunkIterator(file, rowdef.objectsize, rowdef.primaryKeyLength);
}

public static long tableSize(File tablefile, int recordsize) {
// returns number of records in table
return kelondroEcoFS.tableSize(tablefile, recordsize);