added new row iterator in kelondro table files that enumerates rows

without an order by the primary key. The result is a very fast enumeration of the Eco table data structure. Other table data types are not affected. The new enumerator is used for the url export function that can be accessed from the online interface (Index Administration -> URL References -> Export). This export should now be much faster, if all url database files are of type Eco. The new enumeration is also used by other functions in YaCy, e.g. the initialization of the crawl balancer and the initialization of YaCy News. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5647 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 8444357291
parent e8f5f2f612
commit 8444357291
15 changed files with 219 additions and 20 deletions
--- a/source/dbtest.java
+++ b/source/dbtest.java
@ -222,7 +222,7 @@ public class dbtest {
        boolean eq = true;
        Row.Entry test_entry, reference_entry;
        
-        Iterator<Row.Entry> i = test.rows(true, null);
+        Iterator<Row.Entry> i = test.rows();
        System.out.println("* Testing now by enumeration over test table");
        final long ts = System.currentTimeMillis();
        while (i.hasNext()) {
@ -234,7 +234,7 @@ public class dbtest {
            }
        }
        
-        i = reference.rows(true, null);
+        i = reference.rows();
        System.out.println("* Testing now by enumeration over reference table");
        final long rs = System.currentTimeMillis();
        while (i.hasNext()) {
@ -390,8 +390,8 @@ public class dbtest {
            
            if (command.equals("list")) {
                CloneableIterator<Row.Entry> i = null;
-                if (table_test instanceof Tree) i = ((Tree) table_test).rows(true, null);
-                if (table_test instanceof SQLTable) i = ((SQLTable) table_test).rows(true, null);
+                if (table_test instanceof Tree) i = ((Tree) table_test).rows();
+                if (table_test instanceof SQLTable) i = ((SQLTable) table_test).rows();
                if(i != null) {
                    Row.Entry row;
                    while (i.hasNext()) {
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@ -164,7 +164,7 @@ public class Balancer {
        // returns number of deletions
        
        // first find a list of url hashes that shall be deleted
-        final Iterator<Row.Entry> i = urlFileIndex.rows(true, null);
+        final Iterator<Row.Entry> i = urlFileIndex.rows();
        final HashSet<String> urlHashes = new HashSet<String>();
        Row.Entry rowEntry;
        CrawlEntry crawlEntry;
@ -646,7 +646,7 @@ public class Balancer {
        private Iterator<Row.Entry> rowIterator;
        
        public EntryIterator() throws IOException {
-            rowIterator = urlFileIndex.rows(true, null);
+            rowIterator = urlFileIndex.rows();
        }
        
        public boolean hasNext() {
--- a/source/de/anomic/index/indexRepositoryReference.java
+++ b/source/de/anomic/index/indexRepositoryReference.java
@ -151,6 +151,11 @@ public final class indexRepositoryReference {
        return urlIndexFile.has(urlHash.getBytes());
    }

+    public CloneableIterator<indexURLReference> entries() throws IOException {
+        // enumerates entry elements
+        return new kiter();
+    }
+
    public CloneableIterator<indexURLReference> entries(final boolean up, final String firstHash) throws IOException {
        // enumerates entry elements
        return new kiter(up, firstHash);
@ -162,6 +167,12 @@ public final class indexRepositoryReference {
        private final boolean error;
        boolean up;

+        public kiter() throws IOException {
+            this.up = true;
+            this.iter = urlIndexFile.rows();
+            this.error = false;
+        }
+
        public kiter(final boolean up, final String firstHash) throws IOException {
            this.up = up;
            this.iter = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes());
@ -439,7 +450,7 @@ public final class indexRepositoryReference {
                        count++;
                    }
                } else {
-                    final Iterator<indexURLReference> i = entries(true, null); // iterates indexURLEntry objects
+                    final Iterator<indexURLReference> i = entries(); // iterates indexURLEntry objects
                    indexURLReference entry;
                    indexURLReference.Components comp;
                    String url;
--- a/source/de/anomic/kelondro/blob/Cache.java
+++ b/source/de/anomic/kelondro/blob/Cache.java
@ -454,6 +454,10 @@ public class Cache implements ObjectIndex {
        return index.rows(up, firstKey);
    }

+    public synchronized CloneableIterator<Row.Entry> rows() throws IOException {
+        return index.rows();
+    }
+
    public int size() {
        return index.size();
    }
--- a/source/de/anomic/kelondro/index/IntBytesMap.java
+++ b/source/de/anomic/kelondro/index/IntBytesMap.java
@ -188,17 +188,11 @@ public class IntBytesMap {
                index0.uniq();
                index1 = new RowSet(rowdef, 0);
            }
-            return index0.rows(true, null);
+            return index0.rows();
        } else {
        	assert (index1 != null);
-            return index1.rows(true, null);
+            return index1.rows();
        }
-//        return new kelondroMergeIterator<kelondroRow.Entry>(
-//    				index0.rows(true, null),
-//    				index1.rows(true, null),
-//    				entryOrder,
-//    				kelondroMergeIterator.simpleMerge,
-//                    true);
    }
    
    public void flush() {
--- a/source/de/anomic/kelondro/index/ObjectIndex.java
+++ b/source/de/anomic/kelondro/index/ObjectIndex.java
@ -54,7 +54,8 @@ public interface ObjectIndex {
    public Row.Entry remove(byte[] key) throws IOException;
    public Row.Entry removeOne() throws IOException;
    public CloneableIterator<byte[]> keys(boolean up, byte[] firstKey) throws IOException; // iterates only the key
-    public CloneableIterator<Row.Entry> rows(boolean up, byte[] firstKey) throws IOException; // iterates the whole row
+    public CloneableIterator<Row.Entry> rows(boolean up, byte[] firstKey) throws IOException; // iterates the whole row using the order of the keys
+    public CloneableIterator<Row.Entry> rows() throws IOException; // iterates the whole row without any order
    public void deleteOnExit();
    public void clear() throws IOException;
    public void close();
--- a/source/de/anomic/kelondro/index/RAMIndex.java
+++ b/source/de/anomic/kelondro/index/RAMIndex.java
@ -24,6 +24,7 @@

 package de.anomic.kelondro.index;

+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.Iterator;
@ -32,6 +33,7 @@ import java.util.List;
 import de.anomic.kelondro.index.Row.Entry;
 import de.anomic.kelondro.order.CloneableIterator;
 import de.anomic.kelondro.order.MergeIterator;
+import de.anomic.kelondro.order.StackIterator;

 public class RAMIndex implements ObjectIndex {
    
@ -226,6 +228,27 @@ public class RAMIndex implements ObjectIndex {
                true);
    }
    
+    public synchronized CloneableIterator<Row.Entry> rows() throws IOException {
+        // returns the row-iterator of the underlying kelondroIndex
+        if (index1 == null) {
+            // finish initialization phase
+            index0.sort();
+            index0.uniq();
+            index1 = new RowSet(rowdef, 0);
+            return index0.rows();
+        }
+        assert (index1 != null);
+        if (index0 == null) {
+            //assert consistencyAnalysis0() : "consistency problem: " + consistencyAnalysis();
+            return index1.rows();
+        }
+        // index0 should be sorted
+        // sort index1 to enable working of the merge iterator
+        index1.sort();
+        //assert consistencyAnalysis0() : "consistency problem: " + consistencyAnalysis();
+        return new StackIterator<Row.Entry>(index0.rows(), index1.rows());
+    }
+    
    public synchronized void close() {
        if (index0 != null) index0.close();
        if (index1 != null) index1.close();
--- a/source/de/anomic/kelondro/index/RowSet.java
+++ b/source/de/anomic/kelondro/index/RowSet.java
@ -336,6 +336,10 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
        return new rowIterator(up, firstKey);
    }
    
+    public synchronized CloneableIterator<Row.Entry> rows() {
+        return new rowIterator(true, null);
+    }
+    
    public class rowIterator implements CloneableIterator<Row.Entry> {

        private final boolean up;
--- a/source/de/anomic/kelondro/order/StackIterator.java
+++ b/source/de/anomic/kelondro/order/StackIterator.java
@ -0,0 +1,105 @@
+// StackIterator.java
+// --------------------------
+// part of The Kelondro Database
+// (C) by Michael Peter Christen; mc@yacy.net
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2009
+// last major change: 23.02.2009
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package de.anomic.kelondro.order;
+
+import java.util.Collection;
+import java.util.ConcurrentModificationException;
+import java.util.Iterator;
+
+
+public class StackIterator<E> implements CloneableIterator<E> {
+    
+    private CloneableIterator<E> a, b;
+    private E na, nb;
+    
+    public StackIterator(
+            final CloneableIterator<E> a,
+            final CloneableIterator<E> b) {
+        // this works currently only for String-type key iterations
+        this.a = a;
+        this.b = b;
+        nexta();
+        nextb();
+    }
+
+    public StackIterator<E> clone(final Object modifier) {
+        return new StackIterator<E>(a.clone(modifier), b.clone(modifier));
+    }
+    
+    private void nexta() {
+        try {
+            if ((a != null) && (a.hasNext())) na = a.next(); else na = null;
+        } catch (final ConcurrentModificationException e) {
+            na = null;
+        }
+    }
+    private void nextb() {
+        try {
+            if ((b != null) && (b.hasNext())) nb = b.next(); else nb = null;
+        } catch (final ConcurrentModificationException e) {
+            nb = null;
+        }
+    }
+    
+    public boolean hasNext() {
+        return (na != null) || (nb != null);
+    }
+    
+    public E next() {
+        E s;
+        if (na == null) {
+            s = nb;
+            nextb();
+            return s;
+        }
+        if (nb == null) {
+            s = na;
+            nexta();
+            return s;
+        }
+        // just stack the Objects
+        s = na;
+        nexta();
+        return s;
+    }
+    
+    public void remove() {
+        throw new java.lang.UnsupportedOperationException("merge does not support remove");
+    }
+    
+    public static <A> CloneableIterator<A> stack(final Collection<CloneableIterator<A>> iterators) {
+        // this extends the ability to combine two iterators
+        // to the ability of combining a set of iterators
+        if (iterators == null) return null;
+        if (iterators.size() == 0) return null;
+        return stack(iterators.iterator());
+    }
+    
+    private static <A> CloneableIterator<A> stack(final Iterator<CloneableIterator<A>> iiterators) {
+        if (iiterators == null) return null;
+        if (!(iiterators.hasNext())) return null;
+        final CloneableIterator<A> one = iiterators.next();
+        if (!(iiterators.hasNext())) return one;
+        return new StackIterator<A>(one, stack(iiterators));
+    }
+}
--- a/source/de/anomic/kelondro/table/EcoTable.java
+++ b/source/de/anomic/kelondro/table/EcoTable.java
@ -79,11 +79,12 @@ public class EcoTable implements ObjectIndex {
    BufferedEcoFS file;
    Row rowdef;
    int fail;
-
+    File tablefile;
    Row taildef;
    private final int buffersize;
    
    public EcoTable(final File tablefile, final Row rowdef, final int useTailCache, final int buffersize, final int initialSpace) {
+        this.tablefile = tablefile;
        this.rowdef = rowdef;
        this.buffersize = buffersize;
        //this.fail = 0;
@ -574,6 +575,40 @@ public class EcoTable implements ObjectIndex {
        return index.size();
    }

+    public synchronized CloneableIterator<Entry> rows() throws IOException {
+        return new rowIteratorNoOrder();
+    }
+
+    public class rowIteratorNoOrder implements CloneableIterator<Entry> {
+        final Iterator<byte[]> ri;
+        
+        public rowIteratorNoOrder() throws IOException {
+            ri = new ChunkIterator(tablefile, rowdef.objectsize, rowdef.objectsize);
+        }
+        
+        public CloneableIterator<Entry> clone(Object modifier) {
+            try {
+                return new rowIteratorNoOrder();
+            } catch (IOException e) {
+                e.printStackTrace();
+                return null;
+            }
+        }
+        
+        public boolean hasNext() {
+            return ri.hasNext();
+        }
+        
+        public Entry next() {
+            byte[] r = ri.next();
+            return rowdef.newEntry(r);
+        }
+        
+        public void remove() {
+            throw new UnsupportedOperationException("no remove in row iterator");
+        }
+        
+    }

    public synchronized CloneableIterator<Entry> rows(final boolean up, final byte[] firstKey) throws IOException {
        return new rowIterator(up, firstKey);
--- a/source/de/anomic/kelondro/table/FlexTable.java
+++ b/source/de/anomic/kelondro/table/FlexTable.java
@ -345,10 +345,14 @@ public class FlexTable extends FlexWidthArray implements ObjectIndex {
    	return index.keys(up, firstKey);
    }
    
+    public synchronized CloneableIterator<Row.Entry> rows() throws IOException {
+        return new rowIterator(true, null);
+    }
+    
    public synchronized CloneableIterator<Row.Entry> rows(final boolean up, final byte[] firstKey) throws IOException {
        if (index == null) return new rowIterator(up, firstKey);
        assert this.size() == index.size() : "content.size() = " + this.size() + ", index.size() = " + index.size();
-		return new rowIterator(up, firstKey);
+        return new rowIterator(up, firstKey);
    }
    
    public class rowIterator implements CloneableIterator<Row.Entry> {
--- a/source/de/anomic/kelondro/table/SQLTable.java
+++ b/source/de/anomic/kelondro/table/SQLTable.java
@ -258,6 +258,10 @@ public class SQLTable implements ObjectIndex {
        return null;
    }

+    public CloneableIterator<Row.Entry> rows() throws IOException {
+        return null;
+    }
+
    public CloneableIterator<byte[]> keys(final boolean up, final byte[] startKey) {
        // Objects are of type byte[]
        return null;
--- a/source/de/anomic/kelondro/table/SplitTable.java
+++ b/source/de/anomic/kelondro/table/SplitTable.java
@ -56,6 +56,7 @@ import de.anomic.kelondro.order.CloneableIterator;
 import de.anomic.kelondro.order.NaturalOrder;
 import de.anomic.kelondro.order.MergeIterator;
 import de.anomic.kelondro.order.Order;
+import de.anomic.kelondro.order.StackIterator;
 import de.anomic.kelondro.util.Log;
 import de.anomic.kelondro.util.NamePrefixThreadFactory;

@ -380,6 +381,15 @@ public class SplitTable implements ObjectIndex {
        }
        return MergeIterator.cascade(c, entryOrder, MergeIterator.simpleMerge, up);
    }
+    
+    public synchronized CloneableIterator<Row.Entry> rows() throws IOException {
+        final List<CloneableIterator<Row.Entry>> c = new ArrayList<CloneableIterator<Row.Entry>>(tables.size());
+        final Iterator<ObjectIndex> i = tables.values().iterator();
+        while (i.hasNext()) {
+            c.add(i.next().rows());
+        }
+        return StackIterator.stack(c);
+    }

    public final int cacheObjectChunkSize() {
        // dummy method
--- a/source/de/anomic/kelondro/table/Tree.java
+++ b/source/de/anomic/kelondro/table/Tree.java
@ -1007,6 +1007,10 @@ public class Tree extends CachedRecords implements ObjectIndex {
        return new rowIterator(up, firstKey, this.size());
    }
    
+    public CloneableIterator<Row.Entry> rows() throws IOException {
+        return new rowIterator(true, null, this.size());
+    }
+    
    public class rowIterator implements CloneableIterator<Row.Entry> {
        
        int chunkSize;
@ -1615,7 +1619,7 @@ public class Tree extends CachedRecords implements ObjectIndex {
    public static int countElements(final ObjectIndex t) {
        int count = 0;
        try {
-            final Iterator<Row.Entry> iter = t.rows(true, null);
+            final Iterator<Row.Entry> iter = t.rows();
            Row.Entry row;
            while (iter.hasNext()) {
                count++;
--- a/source/de/anomic/yacy/yacyNewsDB.java
+++ b/source/de/anomic/yacy/yacyNewsDB.java
@ -110,7 +110,7 @@ public class yacyNewsDB {
        Iterator<Row.Entry> rowIterator;

        public recordIterator() throws IOException {
-            rowIterator = news.rows(true, null);
+            rowIterator = news.rows();
        }

        public boolean hasNext() {