From be4c4589514c8354f98f75ace4c385e849037b10 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Fri, 2 Jan 2009 11:38:20 +0000
Subject: [PATCH] refactoring (implemented Iterable in kelondroRowCollection)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5432 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/index/indexContainer.java    |  2 +-
 .../de/anomic/index/indexContainerHeap.java   |  1 +
 source/de/anomic/index/indexRAMRI.java        |  2 +
 .../anomic/kelondro/kelondroBytesIntMap.java  |  7 +-
 .../anomic/kelondro/kelondroBytesLongMap.java |  6 +-
 .../kelondro/kelondroCollectionIndex.java     | 37 ++------
 .../kelondro/kelondroRowCollection.java       | 91 +++++++++++++++++--
 source/de/anomic/kelondro/kelondroRowSet.java |  8 +-
 .../anomic/plasma/plasmaRankingCRProcess.java |  6 +-
 source/de/anomic/plasma/plasmaWordIndex.java  |  2 +-
 source/de/anomic/server/serverFileUtils.java  |  2 +-
 11 files changed, 108 insertions(+), 56 deletions(-)

diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java
index e642e8086..b7af32abf 100644
--- a/source/de/anomic/index/indexContainer.java
+++ b/source/de/anomic/index/indexContainer.java
@@ -175,7 +175,7 @@ public class indexContainer extends kelondroRowSet {
         Iterator<kelondroRow.Entry> rowEntryIterator;
         
         public entryIterator() {
-            rowEntryIterator = rows();
+            rowEntryIterator = iterator();
         }
         
         public boolean hasNext() {
diff --git a/source/de/anomic/index/indexContainerHeap.java b/source/de/anomic/index/indexContainerHeap.java
index 76ba034f0..22bdb9bfc 100755
--- a/source/de/anomic/index/indexContainerHeap.java
+++ b/source/de/anomic/index/indexContainerHeap.java
@@ -117,6 +117,7 @@ public final class indexContainerHeap {
             for (final indexContainer container : new blobFileEntries(blobFile, this.payloadrow)) {
                 // TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low?
                 if (container == null) break;
+                //System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted());
                 cache.put(container.getWordHash(), container);
                 urlCount += container.size();
             }
diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java
index ae86f4927..8d22e4d0e 100644
--- a/source/de/anomic/index/indexRAMRI.java
+++ b/source/de/anomic/index/indexRAMRI.java
@@ -193,6 +193,7 @@ public final class indexRAMRI implements indexRI, indexRIReader {
         // - the entry with maximum count
         if (heap.size() == 0) return null;
         try {
+            //return hashScore.getMaxObject();
             String hash = null;
             final int count = hashScore.getMaxScore();
             if ((count >= cacheReferenceCountLimit) &&
@@ -220,6 +221,7 @@ public final class indexRAMRI implements indexRI, indexRIReader {
                 if (ic != null) hash = ic.getWordHash();
             }
             return hash;
+            
         } catch (final Exception e) {
             log.logSevere("flushFromMem: " + e.getMessage(), e);
         }
diff --git a/source/de/anomic/kelondro/kelondroBytesIntMap.java b/source/de/anomic/kelondro/kelondroBytesIntMap.java
index d5a10ccfd..e1b15004a 100644
--- a/source/de/anomic/kelondro/kelondroBytesIntMap.java
+++ b/source/de/anomic/kelondro/kelondroBytesIntMap.java
@@ -26,7 +26,6 @@ package de.anomic.kelondro;
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Iterator;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.Callable;
@@ -88,15 +87,13 @@ public class kelondroBytesIntMap {
     public synchronized ArrayList<Integer[]> removeDoubles() throws IOException {
         final ArrayList<Integer[]> report = new ArrayList<Integer[]>();
         Integer[] is;
-        Iterator<kelondroRow.Entry> ei;
         int c, i;
         final int initialSize = this.size();
         for (final kelondroRowCollection delset: index.removeDoubles()) {
             is = new Integer[delset.size()];
-            ei = delset.rows();
             c = 0;
-            while (ei.hasNext()) {
-                i = (int) ei.next().getColLong(1);
+            for (kelondroRow.Entry e : delset) {
+                i = (int) e.getColLong(1);
                 assert i < initialSize : "i = " + i + ", initialSize = " + initialSize;
                 is[c++] = Integer.valueOf(i);
             }
diff --git a/source/de/anomic/kelondro/kelondroBytesLongMap.java b/source/de/anomic/kelondro/kelondroBytesLongMap.java
index 6ceb67f6a..983205d52 100644
--- a/source/de/anomic/kelondro/kelondroBytesLongMap.java
+++ b/source/de/anomic/kelondro/kelondroBytesLongMap.java
@@ -143,14 +143,12 @@ public class kelondroBytesLongMap {
         final ArrayList<kelondroRowCollection> indexreport = index.removeDoubles();
         final ArrayList<Long[]> report = new ArrayList<Long[]>();
         Long[] is;
-        Iterator<kelondroRow.Entry> ei;
         int c;
         for (final kelondroRowCollection rowset: indexreport) {
             is = new Long[rowset.size()];
-            ei = rowset.rows();
             c = 0;
-            while (ei.hasNext()) {
-                is[c++] = Long.valueOf(ei.next().getColLong(1));
+            for (kelondroRow.Entry e: rowset) {
+                is[c++] = Long.valueOf(e.getColLong(1));
             }
             report.add(is);
         }
diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java
index 74a037d0f..818e472f5 100644
--- a/source/de/anomic/kelondro/kelondroCollectionIndex.java
+++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java
@@ -238,17 +238,14 @@ public class kelondroCollectionIndex {
             }
         }
         // care for double entries
-        Iterator<kelondroRow.Entry> rowiter;
         int partition, maxpartition;
-        kelondroRow.Entry entry, maxentry;
+        kelondroRow.Entry maxentry;
         int doublecount = 0;
         for (final kelondroRowCollection doubleset: index.removeDoubles()) {
             // for each entry in doubleset choose one which we want to keep
-            rowiter = doubleset.rows();
             maxentry = null;
             maxpartition = -1;
-            while (rowiter.hasNext()) {
-                entry = rowiter.next();
+            for (kelondroRow.Entry entry: doubleset) {
                 partition = (int) entry.getColLong(idx_col_clusteridx);
                 if (partition > maxpartition) {
                     maxpartition = partition;
@@ -506,7 +503,7 @@ public class kelondroCollectionIndex {
         } else {
             // merge with the old collection
             // attention! this modifies the indexrow entry which must be written with index.put(indexrow) afterwards!
-            final kelondroRowCollection collection = container;
+            kelondroRowCollection collection = container;
             
             // read old information
             final int oldchunksize       = (int) indexrow.getColLong(idx_col_chunksize);  // needed only for migration
@@ -518,13 +515,15 @@ public class kelondroCollectionIndex {
 
             // load the old collection and join it
             try {
-                collection.addAllUnique(getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false));
+                kelondroRowCollection krc = getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false);
+                //System.out.println("***DEBUG kelondroCollectionIndex.merge before merge*** krc.size = " + krc.size() + ", krc.sortbound = " + krc.sortBound + ", collection.size = " + collection.size() + ", collection.sortbound = " + collection.sortBound);
+                collection = collection.merge(krc);
+                //System.out.println("***DEBUG kelondroCollectionIndex.merge  after merge*** collection.size = " + collection.size() + ", collection.sortbound = " + collection.sortBound);
+                
             } catch (kelondroException e) {
                 // an error like "array does not contain expected row" may appear here. Just go on like if the collection does not exist
                 e.printStackTrace();
             }
-            collection.sort();
-            collection.uniq(); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries
             collection.trim(false);
             
             // check for size of collection:
@@ -564,7 +563,6 @@ public class kelondroCollectionIndex {
     }
     
     private void shrinkCollection(final byte[] key, final kelondroRowCollection collection, final int targetSize) {
-        //TODO Remove timing before release
         // removes entries from collection
         // the removed entries are stored in a 'commons' dump file
 
@@ -573,52 +571,37 @@ public class kelondroCollectionIndex {
         final int oldsize = collection.size();
         if (oldsize <= targetSize) return;
         final kelondroRowSet newcommon = new kelondroRowSet(collection.rowdef, 0);
-        long sadd1 = 0, srem1 = 0, sadd2 = 0, srem2 = 0, tot1 = 0, tot2 = 0;
-        long t1 = 0, t2 = 0;
         
         // delete some entries, which are bad rated
-        Iterator<kelondroRow.Entry> i = collection.rows();
+        Iterator<kelondroRow.Entry> i = collection.iterator();
         kelondroRow.Entry entry;
         byte[] ref;
-        t1 = System.currentTimeMillis();
         while (i.hasNext()) {
             entry = i.next();
             ref = entry.getColBytes(0);
             if ((ref.length != 12) || (!yacyURL.probablyRootURL(new String(ref)))) {
-                t2 = System.currentTimeMillis();
                 newcommon.addUnique(entry);
-                sadd1 += System.currentTimeMillis() - t2;
-                t2 = System.currentTimeMillis();
                 i.remove();
-                srem1 += System.currentTimeMillis() - t2;
             }
         }
         final int firstnewcommon = newcommon.size();
-        tot1 = System.currentTimeMillis() - t1;
         
         // check if we shrinked enough
         final Random rand = new Random(System.currentTimeMillis());
-        t1 = System.currentTimeMillis();
         while (collection.size() > targetSize) {
             // now delete randomly more entries from the survival collection
-            i = collection.rows();
+            i = collection.iterator();
             while (i.hasNext()) {
                 entry = i.next();
                 ref = entry.getColBytes(0);
                 if (rand.nextInt() % 4 != 0) {
-                    t2 = System.currentTimeMillis();
                     newcommon.addUnique(entry);
-                    sadd2 += System.currentTimeMillis() - t2;
-                    t2 = System.currentTimeMillis();
                     i.remove();
-                    srem2 += System.currentTimeMillis() - t2;
                 }
             }
         }
-        tot2 = System.currentTimeMillis() - t1;
         collection.trim(false);
         
-        serverLog.logFine("kelondroCollectionIndex", "tot= "+tot1+'/'+tot2+" # add/rem(1)= "+sadd1+'/'+srem1+" # add/rem(2)= "+sadd2+'/'+srem2);
         serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon);
         
         // finally dump the removed entries to a file
diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java
index 4719bae36..f0792534f 100644
--- a/source/de/anomic/kelondro/kelondroRowCollection.java
+++ b/source/de/anomic/kelondro/kelondroRowCollection.java
@@ -44,7 +44,7 @@ import de.anomic.server.serverProcessor;
 import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacySeedDB;
 
-public class kelondroRowCollection {
+public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
 
     public  static final double growfactor = 1.4;
     private static final int isortlimit = 20;
@@ -284,6 +284,7 @@ public class kelondroRowCollection {
     }
     
     public synchronized void add(final byte[] a) {
+        assert a.length == this.rowdef.objectsize;
         addUnique(a, 0, a.length);
     }
     
@@ -293,6 +294,7 @@ public class kelondroRowCollection {
         assert (!(serverLog.allZero(a, astart, alength))) : "a = " + serverLog.arrayList(a, astart, alength);
         assert (alength > 0);
         assert (astart + alength <= a.length);
+        assert alength == rowdef.objectsize : "alength =" + alength + ", rowdef.objectsize = " + rowdef.objectsize;
         final int l = Math.min(rowdef.objectsize, Math.min(alength, a.length - astart));
         ensureSize(chunkcount + 1);
         System.arraycopy(a, astart, chunkcache, rowdef.objectsize * chunkcount, l);
@@ -310,6 +312,21 @@ public class kelondroRowCollection {
         this.lastTimeWrote = System.currentTimeMillis();
     }
     
+    private final void addSorted(final byte[] a, final int astart, final int alength) { // appends one row copied from a, starting at astart
+        assert (a != null);
+        assert (astart >= 0) && (astart < a.length) : " astart = " + astart;
+        assert (!(serverLog.allZero(a, astart, alength))) : "a = " + serverLog.arrayList(a, astart, alength);
+        assert (alength > 0);
+        assert (astart + alength <= a.length);
+        assert alength == rowdef.objectsize : "alength =" + alength + ", rowdef.objectsize = " + rowdef.objectsize;
+        final int l = Math.min(rowdef.objectsize, Math.min(alength, a.length - astart)); // never copy more than one row or past the end of a
+        ensureSize(chunkcount + 1);
+        System.arraycopy(a, astart, chunkcache, rowdef.objectsize * chunkcount, l); // write into the slot behind the last row
+        this.chunkcount++;
+        this.sortBound = this.chunkcount; // sort bound now covers every row; presumably the caller must append in sort order - TODO confirm
+        this.lastTimeWrote = System.currentTimeMillis();
+    }
+    
     public synchronized final void addAllUnique(final kelondroRowCollection c) {
         if (c == null) return;
         assert(rowdef.objectsize == c.rowdef.objectsize);
@@ -379,7 +396,11 @@ public class kelondroRowCollection {
     }
     
     public int size() {
-        return chunkcount;
+        return this.chunkcount;
+    }
+    
+    public int sorted() {
+        return this.sortBound;
     }
     
     public synchronized Iterator<byte[]> keys() {
@@ -413,9 +434,12 @@ public class kelondroRowCollection {
             p--;
             removeRow(p, false);
         }
-    }
-    
-    public synchronized Iterator<kelondroRow.Entry> rows() {
+    }    
+
+    /**
+     * return an iterator for the row entries in this object
+     */
+    public Iterator<kelondroRow.Entry> iterator() {
         // iterates kelondroRow.Entry - type entries
         return new rowIterator();
     }
@@ -446,12 +470,13 @@ public class kelondroRowCollection {
             p--;
             removeRow(p, false);
         }
+
     }
     
     public synchronized void select(final Set<String> keys) {
         // removes all entries but the ones given by urlselection
         if ((keys == null) || (keys.isEmpty())) return;
-        final Iterator<kelondroRow.Entry> i = rows();
+        final Iterator<kelondroRow.Entry> i = iterator();
         kelondroRow.Entry row;
         while (i.hasNext()) {
             row = i.next();
@@ -813,9 +838,59 @@ public class kelondroRowCollection {
         return true;
     }
     
+    /**
+     * Merge this row collection with another row collection into a new collection with copied content.
+     * No rows are added to or removed from this collection or c, but sort() is called on both before the merge.
+     * A primary key found in both inputs is copied once, from this collection; this method itself does not drop doubles inside a single input (NOTE(review): confirm callers do not rely on that).
+     * @param c the collection to merge with; must not be null and must share the rowdef instance of this collection (asserted)
+     * @return a new collection with the merged rows, appended via addSorted in the order given by rowdef.objectOrder
+     */
+    public kelondroRowCollection merge(kelondroRowCollection c) {
+        assert this.rowdef == c.rowdef; // identity check: both collections must share one rowdef instance
+        kelondroRowCollection r = new kelondroRowCollection(this.rowdef, this.size() + c.size());
+        this.sort();
+        c.sort();
+        int ti = 0, ci = 0; // row cursors into this collection and into c
+        int tp, cp; // byte offsets of the current rows inside the respective chunkcache
+        int o;
+        final int pkl = this.rowdef.primaryKeyLength;
+        while (ti < this.size() && ci < c.size()) {
+            tp = ti * this.rowdef.objectsize;
+            cp = ci * this.rowdef.objectsize;
+            o = this.rowdef.objectOrder.compare(this.chunkcache, tp, pkl, c.chunkcache, cp, pkl);
+            if (o == 0) { // same primary key in both: copy the row of this collection, skip the one of c
+                r.addSorted(this.chunkcache, tp, this.rowdef.objectsize);
+                ti++;
+                ci++;
+                continue;
+            }
+            if (o < 0) {
+                r.addSorted(this.chunkcache, tp, this.rowdef.objectsize);
+                ti++;
+                continue;
+            }
+            if (o > 0) {
+                r.addSorted(c.chunkcache, cp, this.rowdef.objectsize);
+                ci++;
+                continue;
+            }
+        }
+        while (ti < this.size()) { // c is exhausted: copy the remaining rows of this collection
+            tp = ti * this.rowdef.objectsize;
+            r.addSorted(this.chunkcache, tp, this.rowdef.objectsize);
+            ti++;
+        }
+        while (ci < c.size()) { // this collection is exhausted: copy the remaining rows of c
+            cp = ci * this.rowdef.objectsize;
+            r.addSorted(c.chunkcache, cp, this.rowdef.objectsize);
+            ci++;
+        }
+        return r;
+    }
+    
     public synchronized String toString() {
         final StringBuilder s = new StringBuilder();
-        final Iterator<kelondroRow.Entry> i = rows();
+        final Iterator<kelondroRow.Entry> i = iterator();
         if (i.hasNext()) s.append(i.next().toString());
         while (i.hasNext()) s.append(", " + (i.next()).toString());
         return new String(s);
@@ -919,7 +994,7 @@ public class kelondroRowCollection {
     	a.add("CCCCCCCCCCCC".getBytes());
     	final ArrayList<kelondroRowCollection> del = a.removeDoubles();
     	System.out.println(del + "rows double");
-    	final Iterator<kelondroRow.Entry> j = a.rows();
+    	final Iterator<kelondroRow.Entry> j = a.iterator();
     	while (j.hasNext()) System.out.println(new String(j.next().bytes()));
     	
         System.out.println("kelondroRowCollection test with size = " + testsize);
diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java
index e839e0256..54fc543c8 100644
--- a/source/de/anomic/kelondro/kelondroRowSet.java
+++ b/source/de/anomic/kelondro/kelondroRowSet.java
@@ -33,7 +33,7 @@ import java.util.Random;
 
 import de.anomic.server.logging.serverLog;
 
-public class kelondroRowSet extends kelondroRowCollection implements kelondroIndex {
+public class kelondroRowSet extends kelondroRowCollection implements kelondroIndex, Iterable<kelondroRow.Entry> {
 
     private static final int collectionReSortLimit = 400;
     
@@ -337,10 +337,10 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
         }
     }
     
-    public synchronized Iterator<kelondroRow.Entry> rows() {
+    public synchronized Iterator<kelondroRow.Entry> iterator() {
         // iterates kelondroRow.Entry - type entries
         sort();
-        return super.rows();
+        return super.iterator();
     }
     
     public synchronized kelondroCloneableIterator<kelondroRow.Entry> rows(final boolean up, final byte[] firstKey) {
@@ -420,7 +420,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
         for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
         d.sort();
         d.remove("fuenf".getBytes(), 0, 5);
-        final Iterator<kelondroRow.Entry> ii = d.rows();
+        final Iterator<kelondroRow.Entry> ii = d.iterator();
         String s;
         System.out.print("INPUT-ITERATOR: ");
         kelondroRow.Entry entry;
diff --git a/source/de/anomic/plasma/plasmaRankingCRProcess.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java
index b847c0b80..c0e4fc010 100644
--- a/source/de/anomic/plasma/plasmaRankingCRProcess.java
+++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java
@@ -391,11 +391,7 @@ public class plasmaRankingCRProcess {
             cr_entry = (kelondroRowSet) keycollection[1];
             
             // loop over all anchors
-            final Iterator<kelondroRow.Entry> j = cr_entry.rows();
-            kelondroRow.Entry entry;
-            while (j.hasNext()) {
-                // get domain of anchors
-                entry = j.next();
+            for (kelondroRow.Entry entry: cr_entry) {
                 anchor = entry.getColString(0, null);
                 if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
 
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 023b28076..624bd6be3 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -472,7 +472,7 @@ public final class plasmaWordIndex implements indexRI {
             // To ensure termination an additional counter is used
             int l = 0;
             while ((l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
-                flushCache(theCache, Math.min(10, theCache.size()));
+                flushCache(theCache, Math.min(20, theCache.size()));
             }
             // next flush more entries if the size exceeds the maximum size of the cache
             if ((theCache.size() > theCache.getMaxWordCount()) ||
diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java
index 16b24b831..3960850ad 100644
--- a/source/de/anomic/server/serverFileUtils.java
+++ b/source/de/anomic/server/serverFileUtils.java
@@ -463,7 +463,7 @@ public final class serverFileUtils {
             os = zos;
         }
         if(os != null) {
-            final Iterator<kelondroRow.Entry> i = set.rows();
+            final Iterator<kelondroRow.Entry> i = set.iterator();
             String key;
             if (i.hasNext()) {
                 key = new String(i.next().getColBytes(0));