From be4c4589514c8354f98f75ace4c385e849037b10 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 2 Jan 2009 11:38:20 +0000 Subject: [PATCH] refactoring (implemented Iterable in kelondroRowCollection) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5432 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/index/indexContainer.java | 2 +- .../de/anomic/index/indexContainerHeap.java | 1 + source/de/anomic/index/indexRAMRI.java | 2 + .../anomic/kelondro/kelondroBytesIntMap.java | 7 +- .../anomic/kelondro/kelondroBytesLongMap.java | 6 +- .../kelondro/kelondroCollectionIndex.java | 37 ++------ .../kelondro/kelondroRowCollection.java | 91 +++++++++++++++++-- source/de/anomic/kelondro/kelondroRowSet.java | 8 +- .../anomic/plasma/plasmaRankingCRProcess.java | 6 +- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- source/de/anomic/server/serverFileUtils.java | 2 +- 11 files changed, 108 insertions(+), 56 deletions(-) diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index e642e8086..b7af32abf 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -175,7 +175,7 @@ public class indexContainer extends kelondroRowSet { Iterator rowEntryIterator; public entryIterator() { - rowEntryIterator = rows(); + rowEntryIterator = iterator(); } public boolean hasNext() { diff --git a/source/de/anomic/index/indexContainerHeap.java b/source/de/anomic/index/indexContainerHeap.java index 76ba034f0..22bdb9bfc 100755 --- a/source/de/anomic/index/indexContainerHeap.java +++ b/source/de/anomic/index/indexContainerHeap.java @@ -117,6 +117,7 @@ public final class indexContainerHeap { for (final indexContainer container : new blobFileEntries(blobFile, this.payloadrow)) { // TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low? if (container == null) break; + //System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted()); cache.put(container.getWordHash(), container); urlCount += container.size(); } diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index ae86f4927..8d22e4d0e 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -193,6 +193,7 @@ public final class indexRAMRI implements indexRI, indexRIReader { // - the entry with maximum count if (heap.size() == 0) return null; try { + //return hashScore.getMaxObject(); String hash = null; final int count = hashScore.getMaxScore(); if ((count >= cacheReferenceCountLimit) && @@ -220,6 +221,7 @@ public final class indexRAMRI implements indexRI, indexRIReader { if (ic != null) hash = ic.getWordHash(); } return hash; + } catch (final Exception e) { log.logSevere("flushFromMem: " + e.getMessage(), e); } diff --git a/source/de/anomic/kelondro/kelondroBytesIntMap.java b/source/de/anomic/kelondro/kelondroBytesIntMap.java index d5a10ccfd..e1b15004a 100644 --- a/source/de/anomic/kelondro/kelondroBytesIntMap.java +++ b/source/de/anomic/kelondro/kelondroBytesIntMap.java @@ -26,7 +26,6 @@ package de.anomic.kelondro; import java.io.IOException; import java.util.ArrayList; -import java.util.Iterator; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; @@ -88,15 +87,13 @@ public class kelondroBytesIntMap { public synchronized ArrayList removeDoubles() throws IOException { final ArrayList report = new ArrayList(); Integer[] is; - Iterator ei; int c, i; final int initialSize = this.size(); for (final kelondroRowCollection delset: index.removeDoubles()) { is = new Integer[delset.size()]; - ei = delset.rows(); c = 0; - while (ei.hasNext()) { - i = (int) ei.next().getColLong(1); + for (kelondroRow.Entry e : delset) { + i = (int) e.getColLong(1); assert i < initialSize : "i = " + i + ", initialSize = " + initialSize; is[c++] = Integer.valueOf(i); } diff --git a/source/de/anomic/kelondro/kelondroBytesLongMap.java b/source/de/anomic/kelondro/kelondroBytesLongMap.java index 6ceb67f6a..983205d52 100644 --- a/source/de/anomic/kelondro/kelondroBytesLongMap.java +++ b/source/de/anomic/kelondro/kelondroBytesLongMap.java @@ -143,14 +143,12 @@ public class kelondroBytesLongMap { final ArrayList indexreport = index.removeDoubles(); final ArrayList report = new ArrayList(); Long[] is; - Iterator ei; int c; for (final kelondroRowCollection rowset: indexreport) { is = new Long[rowset.size()]; - ei = rowset.rows(); c = 0; - while (ei.hasNext()) { - is[c++] = Long.valueOf(ei.next().getColLong(1)); + for (kelondroRow.Entry e: rowset) { + is[c++] = Long.valueOf(e.getColLong(1)); } report.add(is); } diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index 74a037d0f..818e472f5 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -238,17 +238,14 @@ public class kelondroCollectionIndex { } } // care for double entries - Iterator rowiter; int partition, maxpartition; - kelondroRow.Entry entry, maxentry; + kelondroRow.Entry maxentry; int doublecount = 0; for (final kelondroRowCollection doubleset: index.removeDoubles()) { // for each entry in doubleset choose one which we want to keep - rowiter = doubleset.rows(); maxentry = null; maxpartition = -1; - while (rowiter.hasNext()) { - entry = rowiter.next(); + for (kelondroRow.Entry entry: doubleset) { partition = (int) entry.getColLong(idx_col_clusteridx); if (partition > maxpartition) { maxpartition = partition; @@ -506,7 +503,7 @@ public class kelondroCollectionIndex { } else { // merge with the old collection // attention! this modifies the indexrow entry which must be written with index.put(indexrow) afterwards! - final kelondroRowCollection collection = container; + kelondroRowCollection collection = container; // read old information final int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration @@ -518,13 +515,15 @@ public class kelondroCollectionIndex { // load the old collection and join it try { - collection.addAllUnique(getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false)); + kelondroRowCollection krc = getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false); + //System.out.println("***DEBUG kelondroCollectionIndex.merge before merge*** krc.size = " + krc.size() + ", krc.sortbound = " + krc.sortBound + ", collection.size = " + collection.size() + ", collection.sortbound = " + collection.sortBound); + collection = collection.merge(krc); + //System.out.println("***DEBUG kelondroCollectionIndex.merge after merge*** collection.size = " + collection.size() + ", collection.sortbound = " + collection.sortBound); + } catch (kelondroException e) { // an error like "array does not contain expected row" may appear here. Just go on like if the collection does not exist e.printStackTrace(); } - collection.sort(); - collection.uniq(); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries collection.trim(false); // check for size of collection: @@ -564,7 +563,6 @@ public class kelondroCollectionIndex { } private void shrinkCollection(final byte[] key, final kelondroRowCollection collection, final int targetSize) { - //TODO Remove timing before release // removes entries from collection // the removed entries are stored in a 'commons' dump file @@ -573,52 +571,37 @@ public class kelondroCollectionIndex { final int oldsize = collection.size(); if (oldsize <= targetSize) return; final kelondroRowSet newcommon = new kelondroRowSet(collection.rowdef, 0); - long sadd1 = 0, srem1 = 0, sadd2 = 0, srem2 = 0, tot1 = 0, tot2 = 0; - long t1 = 0, t2 = 0; // delete some entries, which are bad rated - Iterator i = collection.rows(); + Iterator i = collection.iterator(); kelondroRow.Entry entry; byte[] ref; - t1 = System.currentTimeMillis(); while (i.hasNext()) { entry = i.next(); ref = entry.getColBytes(0); if ((ref.length != 12) || (!yacyURL.probablyRootURL(new String(ref)))) { - t2 = System.currentTimeMillis(); newcommon.addUnique(entry); - sadd1 += System.currentTimeMillis() - t2; - t2 = System.currentTimeMillis(); i.remove(); - srem1 += System.currentTimeMillis() - t2; } } final int firstnewcommon = newcommon.size(); - tot1 = System.currentTimeMillis() - t1; // check if we shrinked enough final Random rand = new Random(System.currentTimeMillis()); - t1 = System.currentTimeMillis(); while (collection.size() > targetSize) { // now delete randomly more entries from the survival collection - i = collection.rows(); + i = collection.iterator(); while (i.hasNext()) { entry = i.next(); ref = entry.getColBytes(0); if (rand.nextInt() % 4 != 0) { - t2 = System.currentTimeMillis(); newcommon.addUnique(entry); - sadd2 += System.currentTimeMillis() - t2; - t2 = System.currentTimeMillis(); i.remove(); - srem2 += System.currentTimeMillis() - t2; } } } - tot2 = System.currentTimeMillis() - t1; collection.trim(false); - serverLog.logFine("kelondroCollectionIndex", "tot= "+tot1+'/'+tot2+" # add/rem(1)= "+sadd1+'/'+srem1+" # add/rem(2)= "+sadd2+'/'+srem2); serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon); // finally dump the removed entries to a file diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index 4719bae36..f0792534f 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -44,7 +44,7 @@ import de.anomic.server.serverProcessor; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; -public class kelondroRowCollection { +public class kelondroRowCollection implements Iterable { public static final double growfactor = 1.4; private static final int isortlimit = 20; @@ -284,6 +284,7 @@ public class kelondroRowCollection { } public synchronized void add(final byte[] a) { + assert a.length == this.rowdef.objectsize; addUnique(a, 0, a.length); } @@ -293,6 +294,7 @@ public class kelondroRowCollection { assert (!(serverLog.allZero(a, astart, alength))) : "a = " + serverLog.arrayList(a, astart, alength); assert (alength > 0); assert (astart + alength <= a.length); + assert alength == rowdef.objectsize : "alength =" + alength + ", rowdef.objectsize = " + rowdef.objectsize; final int l = Math.min(rowdef.objectsize, Math.min(alength, a.length - astart)); ensureSize(chunkcount + 1); System.arraycopy(a, astart, chunkcache, rowdef.objectsize * chunkcount, l); @@ -310,6 +312,21 @@ public class kelondroRowCollection { this.lastTimeWrote = System.currentTimeMillis(); } + private final void addSorted(final byte[] a, final int astart, final int alength) { + assert (a != null); + assert (astart >= 0) && (astart < a.length) : " astart = " + astart; + assert (!(serverLog.allZero(a, astart, alength))) : "a = " + serverLog.arrayList(a, astart, alength); + assert (alength > 0); + assert (astart + alength <= a.length); + assert alength == rowdef.objectsize : "alength =" + alength + ", rowdef.objectsize = " + rowdef.objectsize; + final int l = Math.min(rowdef.objectsize, Math.min(alength, a.length - astart)); + ensureSize(chunkcount + 1); + System.arraycopy(a, astart, chunkcache, rowdef.objectsize * chunkcount, l); + this.chunkcount++; + this.sortBound = this.chunkcount; + this.lastTimeWrote = System.currentTimeMillis(); + } + public synchronized final void addAllUnique(final kelondroRowCollection c) { if (c == null) return; assert(rowdef.objectsize == c.rowdef.objectsize); @@ -379,7 +396,11 @@ public class kelondroRowCollection { } public int size() { - return chunkcount; + return this.chunkcount; + } + + public int sorted() { + return this.sortBound; } public synchronized Iterator keys() { @@ -413,9 +434,12 @@ public class kelondroRowCollection { p--; removeRow(p, false); } - } - - public synchronized Iterator rows() { + } + + /** + * return an iterator for the row entries in this object + */ + public Iterator iterator() { // iterates kelondroRow.Entry - type entries return new rowIterator(); } @@ -446,12 +470,13 @@ public class kelondroRowCollection { p--; removeRow(p, false); } + } public synchronized void select(final Set keys) { // removes all entries but the ones given by urlselection if ((keys == null) || (keys.isEmpty())) return; - final Iterator i = rows(); + final Iterator i = iterator(); kelondroRow.Entry row; while (i.hasNext()) { row = i.next(); @@ -813,9 +838,59 @@ public class kelondroRowCollection { return true; } + /** + * merge this row collection with another row collection. + * the current collection is not altered in any way, the returned collection is a new collection with copied content. + * The resulting collection is sorted and does not contain any doubles, which are also removed during the merge + * @param c + * @return + */ + public kelondroRowCollection merge(kelondroRowCollection c) { + assert this.rowdef == c.rowdef; + kelondroRowCollection r = new kelondroRowCollection(this.rowdef, this.size() + c.size()); + this.sort(); + c.sort(); + int ti = 0, ci = 0; + int tp, cp; + int o; + final int pkl = this.rowdef.primaryKeyLength; + while (ti < this.size() && ci < c.size()) { + tp = ti * this.rowdef.objectsize; + cp = ci * this.rowdef.objectsize; + o = this.rowdef.objectOrder.compare(this.chunkcache, tp, pkl, c.chunkcache, cp, pkl); + if (o == 0) { + r.addSorted(this.chunkcache, tp, this.rowdef.objectsize); + ti++; + ci++; + continue; + } + if (o < 0) { + r.addSorted(this.chunkcache, tp, this.rowdef.objectsize); + ti++; + continue; + } + if (o > 0) { + r.addSorted(c.chunkcache, cp, this.rowdef.objectsize); + ci++; + continue; + } + } + while (ti < this.size()) { + tp = ti * this.rowdef.objectsize; + r.addSorted(this.chunkcache, tp, this.rowdef.objectsize); + ti++; + } + while (ci < c.size()) { + cp = ci * this.rowdef.objectsize; + r.addSorted(c.chunkcache, cp, this.rowdef.objectsize); + ci++; + } + return r; + } + public synchronized String toString() { final StringBuilder s = new StringBuilder(); - final Iterator i = rows(); + final Iterator i = iterator(); if (i.hasNext()) s.append(i.next().toString()); while (i.hasNext()) s.append(", " + (i.next()).toString()); return new String(s); @@ -919,7 +994,7 @@ public class kelondroRowCollection { a.add("CCCCCCCCCCCC".getBytes()); final ArrayList del = a.removeDoubles(); System.out.println(del + "rows double"); - final Iterator j = a.rows(); + final Iterator j = a.iterator(); while (j.hasNext()) System.out.println(new String(j.next().bytes())); System.out.println("kelondroRowCollection test with size = " + testsize); diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index e839e0256..54fc543c8 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -33,7 +33,7 @@ import java.util.Random; import de.anomic.server.logging.serverLog; -public class kelondroRowSet extends kelondroRowCollection implements kelondroIndex { +public class kelondroRowSet extends kelondroRowCollection implements kelondroIndex, Iterable { private static final int collectionReSortLimit = 400; @@ -337,10 +337,10 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd } } - public synchronized Iterator rows() { + public synchronized Iterator iterator() { // iterates kelondroRow.Entry - type entries sort(); - return super.rows(); + return super.iterator(); } public synchronized kelondroCloneableIterator rows(final boolean up, final byte[] firstKey) { @@ -420,7 +420,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes()); d.sort(); d.remove("fuenf".getBytes(), 0, 5); - final Iterator ii = d.rows(); + final Iterator ii = d.iterator(); String s; System.out.print("INPUT-ITERATOR: "); kelondroRow.Entry entry; diff --git a/source/de/anomic/plasma/plasmaRankingCRProcess.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java index b847c0b80..c0e4fc010 100644 --- a/source/de/anomic/plasma/plasmaRankingCRProcess.java +++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java @@ -391,11 +391,7 @@ public class plasmaRankingCRProcess { cr_entry = (kelondroRowSet) keycollection[1]; // loop over all anchors - final Iterator j = cr_entry.rows(); - kelondroRow.Entry entry; - while (j.hasNext()) { - // get domain of anchors - entry = j.next(); + for (kelondroRow.Entry entry: cr_entry) { anchor = entry.getColString(0, null); if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 023b28076..624bd6be3 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -472,7 +472,7 @@ public final class plasmaWordIndex implements indexRI { // To ensure termination an additional counter is used int l = 0; while ((l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) { - flushCache(theCache, Math.min(10, theCache.size())); + flushCache(theCache, Math.min(20, theCache.size())); } // next flush more entries if the size exceeds the maximum size of the cache if ((theCache.size() > theCache.getMaxWordCount()) || diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 16b24b831..3960850ad 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -463,7 +463,7 @@ public final class serverFileUtils { os = zos; } if(os != null) { - final Iterator i = set.rows(); + final Iterator i = set.iterator(); String key; if (i.hasNext()) { key = new String(i.next().getColBytes(0));