refactoring (implemented Iterable in kelondroRowCollection)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5432 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent bb5c2cd12e
commit be4c458951

@ -175,7 +175,7 @@ public class indexContainer extends kelondroRowSet {
Iterator<kelondroRow.Entry> rowEntryIterator;
/**
 * Creates the entry iterator and initializes rowEntryIterator
 * from the unqualified iterator() call.
 */
public entryIterator() {
    // Only one assignment is needed: a preceding rows() result would be
    // overwritten immediately and never read.
    rowEntryIterator = iterator();
}
public boolean hasNext() {

@ -117,6 +117,7 @@ public final class indexContainerHeap {
for (final indexContainer container : new blobFileEntries(blobFile, this.payloadrow)) {
// TODO: this loop may allocate a lot of memory. A check for low memory is necessary, but what should be done when memory is low?
if (container == null) break;
//System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted());
cache.put(container.getWordHash(), container);
urlCount += container.size();
}

@ -193,6 +193,7 @@ public final class indexRAMRI implements indexRI, indexRIReader {
// - the entry with maximum count
if (heap.size() == 0) return null;
try {
//return hashScore.getMaxObject();
String hash = null;
final int count = hashScore.getMaxScore();
if ((count >= cacheReferenceCountLimit) &&
@ -220,6 +221,7 @@ public final class indexRAMRI implements indexRI, indexRIReader {
if (ic != null) hash = ic.getWordHash();
}
return hash;
} catch (final Exception e) {
log.logSevere("flushFromMem: " + e.getMessage(), e);
}

@ -26,7 +26,6 @@ package de.anomic.kelondro;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
@ -88,15 +87,13 @@ public class kelondroBytesIntMap {
public synchronized ArrayList<Integer[]> removeDoubles() throws IOException {
final ArrayList<Integer[]> report = new ArrayList<Integer[]>();
Integer[] is;
Iterator<kelondroRow.Entry> ei;
int c, i;
final int initialSize = this.size();
for (final kelondroRowCollection delset: index.removeDoubles()) {
is = new Integer[delset.size()];
ei = delset.rows();
c = 0;
while (ei.hasNext()) {
i = (int) ei.next().getColLong(1);
for (kelondroRow.Entry e : delset) {
i = (int) e.getColLong(1);
assert i < initialSize : "i = " + i + ", initialSize = " + initialSize;
is[c++] = Integer.valueOf(i);
}

@ -143,14 +143,12 @@ public class kelondroBytesLongMap {
final ArrayList<kelondroRowCollection> indexreport = index.removeDoubles();
final ArrayList<Long[]> report = new ArrayList<Long[]>();
Long[] is;
Iterator<kelondroRow.Entry> ei;
int c;
for (final kelondroRowCollection rowset: indexreport) {
is = new Long[rowset.size()];
ei = rowset.rows();
c = 0;
while (ei.hasNext()) {
is[c++] = Long.valueOf(ei.next().getColLong(1));
for (kelondroRow.Entry e: rowset) {
is[c++] = Long.valueOf(e.getColLong(1));
}
report.add(is);
}

@ -238,17 +238,14 @@ public class kelondroCollectionIndex {
}
}
// care for double entries
Iterator<kelondroRow.Entry> rowiter;
int partition, maxpartition;
kelondroRow.Entry entry, maxentry;
kelondroRow.Entry maxentry;
int doublecount = 0;
for (final kelondroRowCollection doubleset: index.removeDoubles()) {
// for each entry in doubleset choose one which we want to keep
rowiter = doubleset.rows();
maxentry = null;
maxpartition = -1;
while (rowiter.hasNext()) {
entry = rowiter.next();
for (kelondroRow.Entry entry: doubleset) {
partition = (int) entry.getColLong(idx_col_clusteridx);
if (partition > maxpartition) {
maxpartition = partition;
@ -506,7 +503,7 @@ public class kelondroCollectionIndex {
} else {
// merge with the old collection
// attention! this modifies the indexrow entry which must be written with index.put(indexrow) afterwards!
final kelondroRowCollection collection = container;
kelondroRowCollection collection = container;
// read old information
final int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration
@ -518,13 +515,15 @@ public class kelondroCollectionIndex {
// load the old collection and join it
try {
collection.addAllUnique(getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false));
kelondroRowCollection krc = getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false);
//System.out.println("***DEBUG kelondroCollectionIndex.merge before merge*** krc.size = " + krc.size() + ", krc.sortbound = " + krc.sortBound + ", collection.size = " + collection.size() + ", collection.sortbound = " + collection.sortBound);
collection = collection.merge(krc);
//System.out.println("***DEBUG kelondroCollectionIndex.merge after merge*** collection.size = " + collection.size() + ", collection.sortbound = " + collection.sortBound);
} catch (kelondroException e) {
// an error like "array does not contain expected row" may appear here. Just go on like if the collection does not exist
e.printStackTrace();
}
collection.sort();
collection.uniq(); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries
collection.trim(false);
// check for size of collection:
@ -564,7 +563,6 @@ public class kelondroCollectionIndex {
}
private void shrinkCollection(final byte[] key, final kelondroRowCollection collection, final int targetSize) {
//TODO Remove timing before release
// removes entries from collection
// the removed entries are stored in a 'commons' dump file
@ -573,52 +571,37 @@ public class kelondroCollectionIndex {
final int oldsize = collection.size();
if (oldsize <= targetSize) return;
final kelondroRowSet newcommon = new kelondroRowSet(collection.rowdef, 0);
long sadd1 = 0, srem1 = 0, sadd2 = 0, srem2 = 0, tot1 = 0, tot2 = 0;
long t1 = 0, t2 = 0;
// delete some entries, which are bad rated
Iterator<kelondroRow.Entry> i = collection.rows();
Iterator<kelondroRow.Entry> i = collection.iterator();
kelondroRow.Entry entry;
byte[] ref;
t1 = System.currentTimeMillis();
while (i.hasNext()) {
entry = i.next();
ref = entry.getColBytes(0);
if ((ref.length != 12) || (!yacyURL.probablyRootURL(new String(ref)))) {
t2 = System.currentTimeMillis();
newcommon.addUnique(entry);
sadd1 += System.currentTimeMillis() - t2;
t2 = System.currentTimeMillis();
i.remove();
srem1 += System.currentTimeMillis() - t2;
}
}
final int firstnewcommon = newcommon.size();
tot1 = System.currentTimeMillis() - t1;
// check if we shrinked enough
final Random rand = new Random(System.currentTimeMillis());
t1 = System.currentTimeMillis();
while (collection.size() > targetSize) {
// now delete randomly more entries from the survival collection
i = collection.rows();
i = collection.iterator();
while (i.hasNext()) {
entry = i.next();
ref = entry.getColBytes(0);
if (rand.nextInt() % 4 != 0) {
t2 = System.currentTimeMillis();
newcommon.addUnique(entry);
sadd2 += System.currentTimeMillis() - t2;
t2 = System.currentTimeMillis();
i.remove();
srem2 += System.currentTimeMillis() - t2;
}
}
}
tot2 = System.currentTimeMillis() - t1;
collection.trim(false);
serverLog.logFine("kelondroCollectionIndex", "tot= "+tot1+'/'+tot2+" # add/rem(1)= "+sadd1+'/'+srem1+" # add/rem(2)= "+sadd2+'/'+srem2);
serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon);
// finally dump the removed entries to a file

@ -44,7 +44,7 @@ import de.anomic.server.serverProcessor;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public class kelondroRowCollection {
public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
public static final double growfactor = 1.4;
private static final int isortlimit = 20;
@ -284,6 +284,7 @@ public class kelondroRowCollection {
}
/**
 * Appends one row, given as a raw byte array, by delegating to
 * addUnique over the full length of the array.
 * With assertions enabled, the array length must equal rowdef.objectsize.
 *
 * @param a the raw bytes of the row to append
 */
public synchronized void add(final byte[] a) {
    assert a.length == this.rowdef.objectsize;
    final int rowLength = a.length;
    addUnique(a, 0, rowLength);
}
@ -293,6 +294,7 @@ public class kelondroRowCollection {
assert (!(serverLog.allZero(a, astart, alength))) : "a = " + serverLog.arrayList(a, astart, alength);
assert (alength > 0);
assert (astart + alength <= a.length);
assert alength == rowdef.objectsize : "alength =" + alength + ", rowdef.objectsize = " + rowdef.objectsize;
final int l = Math.min(rowdef.objectsize, Math.min(alength, a.length - astart));
ensureSize(chunkcount + 1);
System.arraycopy(a, astart, chunkcache, rowdef.objectsize * chunkcount, l);
@ -310,6 +312,21 @@ public class kelondroRowCollection {
this.lastTimeWrote = System.currentTimeMillis();
}
/**
 * Appends one row to the end of the chunk cache and then sets sortBound
 * to the new chunkcount, i.e. the whole collection is flagged as sorted.
 * The method itself does not compare the new row against existing rows;
 * presumably the caller has to append rows in key order (verify against callers).
 *
 * @param a       source array containing the row bytes
 * @param astart  offset of the row inside a
 * @param alength number of bytes of the row; asserted to equal rowdef.objectsize
 */
private final void addSorted(final byte[] a, final int astart, final int alength) {
    // argument sanity checks, evaluated only when assertions are enabled
    assert (a != null);
    assert (astart >= 0) && (astart < a.length) : " astart = " + astart;
    assert (!(serverLog.allZero(a, astart, alength))) : "a = " + serverLog.arrayList(a, astart, alength);
    assert (alength > 0);
    assert (astart + alength <= a.length);
    assert alength == rowdef.objectsize : "alength =" + alength + ", rowdef.objectsize = " + rowdef.objectsize;

    // never copy more than one row, nor more than the source array can supply
    final int rowSize = this.rowdef.objectsize;
    final int available = a.length - astart;
    final int copyLength = Math.min(rowSize, Math.min(alength, available));

    // make room for one more row, then copy it behind the last existing row
    ensureSize(this.chunkcount + 1);
    final int targetOffset = rowSize * this.chunkcount;
    System.arraycopy(a, astart, this.chunkcache, targetOffset, copyLength);

    this.chunkcount += 1;
    this.sortBound = this.chunkcount;
    this.lastTimeWrote = System.currentTimeMillis();
}
public synchronized final void addAllUnique(final kelondroRowCollection c) {
if (c == null) return;
assert(rowdef.objectsize == c.rowdef.objectsize);
@ -379,7 +396,11 @@ public class kelondroRowCollection {
}
/**
 * Returns the current value of chunkcount.
 *
 * @return the number of rows counted in this collection
 */
public int size() {
    // a single return statement; a second one would be unreachable
    // and is rejected by the Java compiler
    return this.chunkcount;
}
/**
 * Returns the current value of sortBound.
 *
 * @return the sort bound of this collection
 */
public int sorted() {
    final int bound = this.sortBound;
    return bound;
}
public synchronized Iterator<byte[]> keys() {
@ -413,9 +434,12 @@ public class kelondroRowCollection {
p--;
removeRow(p, false);
}
}
public synchronized Iterator<kelondroRow.Entry> rows() {
}
/**
 * Creates a new iterator over the kelondroRow.Entry objects of this collection.
 *
 * @return a fresh rowIterator instance
 */
public Iterator<kelondroRow.Entry> iterator() {
    final Iterator<kelondroRow.Entry> rowEntries = new rowIterator();
    return rowEntries;
}
@ -446,12 +470,13 @@ public class kelondroRowCollection {
p--;
removeRow(p, false);
}
}
public synchronized void select(final Set<String> keys) {
// removes all entries but the ones given by urlselection
if ((keys == null) || (keys.isEmpty())) return;
final Iterator<kelondroRow.Entry> i = rows();
final Iterator<kelondroRow.Entry> i = iterator();
kelondroRow.Entry row;
while (i.hasNext()) {
row = i.next();
@ -813,9 +838,59 @@ public class kelondroRowCollection {
return true;
}
/**
 * Merges this row collection with another one into a newly created collection.
 * <p>
 * sort() is called on this collection and on c before merging, so both inputs
 * may be reordered. Their rows are then copied into the result in the order
 * given by rowdef.objectOrder applied to the primary key bytes. When both
 * collections hold a row with an equal primary key, only the row from this
 * collection is copied. This method itself does not collapse rows with equal
 * keys that occur inside a single input collection.
 *
 * @param c the collection to merge with; asserted to share the same rowdef instance
 * @return a new collection containing the merged rows
 */
public kelondroRowCollection merge(kelondroRowCollection c) {
    assert this.rowdef == c.rowdef;
    kelondroRowCollection merged = new kelondroRowCollection(this.rowdef, this.size() + c.size());
    this.sort();
    c.sort();
    final int keyLength = this.rowdef.primaryKeyLength;
    int ownIndex = 0;
    int otherIndex = 0;

    // walk both sorted inputs in parallel, always taking the smaller key
    while (ownIndex < this.size() && otherIndex < c.size()) {
        final int ownPos = ownIndex * this.rowdef.objectsize;
        final int otherPos = otherIndex * this.rowdef.objectsize;
        final int order = this.rowdef.objectOrder.compare(this.chunkcache, ownPos, keyLength, c.chunkcache, otherPos, keyLength);
        if (order > 0) {
            // the row of c comes first
            merged.addSorted(c.chunkcache, otherPos, this.rowdef.objectsize);
            otherIndex++;
        } else {
            // own row comes first, or both keys are equal: keep the own row
            merged.addSorted(this.chunkcache, ownPos, this.rowdef.objectsize);
            ownIndex++;
            // on equal keys the row of c is skipped
            if (order == 0) otherIndex++;
        }
    }

    // at most one of the inputs still has rows left; append them unchanged
    for (; ownIndex < this.size(); ownIndex++) {
        merged.addSorted(this.chunkcache, ownIndex * this.rowdef.objectsize, this.rowdef.objectsize);
    }
    for (; otherIndex < c.size(); otherIndex++) {
        merged.addSorted(c.chunkcache, otherIndex * this.rowdef.objectsize, this.rowdef.objectsize);
    }
    return merged;
}
public synchronized String toString() {
final StringBuilder s = new StringBuilder();
final Iterator<kelondroRow.Entry> i = rows();
final Iterator<kelondroRow.Entry> i = iterator();
if (i.hasNext()) s.append(i.next().toString());
while (i.hasNext()) s.append(", " + (i.next()).toString());
return new String(s);
@ -919,7 +994,7 @@ public class kelondroRowCollection {
a.add("CCCCCCCCCCCC".getBytes());
final ArrayList<kelondroRowCollection> del = a.removeDoubles();
System.out.println(del + "rows double");
final Iterator<kelondroRow.Entry> j = a.rows();
final Iterator<kelondroRow.Entry> j = a.iterator();
while (j.hasNext()) System.out.println(new String(j.next().bytes()));
System.out.println("kelondroRowCollection test with size = " + testsize);

@ -33,7 +33,7 @@ import java.util.Random;
import de.anomic.server.logging.serverLog;
public class kelondroRowSet extends kelondroRowCollection implements kelondroIndex {
public class kelondroRowSet extends kelondroRowCollection implements kelondroIndex, Iterable<kelondroRow.Entry> {
private static final int collectionReSortLimit = 400;
@ -337,10 +337,10 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
}
}
public synchronized Iterator<kelondroRow.Entry> rows() {
public synchronized Iterator<kelondroRow.Entry> iterator() {
// iterates kelondroRow.Entry - type entries
sort();
return super.rows();
return super.iterator();
}
public synchronized kelondroCloneableIterator<kelondroRow.Entry> rows(final boolean up, final byte[] firstKey) {
@ -420,7 +420,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
d.sort();
d.remove("fuenf".getBytes(), 0, 5);
final Iterator<kelondroRow.Entry> ii = d.rows();
final Iterator<kelondroRow.Entry> ii = d.iterator();
String s;
System.out.print("INPUT-ITERATOR: ");
kelondroRow.Entry entry;

@ -391,11 +391,7 @@ public class plasmaRankingCRProcess {
cr_entry = (kelondroRowSet) keycollection[1];
// loop over all anchors
final Iterator<kelondroRow.Entry> j = cr_entry.rows();
kelondroRow.Entry entry;
while (j.hasNext()) {
// get domain of anchors
entry = j.next();
for (kelondroRow.Entry entry: cr_entry) {
anchor = entry.getColString(0, null);
if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);

@ -472,7 +472,7 @@ public final class plasmaWordIndex implements indexRI {
// To ensure termination an additional counter is used
int l = 0;
while ((l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
flushCache(theCache, Math.min(10, theCache.size()));
flushCache(theCache, Math.min(20, theCache.size()));
}
// next flush more entries if the size exceeds the maximum size of the cache
if ((theCache.size() > theCache.getMaxWordCount()) ||

@ -463,7 +463,7 @@ public final class serverFileUtils {
os = zos;
}
if(os != null) {
final Iterator<kelondroRow.Entry> i = set.rows();
final Iterator<kelondroRow.Entry> i = set.iterator();
String key;
if (i.hasNext()) {
key = new String(i.next().getColBytes(0));

Loading…
Cancel
Save