Speed up remove operations in rowCollections.

- Array elements are now shifted during a remove only when the order of the row collection has to be preserved.
- This speeds up the most expensive operation, "common word shrinking", by a factor of 500-1000 (in the worst cases we shifted more than 60 GB of data during this operation).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4158 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
fuchsi 18 years ago
parent 5c91359297
commit b5f7df8d0a

@ -599,7 +599,6 @@ public class kelondroCollectionIndex {
int newPartitionNumber;
while ((newPartitionNumber = arrayIndex(oldcollection.size())) > maxPartitions) {
kelondroRowSet newcollection = shrinkCollection(key, oldcollection, arrayCapacity(maxPartitions));
saveCommons(key, oldcollection);
oldcollection = newcollection;
}
@ -714,7 +713,6 @@ public class kelondroCollectionIndex {
int newPartitionNumber;
while ((newPartitionNumber = arrayIndex(oldcollection.size())) > maxPartitions) {
kelondroRowSet newcollection = shrinkCollection(key, oldcollection, arrayCapacity(maxPartitions));
saveCommons(key, oldcollection);
oldcollection = newcollection;
}
@ -747,6 +745,7 @@ public class kelondroCollectionIndex {
}
private kelondroRowSet shrinkCollection(byte[] key, kelondroRowSet collection, int targetSize) {
//TODO Remove timing before release
// removes entries from collection
// the removed entries are stored in a 'commons' dump file
@ -754,23 +753,32 @@ public class kelondroCollectionIndex {
int oldsize = collection.size();
kelondroRowSet survival = new kelondroRowSet(collection.rowdef, 0);
if (oldsize <= targetSize) return survival;
long sadd1 = 0, srem1 = 0, sadd2 = 0, srem2 = 0, tot1 = 0, tot2 = 0;
long t1 = 0, t2 = 0;
// delete some entries, which are bad rated
Iterator i = collection.rows();
kelondroRow.Entry entry;
byte[] ref;
t1 = System.currentTimeMillis();
while (i.hasNext()) {
entry = (kelondroRow.Entry) i.next();
ref = entry.getColBytes(0);
if ((ref.length == 12) && (yacyURL.probablyRootURL(new String(ref)))) {
t2 = System.currentTimeMillis();
survival.addUnique(entry);
sadd1 += System.currentTimeMillis() - t2;
t2 = System.currentTimeMillis();
i.remove();
srem1 += System.currentTimeMillis() - t2;
}
}
int firstSurvival = survival.size();
tot1 = System.currentTimeMillis() - t1;
// check if we shrinked enough
Random rand = new Random(System.currentTimeMillis());
t1 = System.currentTimeMillis();
while (survival.size() > targetSize) {
// now delete randomly more entries from the survival collection
i = survival.rows();
@ -778,13 +786,22 @@ public class kelondroCollectionIndex {
entry = (kelondroRow.Entry) i.next();
ref = entry.getColBytes(0);
if (rand.nextInt() % 4 != 0) {
t2 = System.currentTimeMillis();
collection.addUnique(entry);
sadd2 += System.currentTimeMillis() - t2;
t2 = System.currentTimeMillis();
i.remove();
srem2 += System.currentTimeMillis() - t2;
}
}
}
tot2 = System.currentTimeMillis() - t1;
serverLog.logFine("kelondroCollectionIndex", "tot= "+tot1+'/'+tot2+" # add/rem(1)= "+sadd1+'/'+srem1+" # add/rem(2)= "+sadd2+'/'+srem2);
serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", survival size = " + survival.size() + ", first survival = " + firstSurvival);
//finally dump the removed entries to a file
saveCommons(key, collection);
return survival;
}

@ -363,26 +363,39 @@ public class kelondroRowCollection {
chunkcount += c.size();
}
protected synchronized final void removeRow(int p) {
/**
* This method removes the entry at position p, preserving the order of the remaining
* entries only if keepOrder is set.
* Note: Keeping the order is expensive. If you want to remove more than one element in
* a batch with this method, it is better to do the removes without keeping the order and to
* sort once after all the removes are done.
*
* @param p element at this position will be removed
* @param keepOrder keep the order of remaining entries
*/
protected synchronized final void removeRow(int p, boolean keepOrder) {
assert p >= 0 : "p = " + p;
assert p < chunkcount : "p = " + p + ", chunkcount = " + chunkcount;
assert chunkcount > 0 : "chunkcount = " + chunkcount;
assert sortBound <= chunkcount : "sortBound = " + sortBound + ", chunkcount = " + chunkcount;
if (p < sortBound) {
// remove by shift
System.arraycopy(
chunkcache, (p + 1) * this.rowdef.objectsize(),
if (keepOrder && (p < sortBound)) {
// remove by shift (quite expensive for big collections)
System.arraycopy(
chunkcache, (p + 1) * this.rowdef.objectsize(),
chunkcache, p * this.rowdef.objectsize(),
(chunkcount - p - 1) * this.rowdef.objectsize());
sortBound--;
} else {
// remove by copying the top-element to the remove position
if (p != chunkcount - 1) {
System.arraycopy(
chunkcache, (chunkcount - 1) * this.rowdef.objectsize(),
chunkcache, p * this.rowdef.objectsize(),
this.rowdef.objectsize());
}
// remove by copying the top-element to the remove position
if (p != chunkcount - 1) {
System.arraycopy(
chunkcache, (chunkcount - 1) * this.rowdef.objectsize(),
chunkcache, p * this.rowdef.objectsize(),
this.rowdef.objectsize());
}
// we moved the last element to the remove position: (p+1)st element
// only the first p elements keep their order
if (sortBound > p) sortBound = p;
}
chunkcount--;
this.lastTimeWrote = System.currentTimeMillis();
@ -414,6 +427,12 @@ public class kelondroRowCollection {
return new rowIterator();
}
/**
* Iterator for kelondroRowCollection.
* It supports remove(), though it does not preserve the order of the underlying
* collection during removes.
*
*/
public class rowIterator implements Iterator {
private int p;
@ -432,7 +451,7 @@ public class kelondroRowCollection {
public void remove() {
p--;
removeRow(p);
removeRow(p, false);
}
}
@ -562,7 +581,7 @@ public class kelondroRowCollection {
//System.out.println("ENTRY0: " + serverLog.arrayList(chunkcache, rowdef.objectsize*i, rowdef.objectsize));
//System.out.println("ENTRY1: " + serverLog.arrayList(chunkcache, rowdef.objectsize*(i+1), rowdef.objectsize));
if (compare(i, i + 1) == 0) {
removeRow(i); // this decreases the chunkcount
removeRow(i, true); // this decreases the chunkcount
} else {
i++;
}

@ -124,7 +124,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
if (index < 0) return null;
//System.out.println("remove: chunk found at index position (before remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length + 10) + ", searchkey=" + serverLog.arrayList(a, start, length));
kelondroRow.Entry entry = super.get(index);
super.removeRow(index);
super.removeRow(index, false);
//System.out.println("remove: chunk found at index position (after remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length) + ", searchkey=" + serverLog.arrayList(a, start, length));
int findagainindex = find(a, start, length);
//System.out.println("kelondroRowSet.remove");

Loading…
Cancel
Save