temporary patch for startup-problem:

http://www.yacy-forum.de/viewtopic.php?t=3854
This is a serious problem caused by a database bug present in versions 0.511 - 0.513,
which produced a large number of duplicate entries in the RWI index. The uniq() method
tries to fix this, but it effectively does not terminate when the index is large and the number
of duplicate entries is also large. This patch simply implements a time-controlled
termination, which does not heal the inconsistency problem. The uniq() method itself
is correct and does not need a bugfix; the non-termination is simply caused by the large amount
of data that is shifted during the process. It was possible to reproduce this behaviour
in a test environment.
A real fix would need to:
- enhance the uniq() method by using a recursive, binary segmentation of the array to be fixed
- make uniq() report the entries that are duplicates
- delete the duplicate entries from the collection index (from both the index and the collections) to heal the problem


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3583 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent e6fb6426a3
commit 2f3b518169

@ -183,7 +183,7 @@ public class kelondroCollectionIndex {
ientry.setCol(idx_col_indexpos, aentry.index());
ientry.setCol(idx_col_lastread, t);
ientry.setCol(idx_col_lastwrote, t);
index.addUnique(ientry);
index.addUnique(ientry); // FIXME: this should avoid doubles
count++;
// write a log
@ -590,7 +590,7 @@ public class kelondroCollectionIndex {
// join with new collection
oldcollection.addAllUnique(collection);
oldcollection.sort();
oldcollection.uniq(); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries
oldcollection.uniq(-1); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries
oldcollection.trim(false);
// check for size of collection:
@ -704,7 +704,7 @@ public class kelondroCollectionIndex {
// join with new collection
oldcollection.addAllUnique(collection);
oldcollection.sort();
oldcollection.uniq(); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries
oldcollection.uniq(-1); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries
oldcollection.trim(false);
collection = oldcollection;

@ -185,7 +185,7 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr
System.out.flush();
ri.sort();
int sbu = ri.size();
ri.uniq();
ri.uniq(10000);
if (ri.size() != sbu) serverLog.logSevere("kelondroFlexTable.initializeRamIndex: " + tablename, "; size before uniq = " + sbu + ", after uniq = " + ri.size());
return ri;
}

@ -473,20 +473,26 @@ public class kelondroRowCollection {
if (i == p) return j; else if (j == p) return i; else return p;
}
public synchronized void uniq() {
public synchronized void uniq(long maxtime) {
// an object order must be defined on the row definition before rows can be compared
assert (this.rowdef.objectOrder != null);
// Removes duplicate occurrences of chunks: each row is compared with its
// direct successor, and one row of every equal pair is removed.
// This works only if the collection was ordered with sort() beforehand.
// If the collection is large and the number of deletions is also large,
// this method may run for a long time with 100% CPU load, which is caused
// by the large number of memory movements. Therefore a runtime limit can
// be assigned:
//   maxtime - maximum run time in milliseconds; a value <= 0 disables the limit.
// When the limit is exceeded the loop is left early, so duplicates that were
// not reached yet stay in the collection.
long start = System.currentTimeMillis();
// zero or one chunk cannot contain a duplicate
if (chunkcount <= 1) return;
int i = 0;
while (i < chunkcount - 1) {
//System.out.println("ENTRY0: " + serverLog.arrayList(chunkcache, rowdef.objectsize*i, rowdef.objectsize));
//System.out.println("ENTRY1: " + serverLog.arrayList(chunkcache, rowdef.objectsize*(i+1), rowdef.objectsize));
if (compare(i, i + 1) == 0) {
removeRow(i);
removeRow(i); // this decreases the chunkcount
} else {
// rows differ: advance; after a removal i is kept so the new successor is compared next
i++;
}
// time-controlled termination: checked once per iteration, only active for maxtime > 0
if ((maxtime > 0) && (start + maxtime < System.currentTimeMillis())) break;
}
}

@ -259,6 +259,23 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
}
public static void main(String[] args) {
// sort/uniq-test
kelondroRow rowdef = new kelondroRow("Cardinal key-4 {b256}, byte[] payload-1", kelondroNaturalOrder.naturalOrder, 0);
kelondroRowSet rs = new kelondroRowSet(rowdef, 0);
Random random = new Random(0);
kelondroRow.Entry entry;
for (int i = 0; i < 10000000; i++) {
entry = rowdef.newEntry();
entry.setCol(0, Math.abs(random.nextLong() % 1000000));
entry.setCol(1, "a".getBytes());
rs.addUnique(entry);
}
System.out.println("before sort, size = " + rs.size());
rs.sort();
System.out.println("after sort, before uniq, size = " + rs.size());
rs.uniq(10000);
System.out.println("after uniq, size = " + rs.size());
/*
String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" };
kelondroRowSet c = new kelondroRowSet(new kelondroRow(new int[]{10, 3}));
@ -351,7 +368,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
byte[] key;
int testsize = 5000;
byte[][] delkeys = new byte[testsize / 5][];
Random random = new Random(0);
random = new Random(0);
for (int i = 0; i < testsize; i++) {
key = randomHash(random);
if (i % 5 != 0) continue;

@ -391,7 +391,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
searchResult.addAllUnique(rcLocal);
searchResult.addAllUnique(rcContainers);
searchResult.sort();
searchResult.uniq();
searchResult.uniq(1000);
preorderTime = preorderTime - (System.currentTimeMillis() - pst);
if (preorderTime < 0) preorderTime = 200;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime);

Loading…
Cancel
Save