From 2f3b51816936c712641116a2766049851b1a9bc4 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 20 Apr 2007 07:53:58 +0000 Subject: [PATCH] temporary patch for startup-problem: http://www.yacy-forum.de/viewtopic.php?t=3854 This is a serious problem caused by the database bug between 0.511 and 0.513, which produced a large number of duplicate entries in the RWI index. The uniq()-method tries to fix this, but it effectively does not terminate when the index is large and the number of duplicate occurrences is also large. This patch simply implements a time-controlled termination, which does not heal the inconsistency problem. The uniq-method itself is correct and does not need a bugfix; the non-termination is simply caused by the large amount of data that is shifted during the process. It was possible to reproduce this behaviour in a test environment. A real fix would need to: - enhance the uniq()-method by using a recursive, binary segmentation of the array to be fixed - make uniq() report the duplicate entries - delete the duplicate entries from the collection index (from the index and the collections) to heal the problem git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3583 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../kelondro/kelondroCollectionIndex.java | 6 +++--- .../de/anomic/kelondro/kelondroFlexTable.java | 2 +- .../kelondro/kelondroRowCollection.java | 10 ++++++++-- source/de/anomic/kelondro/kelondroRowSet.java | 19 ++++++++++++++++++- .../de/anomic/plasma/plasmaSearchEvent.java | 2 +- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index cc344cb34..2eaba5931 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -183,7 +183,7 @@ public class kelondroCollectionIndex { ientry.setCol(idx_col_indexpos, aentry.index()); 
ientry.setCol(idx_col_lastread, t); ientry.setCol(idx_col_lastwrote, t); - index.addUnique(ientry); + index.addUnique(ientry); // FIXME: this should avoid doubles count++; // write a log @@ -590,7 +590,7 @@ public class kelondroCollectionIndex { // join with new collection oldcollection.addAllUnique(collection); oldcollection.sort(); - oldcollection.uniq(); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries + oldcollection.uniq(-1); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries oldcollection.trim(false); // check for size of collection: @@ -704,7 +704,7 @@ public class kelondroCollectionIndex { // join with new collection oldcollection.addAllUnique(collection); oldcollection.sort(); - oldcollection.uniq(); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries + oldcollection.uniq(-1); // FIXME: not clear if it would be better to insert the collection with put to avoid double-entries oldcollection.trim(false); collection = oldcollection; diff --git a/source/de/anomic/kelondro/kelondroFlexTable.java b/source/de/anomic/kelondro/kelondroFlexTable.java index b872d9324..3f884d634 100644 --- a/source/de/anomic/kelondro/kelondroFlexTable.java +++ b/source/de/anomic/kelondro/kelondroFlexTable.java @@ -185,7 +185,7 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr System.out.flush(); ri.sort(); int sbu = ri.size(); - ri.uniq(); + ri.uniq(10000); if (ri.size() != sbu) serverLog.logSevere("kelondroFlexTable.initializeRamIndex: " + tablename, "; size before uniq = " + sbu + ", after uniq = " + ri.size()); return ri; } diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index 65677b4d8..fa29055b6 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -473,20 
+473,26 @@ public class kelondroRowCollection { if (i == p) return j; else if (j == p) return i; else return p; } - public synchronized void uniq() { + public synchronized void uniq(long maxtime) { assert (this.rowdef.objectOrder != null); // removes double-occurrences of chunks // this works only if the collection was ordered with sort before + // if the collection is large and the number of deletions is also large, + // then this method may run a long time with 100% CPU load which is caused + // by the large number of memory movements. Therefore it is possible + // to assign a runtime limitation + long start = System.currentTimeMillis(); if (chunkcount <= 1) return; int i = 0; while (i < chunkcount - 1) { //System.out.println("ENTRY0: " + serverLog.arrayList(chunkcache, rowdef.objectsize*i, rowdef.objectsize)); //System.out.println("ENTRY1: " + serverLog.arrayList(chunkcache, rowdef.objectsize*(i+1), rowdef.objectsize)); if (compare(i, i + 1) == 0) { - removeRow(i); + removeRow(i); // this decreases the chunkcount } else { i++; } + if ((maxtime > 0) && (start + maxtime < System.currentTimeMillis())) break; } } diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index 74910be7d..8ef2e2992 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -259,6 +259,23 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd } public static void main(String[] args) { + // sort/uniq-test + kelondroRow rowdef = new kelondroRow("Cardinal key-4 {b256}, byte[] payload-1", kelondroNaturalOrder.naturalOrder, 0); + kelondroRowSet rs = new kelondroRowSet(rowdef, 0); + Random random = new Random(0); + kelondroRow.Entry entry; + for (int i = 0; i < 10000000; i++) { + entry = rowdef.newEntry(); + entry.setCol(0, Math.abs(random.nextLong() % 1000000)); + entry.setCol(1, "a".getBytes()); + rs.addUnique(entry); + } + System.out.println("before sort, 
size = " + rs.size()); + rs.sort(); + System.out.println("after sort, before uniq, size = " + rs.size()); + rs.uniq(10000); + System.out.println("after uniq, size = " + rs.size()); + /* String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" }; kelondroRowSet c = new kelondroRowSet(new kelondroRow(new int[]{10, 3})); @@ -351,7 +368,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd byte[] key; int testsize = 5000; byte[][] delkeys = new byte[testsize / 5][]; - Random random = new Random(0); + random = new Random(0); for (int i = 0; i < testsize; i++) { key = randomHash(random); if (i % 5 != 0) continue; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index a0540cc45..7e7ec3402 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -391,7 +391,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { searchResult.addAllUnique(rcLocal); searchResult.addAllUnique(rcContainers); searchResult.sort(); - searchResult.uniq(); + searchResult.uniq(1000); preorderTime = preorderTime - (System.currentTimeMillis() - pst); if (preorderTime < 0) preorderTime = 200; plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime);