From eda055e72901df1827341f6887a4b308f5b69066 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 7 Jun 2006 15:40:29 +0000 Subject: [PATCH] - enhanced kelondroCollection speed - tested + debugged kelondroCollection - inserted the new kelondroCollection Object as indexing class for kelondroFlexTable git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2183 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../anomic/kelondro/kelondroCollection.java | 283 ++++++++++++------ .../kelondro/kelondroCollectionIndex.java | 2 +- .../de/anomic/kelondro/kelondroFlexTable.java | 42 ++- .../anomic/kelondro/kelondroNaturalOrder.java | 9 +- source/de/anomic/yacy/yacySearch.java | 1 + 5 files changed, 219 insertions(+), 118 deletions(-) diff --git a/source/de/anomic/kelondro/kelondroCollection.java b/source/de/anomic/kelondro/kelondroCollection.java index e269a051b..218617d1f 100644 --- a/source/de/anomic/kelondro/kelondroCollection.java +++ b/source/de/anomic/kelondro/kelondroCollection.java @@ -42,6 +42,7 @@ package de.anomic.kelondro; import java.util.Iterator; +import java.util.Random; public class kelondroCollection { @@ -53,14 +54,22 @@ public class kelondroCollection { private kelondroOrder order; public kelondroCollection(int objectSize) { - this(objectSize, 0, null, new byte[0]); + this(objectSize, 0); } - public kelondroCollection(int objectSize, int objectCount, kelondroOrder ordering, byte[] cache) { + public kelondroCollection(int objectSize, int objectCount) { + this.chunksize = objectSize; + this.chunkcache = new byte[objectCount * objectSize]; + this.chunkcount = 0; + this.order = null; + this.sortbound = 0; + } + + public kelondroCollection(int objectSize, int objectCount, byte[] cache) { this.chunksize = objectSize; this.chunkcache = cache; this.chunkcount = objectCount; - this.order = ordering; + this.order = null; this.sortbound = 0; } @@ -102,19 +111,26 @@ public class kelondroCollection { } public byte[] get(byte[] key) { - assert (key.length <= chunksize); + return get(key, key.length); + } + + public byte[] get(byte[] key, int length) { synchronized (chunkcache) { - int i = find(key); + int i = find(key, length); if (i >= 0) return get(i); } return null; } public void add(byte[] a) { - assert (a.length <= chunksize); + add(a, a.length); + } + + public void add(byte[] a, int length) { + int l = Math.min(this.chunksize, Math.min(length, a.length)); synchronized (chunkcache) { ensureSize(chunkcount + 1); - System.arraycopy(a, 0, chunkcache, chunksize * chunkcount, a.length); + System.arraycopy(a, 0, chunkcache, chunksize * chunkcount, l); chunkcount++; } this.lastTimeWrote = System.currentTimeMillis(); @@ -126,25 +142,27 @@ public class kelondroCollection { ensureSize(chunkcount + c.size()); } Iterator i = c.elements(); + byte[] b; while (i.hasNext()) { - add((byte[]) i.next()); + b = (byte[]) i.next(); + add(b, b.length); } } - public void remove(byte[] a) { + public void remove(byte[] a, int length) { // the byte[] a may be shorter than the chunksize if (chunkcount == 0) return; synchronized(chunkcache) { - int p = find(a); + int p = find(a, length); remove(p); } } - public void remove(byte[] a, kelondroOrder ko) { + public void remove(byte[] a, int length, kelondroOrder ko) { // the byte[] a may be shorter than the chunksize if (chunkcount == 0) return; synchronized(chunkcache) { - int p = find(a); + int p = find(a, length); remove(p); } } @@ -160,7 +178,11 @@ public class kelondroCollection { public void removeAll(kelondroCollection c) { Iterator i = c.elements(); - while (i.hasNext()) remove((byte[]) i.next()); + byte[] b; + while (i.hasNext()) { + b = (byte[]) i.next(); + remove(b, b.length); + } } public void clear() { @@ -208,41 +230,51 @@ public class kelondroCollection { return this.order; } - private int find(byte[] a) { + public void setOrdering(kelondroOrder newOrder) { + if (this.order == null) { + this.order = newOrder; + this.sortbound = 0; + } else if (!(this.order.signature().equals(newOrder.signature()))) { + this.order = newOrder; + this.sortbound = 0; + } + } + + private int find(byte[] a, int length) { // returns the chunknumber; -1 if not found - if (this.order == null) return iterativeSearch(a); + if (this.order == null) return iterativeSearch(a, length); // check if a re-sorting make sense - if (this.chunkcount - this.sortbound > 800) sort(); + if (this.chunkcount - this.sortbound > 1200) sort(Math.min(a.length, this.chunksize)); //if ((this.chunkcount - this.sortbound) / (this.chunkcount + 1) * 100 > 20) sort(); // first try to find in sorted area - int p = iterativeSearch(a); + int p = iterativeSearch(a, length); if (p >= 0) return p; // then find in unsorted area - return binarySearch(a); + return binarySearch(a, length); } - private int iterativeSearch(byte[] key) { + private int iterativeSearch(byte[] key, int length) { // returns the chunknumber if (this.order == null) { for (int i = this.sortbound; i < this.chunkcount; i++) { - if (match(key, i)) return i; + if (match(key, length, i)) return i; } return -1; } else { for (int i = this.sortbound; i < this.chunkcount; i++) { - if (compare(key, i) == 0) return i; + if (compare(key, length, i) == 0) return i; } return -1; } } - private int binarySearch(byte[] key) { + private int binarySearch(byte[] key, int length) { assert (this.order != null); int l = 0; int rbound = this.sortbound; @@ -250,7 +282,7 @@ public class kelondroCollection { int d; while (l < rbound) { p = l + ((rbound - l) >> 1); - d = compare(key, p); + d = compare(key, length, p); if (d == 0) return p; else if (d < 0) rbound = p; else l = p + 1; @@ -258,90 +290,115 @@ public class kelondroCollection { return -1; } - public void sort() { + public void sort(kelondroOrder newOrder, int keylen) { + if (this.order == null) { + this.order = newOrder; + this.sortbound = 0; + } else if (!(this.order.signature().equals(newOrder.signature()))) { + this.order = newOrder; + this.sortbound = 0; + } + sort(keylen); + } + + private void sort(int keylen) { + assert (this.order != null); if (this.sortbound == this.chunkcount) return; // this is already sorted //System.out.println("SORT"); - if (this.sortbound > 1) qsort(0, this.sortbound, this.chunkcount); - else qsort(0, this.chunkcount); + if (this.sortbound > 1) { + qsort(keylen, 0, this.sortbound, this.chunkcount); + } else { + qsort(keylen, 0, this.chunkcount); + } this.sortbound = this.chunkcount; } - private void qsort(int l, int sbound, int rbound) { - //System.out.println("QSORT: chunkcache.length=" + chunkcache.length + ", chunksize=" + chunksize + ", l=" + l + ", sbound=" + sbound + ", rbound=" + rbound); - assert (sbound <= rbound); - if (l >= rbound - 1) return; - - if (rbound - l < 1000) { - isort(l, rbound); + private void qsort(int keylen, int L, int S, int R) { + //System.out.println("QSORT: chunkcache.length=" + chunkcache.length + ", chunksize=" + chunksize + ", L=" + L + ", S=" + S + ", R=" + R); + assert (S <= R); + if (L >= R - 1) return; + if (S >= R) return; + + if (R - L < 20) { + isort(keylen, L, R); return; } - int p = l + ((sbound - l) / 2); - int q = sbound; - int qs = q; - byte[] a = new byte[chunksize]; - try { - System.arraycopy(chunkcache, p * chunksize, a, 0, chunksize); - } catch (ArrayIndexOutOfBoundsException e) { - System.out.println("EXCEPTION: chunkcache.length=" + chunkcache.length + ", p=" + p + ", chunksize=" + chunksize + ", l=" + l + ", sbound=" + sbound + ", rbound=" + rbound); - System.exit(-1); - } - p++; + int p = L + ((S - L) / 2); int ps = p; - while (q < rbound) { - if (compare(a, q) < 1) { + int q = S; + int qs = q; + int pivot = p; + while (q < R) { + if (compare(pivot, q, keylen) < 1) { q++; } else { - swap(p, q); + pivot = swap(p, q, pivot); p++; q++; } } - if (qs < p) qs = p; - if ((ps - l) <= ((p - l) / 2)) qsort(l, p); else qsort(l, ps, p); - if ((qs - p) <= ((q - p) / 2)) qsort(p, q); else qsort(p, qs, q); + if ((ps - L) <= ((p - L) / 2)) qsort(keylen, L, p); else qsort(keylen, L, ps, p); + if ((qs - p) <= ((R - p) / 2)) qsort(keylen, p, R); else qsort(keylen, p, qs, R); } - private void qsort(int l, int rbound) { - if (l >= rbound - 1) return; + private void qsort(int keylen, int L, int R) { + //System.out.println("QSORT: chunkcache.length=" + chunkcache.length + ", chunksize=" + chunksize + ", L=" + L + "/" + new String(this.chunkcache, L * this.chunksize, this.chunksize) + ", R=" + R + "/" + new String(this.chunkcache, (R - 1) * this.chunksize, this.chunksize)); + /* + if ((L == 190) && (R == 258)) { + for (int i = L; i < R; i++) { + System.out.print(new String(this.chunkcache, L * this.chunksize, this.chunksize) + ", "); + } + System.out.println(); + } + */ + if (L >= R - 1) return; - if (rbound - l < 10) { - isort(l, rbound); + if (R - L < 20) { + isort(keylen, L, R); return; } - int i = l; - int j = rbound - 1; - byte[] a = new byte[chunksize]; + int i = L; + int j = R - 1; int pivot = (i + j) / 2; - System.arraycopy(chunkcache, pivot * chunksize, a, 0, chunksize); while (i <= j) { - while (compare(a, i) == 1) i++; // chunkAt[i] < keybuffer - while (compare(a, j) == -1) j--; // chunkAt[j] > keybuffer + while (compare(pivot, i, keylen) == 1) i++; // chunkAt[i] < keybuffer + while (compare(pivot, j, keylen) == -1) j--; // chunkAt[j] > keybuffer if (i <= j) { - swap(i, j); + pivot = swap(i, j, pivot); i++; j--; } } - qsort(l, i); - qsort(i, rbound); + qsort(keylen, L, i); + qsort(keylen, i, R); } - private void isort(int l, int rbound) { - for (int i = l + 1; i < rbound; i++) - for (int j = i; j > l && compare(j - 1, j) > 0; j--) - swap(j, j - 1); + private void isort(int keylen, int L, int R) { + for (int i = L + 1; i < R; i++) + for (int j = i; j > L && compare(j - 1, j, keylen) > 0; j--) + swap(j, j - 1, 0); } - private void swap(int i, int j) { - byte[] a = new byte[chunksize]; - System.arraycopy(chunkcache, chunksize * i, a, 0, chunksize); - System.arraycopy(chunkcache, chunksize * j , chunkcache, chunksize * i, chunksize); - System.arraycopy(a, 0, chunkcache, chunksize * j, chunksize); + private int swap(int i, int j, int p) { + if (i == j) return p; + if (this.chunkcount * this.chunksize < this.chunkcache.length) { + // there is space in the chunkcache that we can use as buffer + System.arraycopy(chunkcache, chunksize * i, chunkcache, chunkcache.length - chunksize, chunksize); + System.arraycopy(chunkcache, chunksize * j , chunkcache, chunksize * i, chunksize); + System.arraycopy(chunkcache, chunkcache.length - chunksize, chunkcache, chunksize * j, chunksize); + } else { + // allocate a chunk to use as buffer + byte[] a = new byte[chunksize]; + System.arraycopy(chunkcache, chunksize * i, a, 0, chunksize); + System.arraycopy(chunkcache, chunksize * j , chunkcache, chunksize * i, chunksize); + System.arraycopy(a, 0, chunkcache, chunksize * j, chunksize); + } + if (i == p) return j; else if (j == p) return i; else return p; } - public void uniq() { + public void uniq(int keylength) { assert (this.order != null); // removes double-occurrences of chunks // this works only if the collection was ordered with sort before @@ -349,7 +406,8 @@ public class kelondroCollection { if (chunkcount <= 1) return; int i = 0; while (i < chunkcount - 1) { - if (compare(i, i + 1) == 0) { + if (compare(i, i + 1, Math.min(keylength, this.chunksize)) == 0) { + //System.out.println("DOUBLE: " + new String(this.chunkcache, this.chunksize * i, this.chunksize)); remove(i); } else { i++; @@ -370,37 +428,37 @@ public class kelondroCollection { return this.chunkcache; } - public boolean match(byte[] a, int chunknumber) { + public boolean match(byte[] a, int length, int chunknumber) { if (chunknumber >= chunkcount) return false; int i = 0; int p = chunknumber * chunksize; - final int len = a.length; - if (len > chunksize) return false; - while (i < len) - if (a[i++] != chunkcache[p++]) return false; + final int len = Math.min(length, a.length); + while (i < len) if (a[i++] != chunkcache[p++]) return false; return true; } - public int compare(byte[] a, int chunknumber) { + public int compare(byte[] a, int length, int chunknumber) { assert (chunknumber < chunkcount); - int l = Math.min(a.length, chunksize); - return this.order.compare(a, 0, a.length, chunkcache, chunknumber * chunksize, l); + int l = Math.min(this.chunksize, Math.min(a.length, length)); + return this.order.compare(a, 0, l, chunkcache, chunknumber * chunksize, l); } - public int compare(int i, int j) { + public int compare(int i, int j, int keylength) { // this can be enhanced assert (i < chunkcount); assert (j < chunkcount); - return this.order.compare(chunkcache, i * chunksize, chunksize, chunkcache, j * chunksize, chunksize); + if (i == j) return 0; + return this.order.compare(chunkcache, i * chunksize, keylength, chunkcache, j * chunksize, keylength); } public static void main(String[] args) { String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" }; - kelondroCollection c = new kelondroCollection(10, 0, kelondroNaturalOrder.naturalOrder, new byte[0]); - for (int i = 0; i < test.length; i++) c.add(test[i].getBytes()); - for (int i = 0; i < test.length; i++) c.add(test[i].getBytes()); - c.sort(); - c.remove("fuenf".getBytes()); + kelondroCollection c = new kelondroCollection(10, 0); + c.setOrdering(kelondroNaturalOrder.naturalOrder); + for (int i = 0; i < test.length; i++) c.add(test[i].getBytes(), 10); + for (int i = 0; i < test.length; i++) c.add(test[i].getBytes(), 10); + c.sort(10); + c.remove("fuenf".getBytes(), 5); Iterator i = c.elements(); String s; System.out.print("INPUT-ITERATOR: "); @@ -411,27 +469,56 @@ public class kelondroCollection { } System.out.println(""); System.out.println("INPUT-TOSTRING: " + c.toString()); - c.sort(); + c.sort(10); System.out.println("SORTED : " + c.toString()); - c.uniq(); + c.uniq(10); System.out.println("UNIQ : " + c.toString()); c.trim(); System.out.println("TRIM : " + c.toString()); - c = new kelondroCollection(10, 0, kelondroNaturalOrder.naturalOrder, new byte[0]); + + // second test + c = new kelondroCollection(10, 20); + c.setOrdering(kelondroNaturalOrder.naturalOrder); + Random rand = new Random(0); long start = System.currentTimeMillis(); long t, d = 0; - byte[] w; - for (long k = 0; k < 200000; k++) { + String w; + for (long k = 0; k < 60000; k++) { + t = System.currentTimeMillis(); + w = "a" + Long.toString(rand.nextLong()); + c.add(w.getBytes(), 10); + if (k % 10000 == 0) + System.out.println("added " + k + " entries in " + + ((t - start) / 1000) + " seconds, " + + (((t - start) > 1000) ? (k / ((t - start) / 1000)) : k) + + " entries/second, size = " + c.size()); + } + System.out.println("bevore sort: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); + c.sort(10); + System.out.println("after sort: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); + c.uniq(10); + System.out.println("after uniq: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); + System.out.println("RESULT SIZE: " + c.size()); + System.out.println(); + + // third test + c = new kelondroCollection(10, 60000); + c.setOrdering(kelondroNaturalOrder.naturalOrder); + rand = new Random(0); + start = System.currentTimeMillis(); + d = 0; + for (long k = 0; k < 60000; k++) { t = System.currentTimeMillis(); - w = ("a" + Long.toString((t % 13775) + k)).getBytes(); - if (c.get(w) == null) c.add(w); else d++; - if (k % 1000 == 0) + w = "a" + Long.toString(rand.nextLong()); + if (c.get(w.getBytes(), 10) == null) c.add(w.getBytes(), 10); else d++; + if (k % 10000 == 0) System.out.println("added " + k + " entries in " + ((t - start) / 1000) + " seconds, " + - (((t - start) > 1000) ? (k / ((t - start) / 1000)) : 0) + + (((t - start) > 1000) ? (k / ((t - start) / 1000)) : k) + " entries/second, " + d + " double, size = " + c.size() + ", sum = " + (c.size() + d)); } + System.out.println("RESULT SIZE: " + c.size()); } } diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index 7b1c767f2..400dc3b3c 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -194,7 +194,7 @@ public class kelondroCollectionIndex { // read the row and define a collection int chunkcountInArray = (int) arrayrow.getColLongB256(1); if (chunkcountInArray != chunkcount) throw new kelondroException(arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, partitionnumber).toString(), "array has different chunkcount than index: index = " + chunkcount + ", array = " + chunkcountInArray); - return new kelondroCollection(chunksize, chunkcount, null /*, arrayrow.getColString(2, null)*/, arrayrow.getColBytes(3)); + return new kelondroCollection(chunksize, chunkcount, arrayrow.getColBytes(3)); } public void remove(byte[] key) throws IOException { diff --git a/source/de/anomic/kelondro/kelondroFlexTable.java b/source/de/anomic/kelondro/kelondroFlexTable.java index 4f52623e1..0278e87c5 100644 --- a/source/de/anomic/kelondro/kelondroFlexTable.java +++ b/source/de/anomic/kelondro/kelondroFlexTable.java @@ -27,23 +27,31 @@ package de.anomic.kelondro; import java.io.File; import java.io.IOException; -import java.util.HashMap; public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondroIndex { - private HashMap index; + private kelondroCollection index; + private kelondroRow indexrow; public kelondroFlexTable(File path, String tablename, kelondroRow rowdef, boolean exitOnFail) throws IOException { super(path, tablename, rowdef, exitOnFail); // fill the index - this.index = new HashMap(); + this.index = new kelondroCollection(super.row().width(0) + 4); /* kelondroFixedWidthArray indexArray = new kelondroFixedWidthArray(new File(path, colfilename(0,0))); for (int i = 0; i < indexArray.size(); i++) index.put(indexArray.get(i).getColBytes(0), new Integer(i)); indexArray.close(); */ - for (int i = 0; i < super.col[0].size(); i++) index.put(super.col[0].get(i).getColBytes(0), new Integer(i)); + this.indexrow = new kelondroRow(new int[]{super.row().width(0), 4}); + kelondroRow.Entry indexentry; + for (int i = 0; i < super.col[0].size(); i++) { + indexentry = indexrow.newEntry(); + indexentry.setCol(0, super.col[0].get(i).getColBytes(0)); + indexentry.setColLongB256(1, i); + index.add(indexentry.bytes()); + } + this.index.setOrdering(kelondroNaturalOrder.naturalOrder); } /* @@ -62,27 +70,29 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr */ public kelondroRow.Entry get(byte[] key) throws IOException { - Integer i = (Integer) this.index.get(key); - if (i == null) return null; - return super.get(i.intValue()); + kelondroRow.Entry indexentry = this.indexrow.newEntry(this.index.get(key)); + if (indexentry == null) return null; + return super.get((int) indexentry.getColLongB256(1)); } public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { - Integer i = (Integer) this.index.get(row.getColBytes(0)); - if (i == null) { - i = new Integer(super.add(row)); - this.index.put(row.getColBytes(0), i); + kelondroRow.Entry indexentry = this.indexrow.newEntry(this.index.get(row.getColBytes(0))); + if (indexentry == null) { + indexentry = indexrow.newEntry(); + indexentry.setCol(0, row.getColBytes(0)); + indexentry.setColLongB256(1, super.add(row)); + index.add(indexentry.bytes()); return null; } else { - return super.set(i.intValue(), row); + return super.set((int) indexentry.getColLongB256(1), row); } } public kelondroRow.Entry remove(byte[] key) throws IOException { - Integer i = (Integer) this.index.get(key); - if (i == null) return null; - kelondroRow.Entry r = super.get(i.intValue()); - super.remove(i.intValue()); + kelondroRow.Entry indexentry = this.indexrow.newEntry(this.index.get(key)); + if (indexentry == null) return null; + kelondroRow.Entry r = super.get((int) indexentry.getColLongB256(1)); + super.remove((int) indexentry.getColLongB256(1)); return r; } diff --git a/source/de/anomic/kelondro/kelondroNaturalOrder.java b/source/de/anomic/kelondro/kelondroNaturalOrder.java index fa171a929..4fcac9fc4 100644 --- a/source/de/anomic/kelondro/kelondroNaturalOrder.java +++ b/source/de/anomic/kelondro/kelondroNaturalOrder.java @@ -161,10 +161,13 @@ public class kelondroNaturalOrder extends kelondroAbstractOrder implements kelon int i = 0; final int al = Math.min(alength, a.length - aoffset); final int bl = Math.min(blength, b.length - boffset); - final int len = (al > bl) ? bl : al; + final int len = Math.min(al, bl); + byte aa, bb; while (i < len) { - if (a[i + aoffset] > b[i + boffset]) return 1; - if (a[i + aoffset] < b[i + boffset]) return -1; + aa = a[i + aoffset]; + bb = b[i + boffset]; + if (aa > bb) return 1; + if (aa < bb) return -1; // else the bytes are equal and it may go on yet undecided i++; } diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index b92f43052..ca5885e4f 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -205,6 +205,7 @@ public class yacySearch extends Thread { } public static int remainingWaiting(yacySearch[] searchThreads) { + if (searchThreads == null) return 0; int alive = 0; for (int i = 0; i < searchThreads.length; i++) { if (searchThreads[i].isAlive()) alive++;