From f6d989aa0474f1e41b4227b774eb7ac98b4fe239 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 12 Mar 2009 23:05:18 +0000 Subject: [PATCH] added new class RowSetArray, which arranges RowSet objects like elements in a hash table but still provides sorted enumeration. The new class is now integrated into ObjectIndexCache, which is the core class that provides index functions to all database files. The new index access is about twice as fast as before, which noticeably speeds up all parts of YaCy. The speed of the kelondro indexing class ObjectIndexCache can be compared with Java's standard TreeMap using the main method in IntegerHandleIndex. The result is that the kelondro indexing needs only 1/5 of the memory that TreeMap uses! In exchange, the kelondro classes are about four (!) times slower than TreeMap. However, this is an acceptable trade-off, because the better use of memory is a strong advantage and makes it possible for YaCy to maintain such a large number of documents (> 50 million) in one peer. 
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5705 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/Balancer.java | 2 +- .../kelondro/index/IntegerHandleIndex.java | 6 +- .../kelondro/index/ObjectIndexCache.java | 36 +-- .../de/anomic/kelondro/index/RowSetArray.java | 205 ++++++++++++++++++ .../anomic/kelondro/order/MergeIterator.java | 4 + .../de/anomic/kelondro/table/SplitTable.java | 4 +- .../kelondro/text/MetadataRepository.java | 2 +- 7 files changed, 238 insertions(+), 21 deletions(-) create mode 100644 source/de/anomic/kelondro/index/RowSetArray.java diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index d3c340a18..2503d947b 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -85,7 +85,7 @@ public class Balancer { try { final Iterator i = urlFileIndex.keys(true, null); byte[] hash; - while (i.hasNext()) { + while (i != null && i.hasNext()) { hash = i.next(); pushHashToDomainStacks(new String(hash), true); } diff --git a/source/de/anomic/kelondro/index/IntegerHandleIndex.java b/source/de/anomic/kelondro/index/IntegerHandleIndex.java index befcf53f2..ba18264a0 100644 --- a/source/de/anomic/kelondro/index/IntegerHandleIndex.java +++ b/source/de/anomic/kelondro/index/IntegerHandleIndex.java @@ -33,9 +33,9 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; -import java.util.HashMap; import java.util.Iterator; import java.util.Random; +import java.util.TreeMap; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; @@ -350,7 +350,7 @@ public class IntegerHandleIndex { Integer d; System.gc(); // for resource measurement a = MemoryControl.available(); - HashMap hm = new HashMap(0); + TreeMap hm = new TreeMap(); for (int i = 0; i < count; i++) { hash = FlatWordPartitionScheme.positionToHash(r.nextInt(count)); 
d = hm.get(hash); @@ -364,7 +364,7 @@ public class IntegerHandleIndex { System.out.println("Used Memory: " + memj + " bytes"); System.out.println("x " + hm.get(FlatWordPartitionScheme.positionToHash(0))); System.out.println("Geschwindigkeitsfaktor j/k: " + (timej / timek)); - System.out.println("Speicherfaktor j/k: " + (memj / memk)); + System.out.println("Speicherplatzfaktor j/k: " + (memj / memk)); System.exit(0); } diff --git a/source/de/anomic/kelondro/index/ObjectIndexCache.java b/source/de/anomic/kelondro/index/ObjectIndexCache.java index da431467c..2a992889f 100644 --- a/source/de/anomic/kelondro/index/ObjectIndexCache.java +++ b/source/de/anomic/kelondro/index/ObjectIndexCache.java @@ -24,7 +24,6 @@ package de.anomic.kelondro.index; -import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -36,9 +35,10 @@ import de.anomic.kelondro.order.StackIterator; public class ObjectIndexCache implements ObjectIndex { + private static final int spread = 1000; private final Row rowdef; private RowSet index0; - private RowSet index1; + private RowSetArray index1; private final Row.EntryComparator entryComparator; public ObjectIndexCache(final Row rowdef, final int initialspace) { @@ -66,7 +66,7 @@ public class ObjectIndexCache implements ObjectIndex { // finish initialization phase index0.sort(); index0.uniq(); - index1 = new RowSet(rowdef, 0); + index1 = new RowSetArray(rowdef, 0, spread); } } @@ -166,7 +166,6 @@ public class ObjectIndexCache implements ObjectIndex { if (index1 == null) { return index0.removeDoubles(); } - index1.sort(); ArrayList d0 = index0.removeDoubles(); ArrayList d1 = index1.removeDoubles(); d0.addAll(d1); @@ -214,7 +213,7 @@ public class ObjectIndexCache implements ObjectIndex { // finish initialization phase index0.sort(); index0.uniq(); - index1 = new RowSet(rowdef, 0); + index1 = new RowSetArray(rowdef, 0, spread); return index0.keys(up, firstKey); } assert (index1 != null); @@ -224,11 +223,14 
@@ public class ObjectIndexCache implements ObjectIndex { } // index0 should be sorted // sort index1 to enable working of the merge iterator - index1.sort(); //assert consistencyAnalysis0() : "consistency problem: " + consistencyAnalysis(); + CloneableIterator k0 = index0.keys(up, firstKey); + CloneableIterator k1 = index1.keys(up, firstKey); + if (k0 == null) return k1; + if (k1 == null) return k0; return new MergeIterator( - index0.keys(up, firstKey), - index1.keys(up, firstKey), + k0, + k1, rowdef.objectOrder, MergeIterator.simpleMerge, true); @@ -240,7 +242,7 @@ public class ObjectIndexCache implements ObjectIndex { // finish initialization phase index0.sort(); index0.uniq(); - index1 = new RowSet(rowdef, 0); + index1 = new RowSetArray(rowdef, 0, spread); return index0.rows(up, firstKey); } assert (index1 != null); @@ -250,23 +252,27 @@ public class ObjectIndexCache implements ObjectIndex { } // index0 should be sorted // sort index1 to enable working of the merge iterator - index1.sort(); + //index1.sort(); //assert consistencyAnalysis0() : "consistency problem: " + consistencyAnalysis(); + CloneableIterator k0 = index0.rows(up, firstKey); + CloneableIterator k1 = index1.rows(up, firstKey); + if (k0 == null) return k1; + if (k1 == null) return k0; return new MergeIterator( - index0.rows(up, firstKey), - index1.rows(up, firstKey), + k0, + k1, entryComparator, MergeIterator.simpleMerge, true); } - public synchronized CloneableIterator rows() throws IOException { + public synchronized CloneableIterator rows() { // returns the row-iterator of the underlying kelondroIndex if (index1 == null) { // finish initialization phase index0.sort(); index0.uniq(); - index1 = new RowSet(rowdef, 0); + index1 = new RowSetArray(rowdef, 0, spread); return index0.rows(); } assert (index1 != null); @@ -276,7 +282,7 @@ public class ObjectIndexCache implements ObjectIndex { } // index0 should be sorted // sort index1 to enable working of the merge iterator - index1.sort(); + 
//index1.sort(); //assert consistencyAnalysis0() : "consistency problem: " + consistencyAnalysis(); return new StackIterator(index0.rows(), index1.rows()); } diff --git a/source/de/anomic/kelondro/index/RowSetArray.java b/source/de/anomic/kelondro/index/RowSetArray.java new file mode 100644 index 000000000..37da24e73 --- /dev/null +++ b/source/de/anomic/kelondro/index/RowSetArray.java @@ -0,0 +1,205 @@ +// RowSetArray.java +// -------------------------- +// (C) by Michael Peter Christen; mc@yacy.net +// first published on http://yacy.net +// Frankfurt, Germany, 2009 +// last major change: 12.03.2009 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro.index; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + +import de.anomic.kelondro.index.Row.Entry; +import de.anomic.kelondro.order.CloneableIterator; +import de.anomic.kelondro.order.MergeIterator; +import de.anomic.kelondro.order.StackIterator; + +public class RowSetArray implements ObjectIndex, Iterable { + + private final int objectCount; + private final Row rowdef; + private final RowSet[] array; + + public RowSetArray(final Row rowdef, final int objectCount, final int arraySize) { + this.array = new RowSet[arraySize]; + for (int i = 0; i < arraySize; i++) { + this.array[i] = null; + } + this.rowdef = rowdef; + this.objectCount = objectCount / arraySize; + } + + private int indexFor(byte[] key) { + return (int) (this.rowdef.objectOrder.cardinal(key) % ((long) array.length)); + } + + private int indexFor(Entry row) { + return indexFor(row.getPrimaryKeyBytes()); + } + + private RowSet accessArray(int i) { + RowSet r = this.array[i]; + if (r == null) synchronized (this.array) { + r = new RowSet(this.rowdef, this.objectCount); + this.array[i] = r; + } + return r; + } + + public void addUnique(Entry row) { + accessArray(indexFor(row)).addUnique(row); + } + + public void addUnique(List rows) { + for (Entry row: rows) addUnique(row); + } + + public void clear() { + synchronized (this.array) { + for (int i = 0; i < this.array.length; i++) { + if (this.array[i] != null) this.array[i].clear(); + this.array[i] = null; + } + } + } + + public void close() { + clear(); + } + + public void deleteOnExit() { + // no nothing here + } + + public String filename() { + // we don't have a file name + return null; + } + + public Entry get(byte[] key) { + int i = indexFor(key); 
+ RowSet r = this.array[i]; + if (r == null) return null; + return r.get(key); + } + + public boolean has(byte[] key) { + int i = indexFor(key); + RowSet r = this.array[i]; + if (r == null) return false; + return r.has(key); + } + + public CloneableIterator keys(boolean up, byte[] firstKey) { + synchronized (this.array) { + Collection> col = new ArrayList>(); + for (int i = 0; i < this.array.length; i++) { + if (this.array[i] != null) { + this.array[i].sort(); + col.add(this.array[i].keys(up, firstKey)); + } + } + return MergeIterator.cascade(col, this.rowdef.objectOrder, MergeIterator.simpleMerge, up); + } + } + + public void put(Entry row) { + accessArray(indexFor(row)).put(row); + } + + public void put(List rows) { + for (Entry row: rows) put(row); + } + + public Entry remove(byte[] key) { + return accessArray(indexFor(key)).remove(key); + } + + public ArrayList removeDoubles() { + ArrayList col = new ArrayList(); + synchronized (this.array) { + for (int i = 0; i < this.array.length; i++) { + if (this.array[i] != null) { + col.addAll(this.array[i].removeDoubles()); + if (this.array[i].size() == 0) this.array[i] = null; + } + } + } + return col; + } + + public Entry removeOne() { + synchronized (this.array) { + for (int i = 0; i < this.array.length; i++) { + if (this.array[i] != null) { + Entry entry = this.array[i].removeOne(); + if (this.array[i].size() == 0) this.array[i] = null; + return entry; + } + } + } + return null; + } + + public Entry replace(Entry row) { + return accessArray(indexFor(row)).replace(row); + } + + public Row row() { + return this.rowdef; + } + + public CloneableIterator rows(boolean up, byte[] firstKey) { + synchronized (this.array) { + Collection> col = new ArrayList>(); + for (int i = 0; i < this.array.length; i++) { + if (this.array[i] != null) { + this.array[i].sort(); + col.add(this.array[i].rows(up, firstKey)); + } + } + return StackIterator.stack(col); + } + } + + public CloneableIterator rows() { + return rows(true, null); + } + 
+ public int size() { + int c = 0; + synchronized (this.array) { + for (int i = 0; i < this.array.length; i++) { + if (this.array[i] != null) { + c += this.array[i].size(); + } + } + } + return c; + } + + public Iterator iterator() { + return this.rows(true, null); + } + + public long inc(byte[] key, int col, long add, Entry initrow) { + return accessArray(indexFor(key)).inc(key, col, add, initrow); + } +} diff --git a/source/de/anomic/kelondro/order/MergeIterator.java b/source/de/anomic/kelondro/order/MergeIterator.java index bf9399662..1b99f0aaf 100644 --- a/source/de/anomic/kelondro/order/MergeIterator.java +++ b/source/de/anomic/kelondro/order/MergeIterator.java @@ -45,6 +45,8 @@ public class MergeIterator implements CloneableIterator { final Method m, final boolean up) { // this works currently only for String-type key iterations + assert a != null; + assert b != null; this.a = a; this.b = b; this.up = up; @@ -55,6 +57,8 @@ public class MergeIterator implements CloneableIterator { } public MergeIterator clone(final Object modifier) { + assert a != null; + assert b != null; return new MergeIterator(a.clone(modifier), b.clone(modifier), comp, merger, up); } diff --git a/source/de/anomic/kelondro/table/SplitTable.java b/source/de/anomic/kelondro/table/SplitTable.java index fdc072cec..ea94617c4 100644 --- a/source/de/anomic/kelondro/table/SplitTable.java +++ b/source/de/anomic/kelondro/table/SplitTable.java @@ -389,8 +389,10 @@ public class SplitTable implements ObjectIndex { public synchronized CloneableIterator keys(final boolean up, final byte[] firstKey) throws IOException { final List> c = new ArrayList>(tables.size()); final Iterator i = tables.values().iterator(); + CloneableIterator k; while (i.hasNext()) { - c.add(i.next().keys(up, firstKey)); + k = i.next().keys(up, firstKey); + if (k != null) c.add(k); } return MergeIterator.cascade(c, rowdef.objectOrder, MergeIterator.simpleMerge, up); } diff --git 
a/source/de/anomic/kelondro/text/MetadataRepository.java b/source/de/anomic/kelondro/text/MetadataRepository.java index fc8baa9a4..bf616dd0f 100644 --- a/source/de/anomic/kelondro/text/MetadataRepository.java +++ b/source/de/anomic/kelondro/text/MetadataRepository.java @@ -640,7 +640,7 @@ public final class MetadataRepository implements Iterable { ArrayList l = new ArrayList(); CloneableIterator i = this.urlIndexFile.keys(true, null); String hash; - while (i.hasNext()) { + while (i != null && i.hasNext()) { hash = new String(i.next()); if (hosthash.equals(hash.substring(6))) l.add(hash); }