From ecb4986b38babe92efc7f2c63560cc47c03c0c5f Mon Sep 17 00:00:00 2001 From: sixcooler Date: Wed, 7 Sep 2011 18:55:16 +0000 Subject: [PATCH] refactored stuff from last commit to ReferenceContainer see: http://forum.yacy-websuche.de/viewtopic.php?f=5&t=3353&p=23163#p23163 the limiting of references is disabled per default to enable this set yacy.conf - index.maxReferences to a value of e.g. 100000 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7935 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 5 ++ source/net/yacy/kelondro/blob/ArrayStack.java | 56 ++----------------- source/net/yacy/kelondro/index/RowSet.java | 1 + .../yacy/kelondro/rwi/ReferenceContainer.java | 48 ++++++++++++++++ 4 files changed, 60 insertions(+), 50 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index eed964a98..a604632ed 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -808,6 +808,11 @@ indexDistribution.maxChunkSize = 1000 indexDistribution.startChunkSize = 200 indexDistribution.maxChunkFails = 1 +# limit the references per term & blob to the youngest ones, up to this value +# a value of <= 0 disables this feature (no limit) +# a value of e.g. 
100000 can improve stability and reduce load while searching very popular words +index.maxReferences = 0 + # Search sequence settings # collection: # time = time to get a RWI out of RAM cache, assortments and WORDS files diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java index 20412a48c..047151945 100755 --- a/source/net/yacy/kelondro/blob/ArrayStack.java +++ b/source/net/yacy/kelondro/blob/ArrayStack.java @@ -30,8 +30,6 @@ import java.io.File; import java.io.IOException; import java.text.ParseException; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.Iterator; @@ -996,7 +994,7 @@ public class ArrayStack implements BLOB { assert c2 != null; e = ordering.compare(c1.getTermHash(), c2.getTermHash()); if (e < 0) { - shrink(c1); + c1.shrinkReferences(); writer.add(c1.getTermHash(), c1.exportCollection()); if (i1.hasNext()) { c1lh = c1.getTermHash(); @@ -1008,7 +1006,7 @@ public class ArrayStack implements BLOB { break; } if (e > 0) { - shrink(c2); + c2.shrinkReferences(); writer.add(c2.getTermHash(), c2.exportCollection()); if (i2.hasNext()) { c2lh = c2.getTermHash(); @@ -1022,7 +1020,7 @@ public class ArrayStack implements BLOB { assert e == 0; // merge the entries c1 = c1.merge(c2); - shrink(c1); + c1.shrinkReferences(); writer.add(c1.getTermHash(), c1.exportCollection()); c1lh = c1.getTermHash(); c2lh = c2.getTermHash(); @@ -1051,7 +1049,7 @@ public class ArrayStack implements BLOB { assert (c1 == null) || (c2 == null); while (c1 != null) { //System.out.println("FLUSH REMAINING 1: " + c1.getWordHash()); - shrink(c1); + c1.shrinkReferences(); writer.add(c1.getTermHash(), c1.exportCollection()); if (i1.hasNext()) { c1lh = c1.getTermHash(); @@ -1063,7 +1061,7 @@ public class ArrayStack implements BLOB { } while (c2 != null) { //System.out.println("FLUSH REMAINING 2: " + c2.getWordHash()); - shrink(c2); + 
c2.shrinkReferences(); writer.add(c2.getTermHash(), c2.exportCollection()); if (i2.hasNext()) { c2lh = c2.getTermHash(); @@ -1085,7 +1083,7 @@ public class ArrayStack implements BLOB { c = i.next(); while (true) { assert c != null; - shrink(c); + c.shrinkReferences(); writer.add(c.getTermHash(), c.exportCollection()); if (i.hasNext()) { clh = c.getTermHash(); @@ -1097,48 +1095,6 @@ public class ArrayStack implements BLOB { } // finished with writing } - - private static void shrink(final ReferenceContainer c) { - final int diff = c.size() - 100000; - if (diff <= 0) return; - final int[] indexes = oldPostions(diff, c); - Arrays.sort(indexes); - for (int i = indexes.length - 1; i >= 0; i--) { - if (indexes[i] < 0) break; - c.removeRow(indexes[i], false); - } - c.sort(); - } - - private static int[] oldPostions(final int count, final ReferenceContainer c) { - final int[] indexes = new int[count]; - int i = 0; - for (final List positions : positionsByLastMod(c)) { - for (final Integer pos : positions) { - indexes[i++] = pos; - if (i >= count) return indexes; - } - } - return indexes; - } - - private static Collection> positionsByLastMod(final ReferenceContainer c) { - long mod; - List positions; - ReferenceType r; - final TreeMap> tm = new TreeMap>(); - final Iterator i = c.entries(); - int pos = 0; - while (i.hasNext()) { - r = i.next(); - mod = r.lastModified(); - positions = tm.get(mod); - if (positions == null) positions = new ArrayList(); - positions.add(pos++); - tm.put(mod, positions); - } - return tm.values(); - } public static void main(final String[] args) { final File f = new File("/Users/admin/blobarraytest"); diff --git a/source/net/yacy/kelondro/index/RowSet.java b/source/net/yacy/kelondro/index/RowSet.java index bbc1cccc6..2e59ab419 100644 --- a/source/net/yacy/kelondro/index/RowSet.java +++ b/source/net/yacy/kelondro/index/RowSet.java @@ -236,6 +236,7 @@ public class RowSet extends RowCollection implements Index, Iterable } } + // perhaps not used - see 
ReferenceContainer.shrinkReferences() public final synchronized void delete(final List keys) { final int[] indexes = new int[keys.size()]; for (int i = 0; i < keys.size(); i++) { diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainer.java b/source/net/yacy/kelondro/rwi/ReferenceContainer.java index 0e5b2b06f..7264673db 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainer.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainer.java @@ -27,11 +27,16 @@ package net.yacy.kelondro.rwi; import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.ConcurrentModificationException; import java.util.Iterator; +import java.util.List; import java.util.TreeMap; +import de.anomic.search.Switchboard; + import net.yacy.cora.document.ASCII; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.Row; @@ -53,6 +58,7 @@ public class ReferenceContainer extends RowSet private byte[] termHash; protected ReferenceFactory factory; + private static int maxReferences = Switchboard.getSwitchboard().getConfigInt("index.maxReferences", 0); public ReferenceContainer(final ReferenceFactory factory, final byte[] termHash, final RowSet collection) { super(collection); @@ -185,6 +191,48 @@ public class ReferenceContainer extends RowSet while (i.hasNext()) count += (delete(i.next())) ? 
1 : 0; return count; } + + /** Trims this container down to maxReferences entries by removing the entries with the smallest lastModified() values. Returns without changes when maxReferences <= 0 (limit disabled) or when size() does not exceed maxReferences. Calls sort() on the container after the removals. */ public void shrinkReferences() { + final int diff = this.size() - maxReferences; + if (maxReferences <= 0 || diff <= 0) return; + final int[] indexes = oldPostions(diff); + Arrays.sort(indexes); + /* walk from the highest position down - presumably so that a removal does not shift the positions that are still to be removed; confirm against removeRow */ for (int i = indexes.length - 1; i >= 0; i--) { + /* NOTE(review): positions come from a counter that starts at 0, so the code visible here never produces a negative value for this guard */ if (indexes[i] < 0) break; + this.removeRow(indexes[i], false); + } + this.sort(); + } + + /** Collects the first count positions delivered by positionsByLastMod(), in delivery order (not sorted by position). Slots that are never filled keep the int default 0. @param count number of positions to collect @return an array of length count holding the collected positions */ private int[] oldPostions(final int count) { + final int[] indexes = new int[count]; + int i = 0; + for (final List positions : positionsByLastMod()) { + for (final Integer pos : positions) { + indexes[i++] = pos; + if (i >= count) return indexes; + } + } + return indexes; + } + + /** Groups the iteration positions of entries() by the lastModified() value of each entry. @return the position lists; they are the values() of a TreeMap keyed by lastModified, so they are iterated in ascending lastModified order. NOTE(review): the generic type arguments look stripped in this copy of the patch - presumably Collection<List<Integer>> and TreeMap<Long, List<Integer>>; confirm against the repository. */ private Collection> positionsByLastMod() { + long mod; + List positions; + ReferenceType r; + final TreeMap> tm = new TreeMap>(); + final Iterator i = this.entries(); + int pos = 0; + while (i.hasNext()) { + r = i.next(); + mod = r.lastModified(); + positions = tm.get(mod); + if (positions == null) positions = new ArrayList(); + /* pos is the running index within entries(); assumes it equals the row index that removeRow expects - TODO confirm */ positions.add(pos++); + /* put is repeated on every pass, also when the list was already mapped */ tm.put(mod, positions); + } + return tm.values(); + } public Iterator entries() { // returns an iterator of indexRWIEntry objects