refactored stuff from last commit to ReferenceContainer

see: http://forum.yacy-websuche.de/viewtopic.php?f=5&t=3353&p=23163#p23163
Limiting the number of references is disabled by default.
To enable it, set index.maxReferences in yacy.conf to a positive value, e.g. 100000.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7935 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
sixcooler 13 years ago
parent f7c4abfdd7
commit ecb4986b38

@ -808,6 +808,11 @@ indexDistribution.maxChunkSize = 1000
indexDistribution.startChunkSize = 200
indexDistribution.maxChunkFails = 1
# limit the references kept per term & blob to the youngest N entries, where N is this value
# a value of <= 0 disables this feature (no limit)
# a value of e.g. 100000 can improve stability and reduce load while searching very popular words
index.maxReferences = 0
# Search sequence settings
# collection:
# time = time to get a RWI out of RAM cache, assortments and WORDS files

@ -30,8 +30,6 @@ import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
@ -996,7 +994,7 @@ public class ArrayStack implements BLOB {
assert c2 != null;
e = ordering.compare(c1.getTermHash(), c2.getTermHash());
if (e < 0) {
shrink(c1);
c1.shrinkReferences();
writer.add(c1.getTermHash(), c1.exportCollection());
if (i1.hasNext()) {
c1lh = c1.getTermHash();
@ -1008,7 +1006,7 @@ public class ArrayStack implements BLOB {
break;
}
if (e > 0) {
shrink(c2);
c2.shrinkReferences();
writer.add(c2.getTermHash(), c2.exportCollection());
if (i2.hasNext()) {
c2lh = c2.getTermHash();
@ -1022,7 +1020,7 @@ public class ArrayStack implements BLOB {
assert e == 0;
// merge the entries
c1 = c1.merge(c2);
shrink(c1);
c1.shrinkReferences();
writer.add(c1.getTermHash(), c1.exportCollection());
c1lh = c1.getTermHash();
c2lh = c2.getTermHash();
@ -1051,7 +1049,7 @@ public class ArrayStack implements BLOB {
assert (c1 == null) || (c2 == null);
while (c1 != null) {
//System.out.println("FLUSH REMAINING 1: " + c1.getWordHash());
shrink(c1);
c1.shrinkReferences();
writer.add(c1.getTermHash(), c1.exportCollection());
if (i1.hasNext()) {
c1lh = c1.getTermHash();
@ -1063,7 +1061,7 @@ public class ArrayStack implements BLOB {
}
while (c2 != null) {
//System.out.println("FLUSH REMAINING 2: " + c2.getWordHash());
shrink(c2);
c2.shrinkReferences();
writer.add(c2.getTermHash(), c2.exportCollection());
if (i2.hasNext()) {
c2lh = c2.getTermHash();
@ -1085,7 +1083,7 @@ public class ArrayStack implements BLOB {
c = i.next();
while (true) {
assert c != null;
shrink(c);
c.shrinkReferences();
writer.add(c.getTermHash(), c.exportCollection());
if (i.hasNext()) {
clh = c.getTermHash();
@ -1097,48 +1095,6 @@ public class ArrayStack implements BLOB {
}
// finished with writing
}
private static <ReferenceType extends Reference> void shrink(final ReferenceContainer<ReferenceType> c) {
final int diff = c.size() - 100000;
if (diff <= 0) return;
final int[] indexes = oldPostions(diff, c);
Arrays.sort(indexes);
for (int i = indexes.length - 1; i >= 0; i--) {
if (indexes[i] < 0) break;
c.removeRow(indexes[i], false);
}
c.sort();
}
private static <ReferenceType extends Reference> int[] oldPostions(final int count, final ReferenceContainer<ReferenceType> c) {
final int[] indexes = new int[count];
int i = 0;
for (final List<Integer> positions : positionsByLastMod(c)) {
for (final Integer pos : positions) {
indexes[i++] = pos;
if (i >= count) return indexes;
}
}
return indexes;
}
private static <ReferenceType extends Reference> Collection<List<Integer>> positionsByLastMod(final ReferenceContainer<ReferenceType> c) {
long mod;
List<Integer> positions;
ReferenceType r;
final TreeMap<Long, List<Integer>> tm = new TreeMap<Long, List<Integer>>();
final Iterator<ReferenceType> i = c.entries();
int pos = 0;
while (i.hasNext()) {
r = i.next();
mod = r.lastModified();
positions = tm.get(mod);
if (positions == null) positions = new ArrayList<Integer>();
positions.add(pos++);
tm.put(mod, positions);
}
return tm.values();
}
public static void main(final String[] args) {
final File f = new File("/Users/admin/blobarraytest");

@ -236,6 +236,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
}
}
// perhaps not used - see ReferenceContainer.shrinkReferences()
public final synchronized void delete(final List<byte[]> keys) {
final int[] indexes = new int[keys.size()];
for (int i = 0; i < keys.size(); i++) {

@ -27,11 +27,16 @@
package net.yacy.kelondro.rwi;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;
import de.anomic.search.Switchboard;
import net.yacy.cora.document.ASCII;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.Row;
@ -53,6 +58,7 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
private byte[] termHash;
protected ReferenceFactory<ReferenceType> factory;
private static int maxReferences = Switchboard.getSwitchboard().getConfigInt("index.maxReferences", 0);
public ReferenceContainer(final ReferenceFactory<ReferenceType> factory, final byte[] termHash, final RowSet collection) {
super(collection);
@ -185,6 +191,48 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
while (i.hasNext()) count += (delete(i.next())) ? 1 : 0;
return count;
}
public void shrinkReferences() {
final int diff = this.size() - maxReferences;
if (maxReferences <= 0 || diff <= 0) return;
final int[] indexes = oldPostions(diff);
Arrays.sort(indexes);
for (int i = indexes.length - 1; i >= 0; i--) {
if (indexes[i] < 0) break;
this.removeRow(indexes[i], false);
}
this.sort();
}
private int[] oldPostions(final int count) {
final int[] indexes = new int[count];
int i = 0;
for (final List<Integer> positions : positionsByLastMod()) {
for (final Integer pos : positions) {
indexes[i++] = pos;
if (i >= count) return indexes;
}
}
return indexes;
}
private Collection<List<Integer>> positionsByLastMod() {
long mod;
List<Integer> positions;
ReferenceType r;
final TreeMap<Long, List<Integer>> tm = new TreeMap<Long, List<Integer>>();
final Iterator<ReferenceType> i = this.entries();
int pos = 0;
while (i.hasNext()) {
r = i.next();
mod = r.lastModified();
positions = tm.get(mod);
if (positions == null) positions = new ArrayList<Integer>();
positions.add(pos++);
tm.put(mod, positions);
}
return tm.values();
}
public Iterator<ReferenceType> entries() {
// returns an iterator of indexRWIEntry objects

Loading…
Cancel
Save