second part of the 'doubles' fix: better handling of doubles in RAMIndex, plus more logging.

still missing: deletion of double entries in collections

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5613 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: orbiter
Parent: 59427064fb
Commit: a9ad863686
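
For orientation before the diff: a "double" in this code base is a group of entries that share the same primary key, and the removeDoubles() calls below return exactly such groups (as RowCollection sets or as Integer[] arrays of file positions). The map-based helper below only illustrates that grouping idea; it is not YaCy code:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

// Illustration only: group rows by primary key and keep the keys that occur
// more than once; this mirrors what the various removeDoubles() methods report.
final class DoubleGrouping {
    static Map<String, List<String>> findDoubles(final List<String[]> rows) {
        final Map<String, List<String>> byKey = new HashMap<String, List<String>>();
        for (final String[] row : rows) {
            final String key = row[0];            // row[0]: primary key, row[1]: payload
            List<String> group = byKey.get(key);
            if (group == null) { group = new ArrayList<String>(); byKey.put(key, group); }
            group.add(row[1]);
        }
        final Iterator<List<String>> it = byKey.values().iterator();
        while (it.hasNext()) if (it.next().size() < 2) it.remove(); // keep only real doubles
        return byKey;
    }
}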

@@ -44,6 +44,7 @@ public interface indexRI {
 
     public boolean removeEntry(String wordHash, String urlHash) throws IOException;
     public int removeEntries(String wordHash, Set<String> urlHashes) throws IOException;
     public void addEntries(indexContainer newEntries) throws IOException;
+    public int sizeEntry(final String key);
     public void clear() throws IOException;
     public void close();
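
The only interface change here is the new sizeEntry(key) method. The diff shows no implementation; judging from the neighbouring methods it presumably reports how many references are stored under a given word hash. A purely hypothetical in-memory sketch of that contract (the backing map, its types, and the semantics are assumptions, not taken from YaCy):

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

// Hypothetical sketch only: word hash -> set of URL hashes.
// Illustrates a plausible contract for sizeEntry(), not the YaCy implementation.
final class InMemoryReferenceIndex {
    private final Map<String, Set<String>> refs = new HashMap<String, Set<String>>();

    public int sizeEntry(final String key) {
        final Set<String> urls = refs.get(key);
        return (urls == null) ? 0 : urls.size();   // 0 if the word hash is unknown
    }
}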

@@ -128,9 +128,15 @@ public class RAMIndex implements ObjectIndex {
 
     public synchronized ArrayList<RowCollection> removeDoubles() {
         // finish initialization phase explicitely
-        if (index1 == null) index1 = new RowSet(rowdef, 0);
         index0.sort();
-        return index0.removeDoubles();
+        if (index1 == null) {
+            return index0.removeDoubles();
+        }
+        index1.sort();
+        ArrayList<RowCollection> d0 = index0.removeDoubles();
+        ArrayList<RowCollection> d1 = index1.removeDoubles();
+        d0.addAll(d1);
+        return d0;
     }
 
     public synchronized Row.Entry remove(final byte[] key) {
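
The behavioural fix: the old code forced index1 into existence and then scanned only index0, so doubles that had already moved into index1 were never reported. The new code sorts and scans both row sets when index1 exists and concatenates the two result lists. A self-contained sketch of that two-stage pattern, with a toy Store standing in for RowSet (it only reports duplicate groups; the real RowSet also removes them):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Toy stand-in for RowSet: sort(), then removeDoubles() reports groups of equal keys.
final class TwoStageSketch {

    static final class Store {
        private final List<String> keys = new ArrayList<String>();
        void add(final String key) { keys.add(key); }
        void sort() { Collections.sort(keys); }
        ArrayList<List<String>> removeDoubles() {
            final ArrayList<List<String>> groups = new ArrayList<List<String>>();
            int i = 0;
            while (i < keys.size()) {
                int j = i;
                while (j < keys.size() && keys.get(j).equals(keys.get(i))) j++;
                if (j - i > 1) groups.add(new ArrayList<String>(keys.subList(i, j))); // a "double"
                i = j;
            }
            return groups;
        }
    }

    // Mirrors the fixed RAMIndex logic: collect doubles from BOTH stages.
    static ArrayList<List<String>> removeDoubles(final Store index0, final Store index1) {
        index0.sort();
        if (index1 == null) return index0.removeDoubles();
        index1.sort();
        final ArrayList<List<String>> d0 = index0.removeDoubles();
        d0.addAll(index1.removeDoubles());
        return d0;
    }
}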

@@ -263,7 +263,7 @@ public class kelondroCollectionIndex {
         Row.Entry maxentry;
         int doublecount = 0;
         ArrayList<RowCollection> doubles = index.removeDoubles();
-        if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles + " doubles in collections, removing them in arrays");
+        if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles.size() + " doubles in collections, removing them in arrays");
         for (final RowCollection doubleset: doubles) {
             // for each entry in doubleset choose one which we want to keep
             maxentry = null;
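
This is purely a log-message fix: concatenating the list itself invokes its toString() and dumps the whole structure into the warning, whereas doubles.size() prints just the count. A minimal demonstration of the difference:

import java.util.ArrayList;
import java.util.Arrays;

// Demonstrates why "+ doubles" produced unreadable log lines.
public final class LogDemo {
    public static void main(String[] args) {
        final ArrayList<ArrayList<String>> doubles = new ArrayList<ArrayList<String>>();
        doubles.add(new ArrayList<String>(Arrays.asList("rowA", "rowA'")));
        doubles.add(new ArrayList<String>(Arrays.asList("rowB", "rowB'")));
        System.out.println("found " + doubles + " doubles");        // dumps every element
        System.out.println("found " + doubles.size() + " doubles"); // prints: found 2 doubles
    }
}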

@ -96,8 +96,10 @@ public class EcoTable implements ObjectIndex {
this.taildef = new Row(cols, NaturalOrder.naturalOrder, -1); this.taildef = new Row(cols, NaturalOrder.naturalOrder, -1);
// initialize table file // initialize table file
boolean freshFile = false;
if (!tablefile.exists()) { if (!tablefile.exists()) {
// make new file // make new file
freshFile = true;
FileOutputStream fos = null; FileOutputStream fos = null;
try { try {
fos = new FileOutputStream(tablefile); fos = new FileOutputStream(tablefile);
@@ -167,43 +169,40 @@ public class EcoTable implements ObjectIndex {
                 }
             }
 
-            // check consistency
-            //System.out.print(" -ordering- ..");
-            //System.out.flush();
+            // open the file
             this.file = new BufferedEcoFS(new EcoFS(tablefile, rowdef.objectsize), this.buffersize);
-            final ArrayList<Integer[]> doubles = index.removeDoubles();
-            //assert index.size() + doubles.size() + fail == i;
-            //System.out.println(" -removed " + doubles.size() + " doubles- done.");
-            if (doubles.size() > 0) {
-                Log.logInfo("ECOTABLE", tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
-                // from all the doubles take one, put it back to the index and remove the others from the file
-                // first put back one element each
-                final byte[] record = new byte[rowdef.objectsize];
-                key = new byte[rowdef.primaryKeyLength];
-                for (final Integer[] ds: doubles) {
-                    file.get(ds[0].intValue(), record, 0);
-                    System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
-                    index.addi(key, ds[0].intValue());
-                }
-                // then remove the other doubles by removing them from the table, but do a re-indexing while doing that
-                // first aggregate all the delete positions because the elements from the top positions must be removed first
-                final TreeSet<Integer> delpos = new TreeSet<Integer>();
-                for (final Integer[] ds: doubles) {
-                    for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
-                }
-                // now remove the entries in a sorted way (top-down)
-                Integer top;
-                while (delpos.size() > 0) {
-                    top = delpos.last();
-                    delpos.remove(top);
-                    removeInFile(top.intValue());
-                }
-            }
-            /* try {
-                assert file.size() == index.size() + doubles.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", doubles.size() = " + doubles.size() + ", fail = " + fail + ", i = " + i;
-            } catch (IOException e) {
-                e.printStackTrace();
-            }*/
+
+            // remove doubles
+            if (!freshFile) {
+                final ArrayList<Integer[]> doubles = index.removeDoubles();
+                //assert index.size() + doubles.size() + fail == i;
+                //System.out.println(" -removed " + doubles.size() + " doubles- done.");
+                if (doubles.size() > 0) {
+                    Log.logInfo("ECOTABLE", tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
+                    // from all the doubles take one, put it back to the index and remove the others from the file
+                    // first put back one element each
+                    final byte[] record = new byte[rowdef.objectsize];
+                    key = new byte[rowdef.primaryKeyLength];
+                    for (final Integer[] ds: doubles) {
+                        file.get(ds[0].intValue(), record, 0);
+                        System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
+                        index.addi(key, ds[0].intValue());
+                    }
+                    // then remove the other doubles by removing them from the table, but do a re-indexing while doing that
+                    // first aggregate all the delete positions because the elements from the top positions must be removed first
+                    final TreeSet<Integer> delpos = new TreeSet<Integer>();
+                    for (final Integer[] ds: doubles) {
+                        for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
+                    }
+                    // now remove the entries in a sorted way (top-down)
+                    Integer top;
+                    while (delpos.size() > 0) {
+                        top = delpos.last();
+                        delpos.remove(top);
+                        removeInFile(top.intValue());
+                    }
+                }
+            }
         } catch (final FileNotFoundException e) {
             // should never happen
             e.printStackTrace();
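
The substance of this hunk is unchanged; it is now merely skipped for freshly created files via the freshFile flag from the hunk above, since an empty new table cannot contain doubles yet. The pattern worth noting is the deletion order: duplicate file positions are gathered in a TreeSet and removed from the highest position downwards, because, as the comment says, the top positions must go first; deleting a lower position first could otherwise invalidate the higher positions still queued. A self-contained sketch of that order, with a plain list standing in for the row file:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;

// Sketch: delete a set of positions from a position-addressed store, highest first,
// so that earlier deletions cannot shift positions that are still pending.
public final class TopDownDelete {
    public static void main(String[] args) {
        final List<String> table = new ArrayList<String>(Arrays.asList("a", "b", "b'", "c", "b''"));
        final TreeSet<Integer> delpos = new TreeSet<Integer>(Arrays.asList(2, 4)); // duplicate rows of "b"
        while (!delpos.isEmpty()) {
            final Integer top = delpos.last();   // always take the highest remaining position
            delpos.remove(top);
            table.remove(top.intValue());        // safe: positions below 'top' are unaffected
        }
        System.out.println(table);               // [a, b, c]
    }
}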
@@ -297,6 +296,7 @@ public class EcoTable implements ObjectIndex {
         Integer L;
         Row.Entry inconsistentEntry;
         // iterate over all entries that have inconsistent index references
+        long lastlog = System.currentTimeMillis();
         for (final Integer[] is: index.removeDoubles()) {
             // 'is' is the set of all indexes, that have the same reference
             // we collect that entries now here
@@ -318,6 +318,9 @@ public class EcoTable implements ObjectIndex {
             s = d.last();
             d.remove(s);
             this.removeInFile(s.intValue());
+            if (System.currentTimeMillis() - lastlog > 30000) {
+                Log.logInfo("EcoTable", "removing " + d.size() + " entries in " + this.filename());
+            }
         }
         assert file.size() == index.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size();
         return report;
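
The added lines give the long-running cleanup a heartbeat: while inconsistent entries are being removed, a status message is emitted at most once per 30 seconds instead of once per entry. Note that the hunk as shown never advances lastlog, so after the first 30 seconds it would log on every further iteration; a generic throttle (not the YaCy code) usually also resets the timestamp, as in this sketch:

// Generic sketch of interval-throttled progress logging (not the YaCy code):
// log at most once per interval and advance the timestamp after each message.
public final class ProgressLogger {
    private static final long INTERVAL_MS = 30000;
    private long lastLog = System.currentTimeMillis();

    void maybeLog(final int remaining, final String target) {
        final long now = System.currentTimeMillis();
        if (now - lastLog > INTERVAL_MS) {
            System.out.println("removing " + remaining + " entries in " + target);
            lastLog = now; // reset, otherwise every later call would log again
        }
    }
}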
