second part of 'doubles' fix - better handling of doubles in RAMIndex. More logging.

still missing: deletion of double entries in collections

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5613 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 59427064fb
commit a9ad863686

@ -44,6 +44,7 @@ public interface indexRI {
public boolean removeEntry(String wordHash, String urlHash) throws IOException;
public int removeEntries(String wordHash, Set<String> urlHashes) throws IOException;
public void addEntries(indexContainer newEntries) throws IOException;
public int sizeEntry(final String key);
public void clear() throws IOException;
public void close();

@ -128,9 +128,15 @@ public class RAMIndex implements ObjectIndex {
public synchronized ArrayList<RowCollection> removeDoubles() {
// finish initialization phase explicitely
if (index1 == null) index1 = new RowSet(rowdef, 0);
index0.sort();
return index0.removeDoubles();
index0.sort();
if (index1 == null) {
return index0.removeDoubles();
}
index1.sort();
ArrayList<RowCollection> d0 = index0.removeDoubles();
ArrayList<RowCollection> d1 = index1.removeDoubles();
d0.addAll(d1);
return d0;
}
public synchronized Row.Entry remove(final byte[] key) {

@ -263,7 +263,7 @@ public class kelondroCollectionIndex {
Row.Entry maxentry;
int doublecount = 0;
ArrayList<RowCollection> doubles = index.removeDoubles();
if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles + " doubles in collections, removing them in arrays");
if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles.size() + " doubles in collections, removing them in arrays");
for (final RowCollection doubleset: doubles) {
// for each entry in doubleset choose one which we want to keep
maxentry = null;

@ -96,8 +96,10 @@ public class EcoTable implements ObjectIndex {
this.taildef = new Row(cols, NaturalOrder.naturalOrder, -1);
// initialize table file
boolean freshFile = false;
if (!tablefile.exists()) {
// make new file
freshFile = true;
FileOutputStream fos = null;
try {
fos = new FileOutputStream(tablefile);
@ -167,43 +169,40 @@ public class EcoTable implements ObjectIndex {
}
}
// check consistency
//System.out.print(" -ordering- ..");
//System.out.flush();
// open the file
this.file = new BufferedEcoFS(new EcoFS(tablefile, rowdef.objectsize), this.buffersize);
final ArrayList<Integer[]> doubles = index.removeDoubles();
//assert index.size() + doubles.size() + fail == i;
//System.out.println(" -removed " + doubles.size() + " doubles- done.");
if (doubles.size() > 0) {
Log.logInfo("ECOTABLE", tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
// from all the doubles take one, put it back to the index and remove the others from the file
// first put back one element each
final byte[] record = new byte[rowdef.objectsize];
key = new byte[rowdef.primaryKeyLength];
for (final Integer[] ds: doubles) {
file.get(ds[0].intValue(), record, 0);
System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
index.addi(key, ds[0].intValue());
}
// then remove the other doubles by removing them from the table, but do a re-indexing while doing that
// first aggregate all the delete positions because the elements from the top positions must be removed first
final TreeSet<Integer> delpos = new TreeSet<Integer>();
for (final Integer[] ds: doubles) {
for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
}
// now remove the entries in a sorted way (top-down)
Integer top;
while (delpos.size() > 0) {
top = delpos.last();
delpos.remove(top);
removeInFile(top.intValue());
// remove doubles
if (!freshFile) {
final ArrayList<Integer[]> doubles = index.removeDoubles();
//assert index.size() + doubles.size() + fail == i;
//System.out.println(" -removed " + doubles.size() + " doubles- done.");
if (doubles.size() > 0) {
Log.logInfo("ECOTABLE", tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
// from all the doubles take one, put it back to the index and remove the others from the file
// first put back one element each
final byte[] record = new byte[rowdef.objectsize];
key = new byte[rowdef.primaryKeyLength];
for (final Integer[] ds: doubles) {
file.get(ds[0].intValue(), record, 0);
System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
index.addi(key, ds[0].intValue());
}
// then remove the other doubles by removing them from the table, but do a re-indexing while doing that
// first aggregate all the delete positions because the elements from the top positions must be removed first
final TreeSet<Integer> delpos = new TreeSet<Integer>();
for (final Integer[] ds: doubles) {
for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
}
// now remove the entries in a sorted way (top-down)
Integer top;
while (delpos.size() > 0) {
top = delpos.last();
delpos.remove(top);
removeInFile(top.intValue());
}
}
}
/* try {
assert file.size() == index.size() + doubles.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", doubles.size() = " + doubles.size() + ", fail = " + fail + ", i = " + i;
} catch (IOException e) {
e.printStackTrace();
}*/
} catch (final FileNotFoundException e) {
// should never happen
e.printStackTrace();
@ -297,6 +296,7 @@ public class EcoTable implements ObjectIndex {
Integer L;
Row.Entry inconsistentEntry;
// iterate over all entries that have inconsistent index references
long lastlog = System.currentTimeMillis();
for (final Integer[] is: index.removeDoubles()) {
// 'is' is the set of all indexes, that have the same reference
// we collect that entries now here
@ -318,6 +318,9 @@ public class EcoTable implements ObjectIndex {
s = d.last();
d.remove(s);
this.removeInFile(s.intValue());
if (System.currentTimeMillis() - lastlog > 30000) {
Log.logInfo("EcoTable", "removing " + d.size() + " entries in " + this.filename());
}
}
assert file.size() == index.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size();
return report;

Loading…
Cancel
Save