Second part of the 'doubles' fix: better handling of doubles (duplicate entries) in RAMIndex, plus more logging.

Still missing: deletion of double (duplicate) entries in collections.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5613 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 59427064fb
commit a9ad863686

@ -44,6 +44,7 @@ public interface indexRI {
public boolean removeEntry(String wordHash, String urlHash) throws IOException; public boolean removeEntry(String wordHash, String urlHash) throws IOException;
public int removeEntries(String wordHash, Set<String> urlHashes) throws IOException; public int removeEntries(String wordHash, Set<String> urlHashes) throws IOException;
public void addEntries(indexContainer newEntries) throws IOException; public void addEntries(indexContainer newEntries) throws IOException;
public int sizeEntry(final String key);
public void clear() throws IOException; public void clear() throws IOException;
public void close(); public void close();

@ -128,10 +128,16 @@ public class RAMIndex implements ObjectIndex {
public synchronized ArrayList<RowCollection> removeDoubles() { public synchronized ArrayList<RowCollection> removeDoubles() {
// finish initialization phase explicitely // finish initialization phase explicitely
if (index1 == null) index1 = new RowSet(rowdef, 0);
index0.sort(); index0.sort();
if (index1 == null) {
return index0.removeDoubles(); return index0.removeDoubles();
} }
index1.sort();
ArrayList<RowCollection> d0 = index0.removeDoubles();
ArrayList<RowCollection> d1 = index1.removeDoubles();
d0.addAll(d1);
return d0;
}
public synchronized Row.Entry remove(final byte[] key) { public synchronized Row.Entry remove(final byte[] key) {
finishInitialization(); finishInitialization();

@ -263,7 +263,7 @@ public class kelondroCollectionIndex {
Row.Entry maxentry; Row.Entry maxentry;
int doublecount = 0; int doublecount = 0;
ArrayList<RowCollection> doubles = index.removeDoubles(); ArrayList<RowCollection> doubles = index.removeDoubles();
if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles + " doubles in collections, removing them in arrays"); if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles.size() + " doubles in collections, removing them in arrays");
for (final RowCollection doubleset: doubles) { for (final RowCollection doubleset: doubles) {
// for each entry in doubleset choose one which we want to keep // for each entry in doubleset choose one which we want to keep
maxentry = null; maxentry = null;

@ -96,8 +96,10 @@ public class EcoTable implements ObjectIndex {
this.taildef = new Row(cols, NaturalOrder.naturalOrder, -1); this.taildef = new Row(cols, NaturalOrder.naturalOrder, -1);
// initialize table file // initialize table file
boolean freshFile = false;
if (!tablefile.exists()) { if (!tablefile.exists()) {
// make new file // make new file
freshFile = true;
FileOutputStream fos = null; FileOutputStream fos = null;
try { try {
fos = new FileOutputStream(tablefile); fos = new FileOutputStream(tablefile);
@ -167,10 +169,11 @@ public class EcoTable implements ObjectIndex {
} }
} }
// check consistency // open the file
//System.out.print(" -ordering- ..");
//System.out.flush();
this.file = new BufferedEcoFS(new EcoFS(tablefile, rowdef.objectsize), this.buffersize); this.file = new BufferedEcoFS(new EcoFS(tablefile, rowdef.objectsize), this.buffersize);
// remove doubles
if (!freshFile) {
final ArrayList<Integer[]> doubles = index.removeDoubles(); final ArrayList<Integer[]> doubles = index.removeDoubles();
//assert index.size() + doubles.size() + fail == i; //assert index.size() + doubles.size() + fail == i;
//System.out.println(" -removed " + doubles.size() + " doubles- done."); //System.out.println(" -removed " + doubles.size() + " doubles- done.");
@ -199,11 +202,7 @@ public class EcoTable implements ObjectIndex {
removeInFile(top.intValue()); removeInFile(top.intValue());
} }
} }
/* try { }
assert file.size() == index.size() + doubles.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", doubles.size() = " + doubles.size() + ", fail = " + fail + ", i = " + i;
} catch (IOException e) {
e.printStackTrace();
}*/
} catch (final FileNotFoundException e) { } catch (final FileNotFoundException e) {
// should never happen // should never happen
e.printStackTrace(); e.printStackTrace();
@ -297,6 +296,7 @@ public class EcoTable implements ObjectIndex {
Integer L; Integer L;
Row.Entry inconsistentEntry; Row.Entry inconsistentEntry;
// iterate over all entries that have inconsistent index references // iterate over all entries that have inconsistent index references
long lastlog = System.currentTimeMillis();
for (final Integer[] is: index.removeDoubles()) { for (final Integer[] is: index.removeDoubles()) {
// 'is' is the set of all indexes, that have the same reference // 'is' is the set of all indexes, that have the same reference
// we collect that entries now here // we collect that entries now here
@ -318,6 +318,9 @@ public class EcoTable implements ObjectIndex {
s = d.last(); s = d.last();
d.remove(s); d.remove(s);
this.removeInFile(s.intValue()); this.removeInFile(s.intValue());
if (System.currentTimeMillis() - lastlog > 30000) {
Log.logInfo("EcoTable", "removing " + d.size() + " entries in " + this.filename());
}
} }
assert file.size() == index.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size(); assert file.size() == index.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size();
return report; return report;

Loading…
Cancel
Save