Second part of the 'doubles' (duplicate entries) fix: better handling of doubles in RAMIndex. More logging.

Still missing: deletion of double entries in collections.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5613 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · a9ad863686
parent 59427064fb
commit a9ad863686
4 changed files with 48 additions and 38 deletions
--- a/source/de/anomic/index/indexRI.java
+++ b/source/de/anomic/index/indexRI.java
@@ -44,6 +44,7 @@ public interface indexRI {
    public boolean removeEntry(String wordHash, String urlHash) throws IOException;
    public int removeEntries(String wordHash, Set<String> urlHashes) throws IOException;
    public void addEntries(indexContainer newEntries) throws IOException;
+    public int sizeEntry(final String key);
    public void clear() throws IOException;
    public void close();

--- a/source/de/anomic/kelondro/index/RAMIndex.java
+++ b/source/de/anomic/kelondro/index/RAMIndex.java
@@ -128,9 +128,15 @@ public class RAMIndex implements ObjectIndex {
 	
 	public synchronized ArrayList<RowCollection> removeDoubles() {
 	    // finish initialization phase explicitely
-	    if (index1 == null) index1 = new RowSet(rowdef, 0);
-	    index0.sort();
-	    return index0.removeDoubles();
+        index0.sort();
+	    if (index1 == null) {
+	        return index0.removeDoubles();
+	    }
+        index1.sort();
+        ArrayList<RowCollection> d0 = index0.removeDoubles();
+        ArrayList<RowCollection> d1 = index1.removeDoubles();
+        d0.addAll(d1);
+        return d0;
 	}
 	
    public synchronized Row.Entry remove(final byte[] key) {
--- a/source/de/anomic/kelondro/kelondroCollectionIndex.java
+++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java
@@ -263,7 +263,7 @@ public class kelondroCollectionIndex {
        Row.Entry maxentry;
        int doublecount = 0;
        ArrayList<RowCollection> doubles = index.removeDoubles();
-        if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles + " doubles in collections, removing them in arrays");
+        if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles.size() + " doubles in collections, removing them in arrays");
        for (final RowCollection doubleset: doubles) {
            // for each entry in doubleset choose one which we want to keep
            maxentry = null;
--- a/source/de/anomic/kelondro/table/EcoTable.java
+++ b/source/de/anomic/kelondro/table/EcoTable.java
@@ -96,8 +96,10 @@ public class EcoTable implements ObjectIndex {
        this.taildef = new Row(cols, NaturalOrder.naturalOrder, -1);
        
        // initialize table file
+        boolean freshFile = false;
        if (!tablefile.exists()) {
            // make new file
+            freshFile = true;
            FileOutputStream fos = null;
            try {
                fos = new FileOutputStream(tablefile);
@@ -167,43 +169,40 @@ public class EcoTable implements ObjectIndex {
                }
            }
            
-            // check consistency
-            //System.out.print(" -ordering- ..");
-            //System.out.flush();
+            // open the file
            this.file = new BufferedEcoFS(new EcoFS(tablefile, rowdef.objectsize), this.buffersize);
-            final ArrayList<Integer[]> doubles = index.removeDoubles();
-            //assert index.size() + doubles.size() + fail == i;
-            //System.out.println(" -removed " + doubles.size() + " doubles- done.");
-            if (doubles.size() > 0) {
-                Log.logInfo("ECOTABLE", tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
-                // from all the doubles take one, put it back to the index and remove the others from the file
-                // first put back one element each
-                final byte[] record = new byte[rowdef.objectsize];
-                key = new byte[rowdef.primaryKeyLength];
-                for (final Integer[] ds: doubles) {
-                    file.get(ds[0].intValue(), record, 0);
-                    System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
-                    index.addi(key, ds[0].intValue());
-                }
-                // then remove the other doubles by removing them from the table, but do a re-indexing while doing that
-                // first aggregate all the delete positions because the elements from the top positions must be removed first
-                final TreeSet<Integer> delpos = new TreeSet<Integer>();
-                for (final Integer[] ds: doubles) {
-                    for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
-                }
-                // now remove the entries in a sorted way (top-down)
-                Integer top;
-                while (delpos.size() > 0) {
-                    top = delpos.last();
-                    delpos.remove(top);
-                    removeInFile(top.intValue());
+ 
+            // remove doubles
+            if (!freshFile) {
+                final ArrayList<Integer[]> doubles = index.removeDoubles();
+                //assert index.size() + doubles.size() + fail == i;
+                //System.out.println(" -removed " + doubles.size() + " doubles- done.");
+                if (doubles.size() > 0) {
+                    Log.logInfo("ECOTABLE", tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
+                    // from all the doubles take one, put it back to the index and remove the others from the file
+                    // first put back one element each
+                    final byte[] record = new byte[rowdef.objectsize];
+                    key = new byte[rowdef.primaryKeyLength];
+                    for (final Integer[] ds: doubles) {
+                        file.get(ds[0].intValue(), record, 0);
+                        System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
+                        index.addi(key, ds[0].intValue());
+                    }
+                    // then remove the other doubles by removing them from the table, but do a re-indexing while doing that
+                    // first aggregate all the delete positions because the elements from the top positions must be removed first
+                    final TreeSet<Integer> delpos = new TreeSet<Integer>();
+                    for (final Integer[] ds: doubles) {
+                        for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
+                    }
+                    // now remove the entries in a sorted way (top-down)
+                    Integer top;
+                    while (delpos.size() > 0) {
+                        top = delpos.last();
+                        delpos.remove(top);
+                        removeInFile(top.intValue());
+                    }
                }
            }
-            /* try {
-                assert file.size() == index.size() + doubles.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", doubles.size() = " + doubles.size() + ", fail = " + fail + ", i = " + i;
-            } catch (IOException e) {
-                e.printStackTrace();
-            }*/
        } catch (final FileNotFoundException e) {
            // should never happen
            e.printStackTrace();
@@ -297,6 +296,7 @@ public class EcoTable implements ObjectIndex {
        Integer L;
        Row.Entry inconsistentEntry;
        // iterate over all entries that have inconsistent index references
+        long lastlog = System.currentTimeMillis();
        for (final Integer[] is: index.removeDoubles()) {
            // 'is' is the set of all indexes, that have the same reference
            // we collect that entries now here
@@ -318,6 +318,9 @@ public class EcoTable implements ObjectIndex {
            s = d.last();
            d.remove(s);
            this.removeInFile(s.intValue());
+            if (System.currentTimeMillis() - lastlog > 30000) {
+                Log.logInfo("EcoTable", "removing " + d.size() + " entries in " + this.filename());
+            }
        }
        assert file.size() == index.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size();
        return report;