try to fix some performance problems with the internal index management:

- ensuring that ordered indexes stay ordered during remove
- no unnecessary ordering checks
- better test logic in crawl stacker

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5457 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · 4d5b401f00
parent 4641ecd6d9
commit 4d5b401f00
5 changed files with 39 additions and 31 deletions
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@ -255,24 +255,26 @@ public final class CrawlStacker {

        // check if the url is double registered
        final String dbocc = nextQueue.urlExists(entry.url().hash());
-        final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0);
-        final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
-        // do double-check
-        if ((dbocc != null) && (!recrawl)) {
-            reason = "double " + dbocc + ")";
-            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
-            return reason;
-        }
-        if ((oldEntry != null) && (!recrawl)) {
-            reason = "double " + "LURL)";
-            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
-            return reason;
-        }
-
-        // show potential re-crawl
-        if (recrawl && oldEntry != null) {
-            if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
-                    ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
+        if (dbocc != null || wordIndex.existsURL(entry.url().hash())) {
+            final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0);
+            final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
+            // do double-check
+            if ((dbocc != null) && (!recrawl)) {
+                reason = "double " + dbocc + ")";
+                if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
+                return reason;
+            }
+            if ((oldEntry != null) && (!recrawl)) {
+                reason = "double " + "LURL)";
+                if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
+                return reason;
+            }
+    
+            // show potential re-crawl
+            if (recrawl && oldEntry != null) {
+                if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
+                        ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
+            }
        }
        
        // store information
--- a/source/de/anomic/index/indexRepositoryReference.java
+++ b/source/de/anomic/index/indexRepositoryReference.java
@ -146,7 +146,7 @@ public final class indexRepositoryReference {
        }
    }

-    public synchronized boolean exists(final String urlHash) {
+    public boolean exists(final String urlHash) {
        if (urlIndexFile == null) return false; // case may happen during shutdown
        return urlIndexFile.has(urlHash.getBytes());
    }
--- a/source/de/anomic/kelondro/kelondroRAMIndex.java
+++ b/source/de/anomic/kelondro/kelondroRAMIndex.java
@ -69,6 +69,7 @@ public class kelondroRAMIndex implements kelondroIndex {
    public synchronized kelondroRow.Entry get(final byte[] key) {
        assert (key != null);
        finishInitialization();
+        assert index0.isSorted();
        final kelondroRow.Entry indexentry = index0.get(key);
        if (indexentry != null) return indexentry;
        return index1.get(key);
@ -77,6 +78,7 @@ public class kelondroRAMIndex implements kelondroIndex {
 	public boolean has(final byte[] key) {
 		assert (key != null);
        finishInitialization();
+        assert index0.isSorted();
        if (index0.has(key)) return true;
        return index1.has(key);
 	}
@ -85,9 +87,10 @@ public class kelondroRAMIndex implements kelondroIndex {
    	assert (entry != null);
    	finishInitialization();
        // if the new entry is within the initialization part, just overwrite it
-        final kelondroRow.Entry indexentry = index0.get(entry.getPrimaryKeyBytes());
+    	assert index0.isSorted();
+        final kelondroRow.Entry indexentry = index0.remove(entry.getPrimaryKeyBytes()); // keeps ordering
        if (indexentry != null) {
-        	index0.put(entry);
+            index1.put(entry);
            return indexentry;
        }
        // else place it in the index1
--- a/source/de/anomic/kelondro/kelondroRowCollection.java
+++ b/source/de/anomic/kelondro/kelondroRowCollection.java
@ -403,9 +403,9 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
        return this.sortBound;
    }
    
-    public synchronized Iterator<byte[]> keys() {
+    public synchronized Iterator<byte[]> keys(boolean keepOrderWhenRemoving) {
        // iterates byte[] - type entries
-        return new keyIterator();
+        return new keyIterator(keepOrderWhenRemoving);
    }
    
    /**
@ -417,9 +417,11 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
    public class keyIterator implements Iterator<byte[]> {

        private int p;
+        private boolean keepOrderWhenRemoving;
        
-        public keyIterator() {
-            p = 0;
+        public keyIterator(boolean keepOrderWhenRemoving) {
+            this.p = 0;
+            this.keepOrderWhenRemoving = keepOrderWhenRemoving;
        }
        
        public boolean hasNext() {
@ -432,7 +434,7 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
        
        public void remove() {
            p--;
-            removeRow(p, false);
+            removeRow(p, keepOrderWhenRemoving);
        }
    }    

@ -446,9 +448,8 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
    
    /**
     * Iterator for kelondroRowCollection.
-     * It supports remove() though it doesn't contain the order of the underlying
+     * It supports remove() and keeps the order of the underlying
     * collection during removes.
-     *
     */
    public class rowIterator implements Iterator<kelondroRow.Entry> {

@ -468,7 +469,7 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
        
        public void remove() {
            p--;
-            removeRow(p, false);
+            removeRow(p, true);
        }

    }
@ -828,6 +829,7 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
        assert (this.rowdef.objectOrder != null);
        if (chunkcount <= 1) return true;
        if (chunkcount != this.sortBound) return false;
+        /*
        for (int i = 0; i < chunkcount - 1; i++) {
        	//System.out.println("*" + new String(get(i).getColBytes(0)));
        	if (compare(i, i + 1) > 0) {
@ -835,6 +837,7 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
        		return false;
        	}
        }
+        */
        return true;
    }
    
--- a/source/de/anomic/kelondro/kelondroRowSet.java
+++ b/source/de/anomic/kelondro/kelondroRowSet.java
@ -145,7 +145,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
        final int index = find(a, start, length);
        if (index < 0) return null;
        final kelondroRow.Entry entry = super.get(index, true);
-        super.removeRow(index, true);
+        super.removeRow(index, true); // keep order of collection!
        int findagainindex = 0;
        assert (findagainindex = find(a, start, length)) < 0 : "remove: chunk found again at index position (after  remove) " + findagainindex + ", index(before) = " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize * findagainindex, length) + ", searchkey=" + serverLog.arrayList(a, start, length); // check if the remove worked
        return entry;
@ -268,7 +268,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
    
    public synchronized Iterator<byte[]> keys() {
        sort();
-        return super.keys();
+        return super.keys(true);
    }
    
    public synchronized kelondroCloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {