From 93ea0a47892b6933765f99c1e645503288efa20e Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Tue, 20 Apr 2010 13:45:22 +0000
Subject: [PATCH] enhanced remove operation in search consequences (which are
 triggered when the snippet fetch proves that the word has disappeared from
 the page that was stored in the index) - no direct deletion of references
 during search (shifted to time after search) - bundling of all deletions for
 the references of a single word into one remove operation - enhanced remove
 operation by caring that the collection is stored sorted (experimental) -
 more String -> byte[] transition for search word lists - clean up of unused
 code - enhanced memory allocation of RowSet Objects (will use a little bit
 less memory which was wasted before)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6823 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/search/ResultFetcher.java    | 30 +++++++++----------
 source/de/anomic/search/SearchEvent.java      |  5 ++--
 source/de/anomic/search/SearchEventCache.java |  2 +-
 source/de/anomic/search/TextSnippet.java      | 25 ----------------
 source/net/yacy/kelondro/blob/ArrayStack.java |  1 +
 source/net/yacy/kelondro/index/HandleSet.java |  4 +++
 .../yacy/kelondro/index/RowCollection.java    | 13 ++++++--
 source/net/yacy/kelondro/index/RowSet.java    |  2 +-
 source/net/yacy/kelondro/rwi/IndexCell.java   | 17 +----------
 .../yacy/kelondro/rwi/ReferenceContainer.java |  7 -----
 .../kelondro/rwi/ReferenceContainerCache.java | 22 +-------------
 11 files changed, 36 insertions(+), 92 deletions(-)
diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java
index 4ec99b0e3..9302ff26a 100644
--- a/source/de/anomic/search/ResultFetcher.java
+++ b/source/de/anomic/search/ResultFetcher.java
@@ -61,7 +61,7 @@ public class ResultFetcher {
     protected       Worker[]                workerThreads;
     protected final SortStore<ResultEntry>  result;
     protected final SortStore<MediaSnippet> images; // container to sort images by size
-    protected final HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
+    protected final HandleSet               failedURLs; // a set of urlhashes that could not be verified during search
     protected final HandleSet               snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
     long urlRetrievalAllTime;
     long snippetComputationAllTime;
@@ -84,7 +84,7 @@ public class ResultFetcher {
         this.snippetComputationAllTime = 0;
         this.result = new SortStore<ResultEntry>(-1, true); // this is the result, enriched with snippets, ranked and ordered by ranking
         this.images = new SortStore<MediaSnippet>(-1, true);
-        this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
+        this.failedURLs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes that a worker thread tried to work on, but failed.
         
         // snippets do not need to match with the complete query hashes,
         // only with the query minus the stopwords which had not been used for the search
@@ -167,7 +167,7 @@ public class ResultFetcher {
                     // get next entry
                     page = rankedCache.takeURL(true, taketimeout);
                     if (page == null) break;
-                    if (failedURLs.get(new String(page.hash())) != null) continue;
+                    if (failedURLs.has(page.hash())) continue;
                     
                     final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0
 
@@ -230,7 +230,7 @@ public class ResultFetcher {
                     (snippetMode == 2) ? Integer.MAX_VALUE : 30000,
                     query.isGlobal());
             final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
+            Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
             
             if (snippet.getErrorCode() < 11) {
                 // we loaded the file and found the snippet
@@ -241,13 +241,7 @@ public class ResultFetcher {
                 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
             } else {
                 // problems with snippet fetch
-                registerFailure(new String(page.hash()), "no text snippet for URL " + metadata.url());
-                if (!peers.mySeed().isVirgin())
-                    try {
-                        TextSnippet.failConsequences(query.getSegment(), page.word(), snippet, query.id(false));
-                    } catch (IOException e) {
-                        Log.logException(e);
-                    }
+                registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
                 return null;
             }
         } else {
@@ -255,7 +249,7 @@ public class ResultFetcher {
             startTime = System.currentTimeMillis();
             final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
             final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
+            Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
             
             if (mediaSnippets != null && !mediaSnippets.isEmpty()) {
                 // found media snippets, return entry
@@ -264,16 +258,20 @@ public class ResultFetcher {
                 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
             } else {
                 // problems with snippet fetch
-                registerFailure(new String(page.hash()), "no media snippet for URL " + metadata.url());
+                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
                 return null;
             }
         }
         // finished, no more actions possible here
     }
     
-    private void registerFailure(final String urlhash, final String reason) {
-        this.failedURLs.put(urlhash, reason);
-        Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
+    private void registerFailure(final byte[] urlhash, final String reason) {
+        try {
+            this.failedURLs.put(urlhash);
+        } catch (RowSpaceExceededException e) {
+            Log.logException(e);
+        }
+        Log.logInfo("SEARCH", "sorted out urlhash " + new String(urlhash) + " during search: " + reason);
     }
     
     public int resultCount() {
diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java
index 16c2f82ae..c59f1449e 100644
--- a/source/de/anomic/search/SearchEvent.java
+++ b/source/de/anomic/search/SearchEvent.java
@@ -223,6 +223,7 @@ public final class SearchEvent {
        // execute deletion of failed words
        int rw = this.results.failedURLs.size();
        if (rw > 0) {
+           long start = System.currentTimeMillis();
            final HandleSet removeWords = query.queryHashes;
            try {
                removeWords.putAll(query.excludeHashes);
@@ -233,12 +234,12 @@ public final class SearchEvent {
                final Iterator<byte[]> j = removeWords.iterator();
                // remove the same url hashes for multiple words
                while (j.hasNext()) {
-                   this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs.keySet());
+                   this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs);
                }                    
            } catch (IOException e) {
                Log.logException(e);
            }
-           Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words");
+           Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words in " + (System.currentTimeMillis() - start) + " milliseconds");
        }
    }
    
diff --git a/source/de/anomic/search/SearchEventCache.java b/source/de/anomic/search/SearchEventCache.java
index 3f5cbe951..dbd51edda 100644
--- a/source/de/anomic/search/SearchEventCache.java
+++ b/source/de/anomic/search/SearchEventCache.java
@@ -53,7 +53,7 @@ public class SearchEventCache {
         SearchEvent event;
         while (i.hasNext()) {
             event = i.next();
-            if ((all) || (event.getEventTime() + eventLifetime < System.currentTimeMillis())) {
+            if (all || event.getEventTime() + eventLifetime < System.currentTimeMillis()) {
                 event.cleanup();
                 
                 // remove the event
diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java
index 048994aa7..bef83f455 100644
--- a/source/de/anomic/search/TextSnippet.java
+++ b/source/de/anomic/search/TextSnippet.java
@@ -583,29 +583,4 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         }
     }
     
-    public static String failConsequences(Segment indexSegment, final WordReferenceVars word, final TextSnippet snippet, final String eventID) throws IOException {
-        // problems with snippet fetch
-        final byte[] urlHash = snippet.getUrl().hash();
-        final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' ');
-        if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
-            (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
-            (snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
-            (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
-            Log.logInfo("TextSnippet", "error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-            indexSegment.urlMetadata().remove(urlHash);
-            final SearchEvent event = SearchEventCache.getEvent(eventID);
-            assert indexSegment != null;
-            assert event != null : "eventID = " + eventID;
-            assert event.getQuery() != null;
-            indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash);
-            event.remove(word);
-        }
-        if (snippet.getErrorCode() == ERROR_NO_MATCH) {
-            Log.logInfo("TextSnippet", "error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-            indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash);
-            SearchEventCache.getEvent(eventID).remove(word);
-        }
-        return snippet.getError();
-    }
-    
 }
diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java
index 0b564ba10..4f9ad54ca 100755
--- a/source/net/yacy/kelondro/blob/ArrayStack.java
+++ b/source/net/yacy/kelondro/blob/ArrayStack.java
@@ -412,6 +412,7 @@ public class ArrayStack implements BLOB {
         File location;
         BLOB blob;
         public blobItem(Date creation, File location, BLOB blob) {
+            assert blob != null;
             this.creation = creation;
             this.location = location;
             this.blob = blob;
diff --git a/source/net/yacy/kelondro/index/HandleSet.java b/source/net/yacy/kelondro/index/HandleSet.java
index f3fc9e369..2046a1693 100644
--- a/source/net/yacy/kelondro/index/HandleSet.java
+++ b/source/net/yacy/kelondro/index/HandleSet.java
@@ -192,6 +192,10 @@ public final class HandleSet implements Iterable<byte[]>, Cloneable {
         index = null;
     }
     
+    public final String toString() { // textual form of the underlying index
+        return String.valueOf(this.index); // null-safe: index is reset to null elsewhere in this class, so avoid an NPE
+    }
+    
     // set tools
     
     public HandleSet joinConstructive(final HandleSet other) throws RowSpaceExceededException {
diff --git a/source/net/yacy/kelondro/index/RowCollection.java b/source/net/yacy/kelondro/index/RowCollection.java
index 20370095e..45d4ceb91 100644
--- a/source/net/yacy/kelondro/index/RowCollection.java
+++ b/source/net/yacy/kelondro/index/RowCollection.java
@@ -199,6 +199,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
     public synchronized byte[] exportCollection() {
         // returns null if the collection is empty
         trim(false);
+        sort(); // experimental; supervise CPU load
+        assert this.sortBound == this.chunkcount; // after sort() the whole collection must be sorted
         assert this.size() * this.rowdef.objectsize == this.chunkcache.length : "this.size() = " + this.size() + ", objectsize = " + this.rowdef.objectsize + ", chunkcache.length = " + this.chunkcache.length;
         final Row row = exportRow(chunkcache.length);
         final Row.Entry entry = row.newEntry();
@@ -227,9 +229,11 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         if (chunkcache.length >= needed) return 0;
         assert needed > 0 : "needed = " + needed;
         long allocram = needed * growfactorLarge100 / 100L;
+        allocram -= allocram % rowdef.objectsize;
         assert allocram > 0 : "elements = " + elements + ", new = " + allocram;
         if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, false)) return allocram;
         allocram = needed * growfactorSmall100 / 100L;
+        allocram -= allocram % rowdef.objectsize;
         assert allocram > 0 : "elements = " + elements + ", new = " + allocram;
         if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, forcegc)) return allocram;
         return needed;
@@ -239,7 +243,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         if (elements == 0) return;
         final long allocram = neededSpaceForEnsuredSize(elements, true);
         if (allocram == 0) return;
-        assert allocram > chunkcache.length : "wrong alloc computation: allocram = " + allocram + ", chunkcache.length = " + chunkcache.length;
+        assert chunkcache.length < elements * rowdef.objectsize : "wrong alloc computation (1): elements * rowdef.objectsize = " + (elements * rowdef.objectsize) + ", chunkcache.length = " + chunkcache.length;
+        assert allocram > chunkcache.length : "wrong alloc computation (2): allocram = " + allocram + ", chunkcache.length = " + chunkcache.length;
         if (allocram > Integer.MAX_VALUE || !MemoryControl.request(allocram, true))
         	throw new RowSpaceExceededException(allocram, "RowCollection grow");
         try {
@@ -564,7 +569,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
 
     }
     
-    protected synchronized final void sort() {
+    public synchronized final void sort() {
         assert (this.rowdef.objectOrder != null);
         if (this.sortBound == this.chunkcount) return; // this is already sorted
         if (this.chunkcount < isortlimit) {
@@ -609,6 +614,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         //assert this.isSorted();
     }
 
+    /*
     public synchronized final void sort2() {
         assert (this.rowdef.objectOrder != null);
         if (this.sortBound == this.chunkcount) return; // this is already sorted
@@ -643,7 +649,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         this.sortBound = this.chunkcount;
         //assert this.isSorted();
     }
-
+    */
+    
     private static class qsortthread implements Callable<Object> {
         private RowCollection rc;
         int L, R, S;
diff --git a/source/net/yacy/kelondro/index/RowSet.java b/source/net/yacy/kelondro/index/RowSet.java
index 3100513a3..2af40a400 100644
--- a/source/net/yacy/kelondro/index/RowSet.java
+++ b/source/net/yacy/kelondro/index/RowSet.java
@@ -75,7 +75,7 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
         if (size < 0) return new RowSet(rowdef);
         final int orderbound = (int) NaturalOrder.decodeLong(b, 10, 4);
         assert orderbound >= 0 : "orderbound = " + orderbound;
-        if (orderbound < 0) return new RowSet(rowdef);
+        if (orderbound < 0) return new RowSet(rowdef); // error
         final byte[] chunkcache = new byte[size * rowdef.objectsize];
         //assert b.length - exportOverheadSize == size * rowdef.objectsize : "b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize;
         if (b.length - exportOverheadSize != size * rowdef.objectsize) {
diff --git a/source/net/yacy/kelondro/rwi/IndexCell.java b/source/net/yacy/kelondro/rwi/IndexCell.java
index f797eaefa..b98c320ba 100644
--- a/source/net/yacy/kelondro/rwi/IndexCell.java
+++ b/source/net/yacy/kelondro/rwi/IndexCell.java
@@ -256,12 +256,6 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
         return removed + (reduced / this.array.rowdef().objectsize);
     }
 
-    public int remove(byte[] termHash, Set<String> urlHashes) throws IOException {
-        int removed = this.ram.remove(termHash, urlHashes);
-        int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashes));
-        return removed + (reduced / this.array.rowdef().objectsize);
-    }
-
     public boolean remove(byte[] termHash, byte[] urlHashBytes) throws IOException {
         boolean removed = this.ram.remove(termHash, urlHashBytes);
         int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashBytes));
@@ -276,16 +270,6 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
             this.urlHashes = urlHashes;
         }
         
-        public RemoveRewriter(Set<String> urlHashes) {
-            this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
-            for (String s: urlHashes)
-                try {
-                    this.urlHashes.put(s.getBytes());
-                } catch (RowSpaceExceededException e) {
-                    Log.logException(e);
-                }
-        }
-        
         public RemoveRewriter(byte[] urlHashBytes) {
             this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
             try {
@@ -296,6 +280,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
         }
         
         public ReferenceContainer<ReferenceType> rewrite(ReferenceContainer<ReferenceType> container) {
+            container.sort();
             container.removeEntries(urlHashes);
             return container;
         }
diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainer.java b/source/net/yacy/kelondro/rwi/ReferenceContainer.java
index 63612974f..456600053 100644
--- a/source/net/yacy/kelondro/rwi/ReferenceContainer.java
+++ b/source/net/yacy/kelondro/rwi/ReferenceContainer.java
@@ -188,13 +188,6 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
         return count;
     }
 
-    public int removeEntries(final Set<String> urlHashes) {
-        int count = 0;
-        final Iterator<String> i = urlHashes.iterator();
-        while (i.hasNext()) count += (remove(i.next().getBytes()) == null) ? 0 : 1;
-        return count;
-    }
-
     public Iterator<ReferenceType> entries() {
         // returns an iterator of indexRWIEntry objects
         return new entryIterator();
diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java
index 07024acf0..e2de222f5 100644
--- a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java
+++ b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java
@@ -366,27 +366,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
         }
         return 0;
     }
- 
-    public int remove(final byte[] termHash, final Set<String> urlHashes) {
-        assert this.cache != null;
-        if (urlHashes.isEmpty()) return 0;
-        ByteArray tha = new ByteArray(termHash);
-        int count;
-        synchronized (cache) {
-            final ReferenceContainer<ReferenceType> c = cache.get(tha);
-            if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) {
-                // removal successful
-                if (c.isEmpty()) {
-                    delete(termHash);
-                } else {
-                    cache.put(tha, c);
-                }
-                return count;
-            }
-        }
-        return 0;
-    }
- 
+
     public void add(final ReferenceContainer<ReferenceType> container) throws RowSpaceExceededException {
         // this puts the entries into the cache
     	assert this.cache != null;