enhanced remove operations in search consequences (triggered when a snippet fetch proves that a word has disappeared from the page that was stored in the index)

- no direct deletion of references during search (deferred until after the search; see the first sketch below)
- bundling of all deletions for the references of a single word into one remove operation
- enhanced remove operation that relies on the collection being stored sorted (experimental; see the second sketch below)
- more String -> byte[] transitions for search word lists
- cleanup of unused code
- enhanced memory allocation of RowSet objects (uses slightly less memory, which was wasted before; see the third sketch below)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6823 6c8d7289-2bf4-0310-a012-ef5d649a1542
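
The deferred, bundled deletion can be pictured with the following minimal sketch. The types are hypothetical stand-ins: the real code collects byte[] url hashes in a HandleSet and calls Segment.termIndex() for the removal, as the diff below shows.

```java
import java.util.Collection;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

// Hypothetical sketch of the deferred, bundled deletion described above.
// During the search, hashes of URLs that fail snippet verification are only
// collected; after the search, each query word gets exactly one bulk remove
// call instead of one index access per failed URL.
public class DeferredRemovalSketch {

    // stand-in for YaCy's HandleSet (the real code stores byte[] url hashes)
    private final SortedSet<String> failedURLs = new TreeSet<String>();

    // assumed interface; the real code calls Segment.termIndex().remove(...)
    public interface TermIndex {
        void remove(String wordHash, Set<String> urlHashes);
    }

    // called from the result fetcher; note: no index access happens here
    public void registerFailure(final String urlHash, final String reason) {
        failedURLs.add(urlHash);
    }

    // called once when the search event is cleaned up
    public void cleanup(final Collection<String> queryWordHashes, final TermIndex index) {
        if (failedURLs.isEmpty()) return;
        for (final String wordHash : queryWordHashes) {
            index.remove(wordHash, failedURLs); // one bundled remove per word
        }
    }
}
```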
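The second sketch shows one way a sorted store pays off for batch removal. The diff only shows that container.sort() is now called before removeEntries; the actual removal code is not part of this excerpt, so this is an illustration of the principle, not of the real implementation: with both the collection and the keys to delete sorted, one linear merge pass replaces a search per key.

```java
import java.util.Arrays;

// Hedged sketch: batch removal from a sorted collection via a single merge
// pass, using int arrays in place of fixed-width byte[] rows for brevity.
public final class SortedRemovalSketch {

    // removes all values in 'sortedKeys' from 'sortedValues'; both arrays
    // must be sorted ascending; returns the surviving values
    public static int[] removeAll(final int[] sortedValues, final int[] sortedKeys) {
        final int[] kept = new int[sortedValues.length];
        int n = 0, k = 0;
        for (final int v : sortedValues) {
            while (k < sortedKeys.length && sortedKeys[k] < v) k++; // advance delete cursor
            if (k < sortedKeys.length && sortedKeys[k] == v) continue; // drop match
            kept[n++] = v; // keep survivor
        }
        return Arrays.copyOf(kept, n);
    }
}
```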
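The third sketch illustrates the RowCollection allocation change: the grow amount is rounded down to a whole number of rows, so the chunk cache never ends with a wasted partial row. The method name and the Math.max safeguard are additions for this illustration; the growFactor100 parameter stands in for the growfactorLarge100/growfactorSmall100 constants used in the diff.

```java
// Minimal sketch of the alignment added to RowCollection's grow computation.
public final class AlignedAllocSketch {

    static long alignedAllocation(final long needed, final int objectsize, final long growFactor100) {
        long allocram = needed * growFactor100 / 100L; // e.g. 120% of the needed size
        allocram -= allocram % objectsize;             // cut off the partial row
        assert allocram > 0;
        return Math.max(allocram, needed);             // hedge: never below the request
    }
}
```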

@@ -61,7 +61,7 @@ public class ResultFetcher {
     protected Worker[] workerThreads;
     protected final SortStore<ResultEntry> result;
     protected final SortStore<MediaSnippet> images; // container to sort images by size
-    protected final HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
+    protected final HandleSet failedURLs; // a set of urlhashes that could not be verified during search
     protected final HandleSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
     long urlRetrievalAllTime;
     long snippetComputationAllTime;
@@ -84,7 +84,7 @@ public class ResultFetcher {
         this.snippetComputationAllTime = 0;
         this.result = new SortStore<ResultEntry>(-1, true); // this is the result, enriched with snippets, ranked and ordered by ranking
         this.images = new SortStore<MediaSnippet>(-1, true);
-        this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
+        this.failedURLs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed.
         // snippets do not need to match with the complete query hashes,
         // only with the query minus the stopwords which had not been used for the search
@@ -167,7 +167,7 @@ public class ResultFetcher {
             // get next entry
             page = rankedCache.takeURL(true, taketimeout);
             if (page == null) break;
-            if (failedURLs.get(new String(page.hash())) != null) continue;
+            if (failedURLs.has(page.hash())) continue;
             final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0
@@ -230,7 +230,7 @@ public class ResultFetcher {
                     (snippetMode == 2) ? Integer.MAX_VALUE : 30000,
                     query.isGlobal());
             final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
+            Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
             if (snippet.getErrorCode() < 11) {
                 // we loaded the file and found the snippet
@@ -241,13 +241,7 @@ public class ResultFetcher {
                 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
             } else {
                 // problems with snippet fetch
-                registerFailure(new String(page.hash()), "no text snippet for URL " + metadata.url());
-                if (!peers.mySeed().isVirgin())
-                    try {
-                        TextSnippet.failConsequences(query.getSegment(), page.word(), snippet, query.id(false));
-                    } catch (IOException e) {
-                        Log.logException(e);
-                    }
+                registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
                 return null;
             }
         } else {
@@ -255,7 +249,7 @@ public class ResultFetcher {
             startTime = System.currentTimeMillis();
             final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
             final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
+            Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
             if (mediaSnippets != null && !mediaSnippets.isEmpty()) {
                 // found media snippets, return entry
@@ -264,16 +258,20 @@ public class ResultFetcher {
                 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
             } else {
                 // problems with snippet fetch
-                registerFailure(new String(page.hash()), "no media snippet for URL " + metadata.url());
+                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
                 return null;
             }
         }
         // finished, no more actions possible here
     }
-    private void registerFailure(final String urlhash, final String reason) {
-        this.failedURLs.put(urlhash, reason);
-        Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
+    private void registerFailure(final byte[] urlhash, final String reason) {
+        try {
+            this.failedURLs.put(urlhash);
+        } catch (RowSpaceExceededException e) {
+            Log.logException(e);
+        }
+        Log.logInfo("SEARCH", "sorted out urlhash " + new String(urlhash) + " during search: " + reason);
     }
     public int resultCount() {

@@ -223,6 +223,7 @@ public final class SearchEvent {
         // execute deletion of failed words
         int rw = this.results.failedURLs.size();
         if (rw > 0) {
+            long start = System.currentTimeMillis();
             final HandleSet removeWords = query.queryHashes;
             try {
                 removeWords.putAll(query.excludeHashes);
@@ -233,12 +234,12 @@ public final class SearchEvent {
                 final Iterator<byte[]> j = removeWords.iterator();
                 // remove the same url hashes for multiple words
                 while (j.hasNext()) {
-                    this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs.keySet());
+                    this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs);
                 }
             } catch (IOException e) {
                 Log.logException(e);
             }
-            Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words");
+            Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words in " + (System.currentTimeMillis() - start) + " milliseconds");
         }
     }

@@ -53,7 +53,7 @@ public class SearchEventCache {
         SearchEvent event;
         while (i.hasNext()) {
             event = i.next();
-            if ((all) || (event.getEventTime() + eventLifetime < System.currentTimeMillis())) {
+            if (all || event.getEventTime() + eventLifetime < System.currentTimeMillis()) {
                 event.cleanup();
                 // remove the event

@@ -583,29 +583,4 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         }
     }
-    public static String failConsequences(Segment indexSegment, final WordReferenceVars word, final TextSnippet snippet, final String eventID) throws IOException {
-        // problems with snippet fetch
-        final byte[] urlHash = snippet.getUrl().hash();
-        final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' ');
-        if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
-            (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
-            (snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
-            (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
-            Log.logInfo("TextSnippet", "error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-            indexSegment.urlMetadata().remove(urlHash);
-            final SearchEvent event = SearchEventCache.getEvent(eventID);
-            assert indexSegment != null;
-            assert event != null : "eventID = " + eventID;
-            assert event.getQuery() != null;
-            indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash);
-            event.remove(word);
-        }
-        if (snippet.getErrorCode() == ERROR_NO_MATCH) {
-            Log.logInfo("TextSnippet", "error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-            indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash);
-            SearchEventCache.getEvent(eventID).remove(word);
-        }
-        return snippet.getError();
-    }
 }

@@ -412,6 +412,7 @@ public class ArrayStack implements BLOB {
         File location;
         BLOB blob;
         public blobItem(Date creation, File location, BLOB blob) {
+            assert blob != null;
             this.creation = creation;
             this.location = location;
             this.blob = blob;

@@ -192,6 +192,10 @@ public final class HandleSet implements Iterable<byte[]>, Cloneable {
         index = null;
     }
+    public final String toString() {
+        return this.index.toString();
+    }
     // set tools
     public HandleSet joinConstructive(final HandleSet other) throws RowSpaceExceededException {

@@ -199,6 +199,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
     public synchronized byte[] exportCollection() {
         // returns null if the collection is empty
         trim(false);
+        sort(); // experimental; supervise CPU load
+        assert this.sortBound == this.chunkcount; // in case the collection is sorted
         assert this.size() * this.rowdef.objectsize == this.chunkcache.length : "this.size() = " + this.size() + ", objectsize = " + this.rowdef.objectsize + ", chunkcache.length = " + this.chunkcache.length;
         final Row row = exportRow(chunkcache.length);
         final Row.Entry entry = row.newEntry();
@@ -227,9 +229,11 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         if (chunkcache.length >= needed) return 0;
         assert needed > 0 : "needed = " + needed;
         long allocram = needed * growfactorLarge100 / 100L;
+        allocram -= allocram % rowdef.objectsize;
         assert allocram > 0 : "elements = " + elements + ", new = " + allocram;
         if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, false)) return allocram;
         allocram = needed * growfactorSmall100 / 100L;
+        allocram -= allocram % rowdef.objectsize;
         assert allocram > 0 : "elements = " + elements + ", new = " + allocram;
         if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, forcegc)) return allocram;
         return needed;
@@ -239,7 +243,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         if (elements == 0) return;
         final long allocram = neededSpaceForEnsuredSize(elements, true);
         if (allocram == 0) return;
-        assert allocram > chunkcache.length : "wrong alloc computation: allocram = " + allocram + ", chunkcache.length = " + chunkcache.length;
+        assert chunkcache.length < elements * rowdef.objectsize : "wrong alloc computation (1): elements * rowdef.objectsize = " + (elements * rowdef.objectsize) + ", chunkcache.length = " + chunkcache.length;
+        assert allocram > chunkcache.length : "wrong alloc computation (2): allocram = " + allocram + ", chunkcache.length = " + chunkcache.length;
         if (allocram > Integer.MAX_VALUE || !MemoryControl.request(allocram, true))
             throw new RowSpaceExceededException(allocram, "RowCollection grow");
         try {
@@ -564,7 +569,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
     }
-    protected synchronized final void sort() {
+    public synchronized final void sort() {
         assert (this.rowdef.objectOrder != null);
         if (this.sortBound == this.chunkcount) return; // this is already sorted
         if (this.chunkcount < isortlimit) {
@@ -609,6 +614,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         //assert this.isSorted();
     }
+    /*
     public synchronized final void sort2() {
         assert (this.rowdef.objectOrder != null);
         if (this.sortBound == this.chunkcount) return; // this is already sorted
@@ -643,7 +649,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         this.sortBound = this.chunkcount;
         //assert this.isSorted();
     }
+    */
     private static class qsortthread implements Callable<Object> {
         private RowCollection rc;
         int L, R, S;

@@ -75,7 +75,7 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.Entry> {
         if (size < 0) return new RowSet(rowdef);
         final int orderbound = (int) NaturalOrder.decodeLong(b, 10, 4);
         assert orderbound >= 0 : "orderbound = " + orderbound;
-        if (orderbound < 0) return new RowSet(rowdef);
+        if (orderbound < 0) return new RowSet(rowdef); // error
         final byte[] chunkcache = new byte[size * rowdef.objectsize];
         //assert b.length - exportOverheadSize == size * rowdef.objectsize : "b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize;
         if (b.length - exportOverheadSize != size * rowdef.objectsize) {

@@ -256,12 +256,6 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
         return removed + (reduced / this.array.rowdef().objectsize);
     }
-    public int remove(byte[] termHash, Set<String> urlHashes) throws IOException {
-        int removed = this.ram.remove(termHash, urlHashes);
-        int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashes));
-        return removed + (reduced / this.array.rowdef().objectsize);
-    }
     public boolean remove(byte[] termHash, byte[] urlHashBytes) throws IOException {
         boolean removed = this.ram.remove(termHash, urlHashBytes);
         int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashBytes));
@@ -276,16 +270,6 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
             this.urlHashes = urlHashes;
         }
-        public RemoveRewriter(Set<String> urlHashes) {
-            this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
-            for (String s: urlHashes)
-                try {
-                    this.urlHashes.put(s.getBytes());
-                } catch (RowSpaceExceededException e) {
-                    Log.logException(e);
-                }
-        }
         public RemoveRewriter(byte[] urlHashBytes) {
             this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
             try {
@@ -296,6 +280,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
         }
         public ReferenceContainer<ReferenceType> rewrite(ReferenceContainer<ReferenceType> container) {
+            container.sort();
             container.removeEntries(urlHashes);
             return container;
         }

@@ -188,13 +188,6 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
         return count;
     }
-    public int removeEntries(final Set<String> urlHashes) {
-        int count = 0;
-        final Iterator<String> i = urlHashes.iterator();
-        while (i.hasNext()) count += (remove(i.next().getBytes()) == null) ? 0 : 1;
-        return count;
-    }
     public Iterator<ReferenceType> entries() {
         // returns an iterator of indexRWIEntry objects
         return new entryIterator();

@@ -366,27 +366,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
         }
         return 0;
     }
-    public int remove(final byte[] termHash, final Set<String> urlHashes) {
-        assert this.cache != null;
-        if (urlHashes.isEmpty()) return 0;
-        ByteArray tha = new ByteArray(termHash);
-        int count;
-        synchronized (cache) {
-            final ReferenceContainer<ReferenceType> c = cache.get(tha);
-            if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) {
-                // removal successful
-                if (c.isEmpty()) {
-                    delete(termHash);
-                } else {
-                    cache.put(tha, c);
-                }
-                return count;
-            }
-        }
-        return 0;
-    }
     public void add(final ReferenceContainer<ReferenceType> container) throws RowSpaceExceededException {
         // this puts the entries into the cache
         assert this.cache != null;
