some redesign of the search-fail-response mechanism:

When a search fails for a single url because the snippet cannot be generated, the url reference is deleted from the index. This mechanism was redesigned and enhanced. The process now also writes an entry into the work table searchfl to prepare a re-indexing mechanism. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7364 6c8d7289-2bf4-0310-a012-ef5d649a1542
15 years ago · db99db4be9
parent 4915d1781a
commit db99db4be9
13 changed files with 207 additions and 70 deletions
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@ -291,7 +291,7 @@ public final class search {
            yacyChannel.channels(yacyChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), ""));
            
            // make event
-            theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, null, abstracts.length() > 0, sb.loader);
+            theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, sb.crawlResults, null, abstracts.length() > 0, sb.loader);
            
            // set statistic details of search result and find best result index set
            joincount = theSearch.getRankingResult().getLocalIndexCount() - theSearch.getRankingResult().getMissCount();
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@ -527,7 +527,7 @@ public class yacysearch {
                theQuery.setOffset(0); // in case that this is a new search, always start without a offset 
                offset = 0;
            }
-            final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
+            final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
            try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
            
            if (offset == 0) {
--- a/source/de/anomic/data/WorkTables.java
+++ b/source/de/anomic/data/WorkTables.java
@ -36,8 +36,12 @@ import java.util.Map;

 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.kelondro.blob.Tables;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.data.word.WordReference;
+import net.yacy.kelondro.index.HandleSet;
 import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.rwi.IndexCell;
 import net.yacy.kelondro.util.DateFormatter;
 import de.anomic.server.serverObjects;

@ -64,6 +68,12 @@ public class WorkTables extends Tables {
    
    public final static String TABLE_ACTIVECRAWLS_NAME = "crawljobsActive";
    public final static String TABLE_PASSIVECRAWLS_NAME = "crawljobsPassive";
+
+    public final static String TABLE_SEARCH_FAILURE_NAME = "searchfl";
+    public final static String TABLE_SEARCH_FAILURE_COL_URL = "url";
+    public final static String TABLE_SEARCH_FAILURE_COL_DATE = "date";
+    public final static String TABLE_SEARCH_FAILURE_COL_WORDS = "words";
+    public final static String TABLE_SEARCH_FAILURE_COL_COMMENT = "comment";
    
    public YMarkTables bookmarks;
    
@ -283,4 +293,35 @@ public class WorkTables extends Tables {
        d -= d % 60000; // remove seconds
        row.put(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, new Date(d));
    }
+    
+    /**
+     * Handle a url whose search result could not be confirmed: schedule the
+     * removal of its references from the term index and record the failure
+     * in the search failure table.
+     *
+     * @param indexCell   the term index on which the delayed removals are scheduled
+     * @param url         the url that failed during the search
+     * @param queryHashes the word hashes of the query; one removal is scheduled per hash
+     * @param reason      a description of the failure, stored in the comment column
+     */
+    public void failURLsRegisterMissingWord(IndexCell<WordReference> indexCell, final DigestURI url, HandleSet queryHashes, final String reason) {
+        // schedule the delayed removal of the url reference for every query word
+        for (final byte[] termHash : queryHashes) {
+            indexCell.removeDelayed(termHash, url.hash());
+        }
+
+        // record the failure in the search failure table, using the url hash as key
+        try {
+            final Data entry = new Data();
+            final byte[] timestamp = DateFormatter.formatShortMilliSecond(new Date()).getBytes();
+            entry.put(TABLE_SEARCH_FAILURE_COL_URL, url.toNormalform(true, false));
+            entry.put(TABLE_SEARCH_FAILURE_COL_DATE, timestamp);
+            entry.put(TABLE_SEARCH_FAILURE_COL_WORDS, queryHashes.export());
+            entry.put(TABLE_SEARCH_FAILURE_COL_COMMENT, reason.getBytes());
+            super.insert(TABLE_SEARCH_FAILURE_NAME, url.hash(), entry);
+        } catch (final IOException e) {
+            // a failed table write is only logged; the removals scheduled above stay in effect
+            Log.logException(e);
+        }
+    }
+    
+    /**
+     * Check whether a url hash is registered in the search failure table.
+     *
+     * @param urlhash the url hash to look up
+     * @return the result of the table look-up; false if the look-up fails
+     *         with an IOException (the exception is logged)
+     */
+    public boolean failURLsContains(byte[] urlhash) {
+        try {
+            return super.has(TABLE_SEARCH_FAILURE_NAME, urlhash);
+        } catch (IOException e) {
+            Log.logException(e);
+            return false;
+        }
+    }
 }
--- a/source/de/anomic/search/RankingProcess.java
+++ b/source/de/anomic/search/RankingProcess.java
@ -83,8 +83,8 @@ public final class RankingProcess extends Thread {
    //private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process
    
    private final DynamicScore<String> ref;  // reference score computation for the commonSense heuristic
-    private final DynamicScore<String> hostNavigator;
-    private final Map<String, String> hostResolver;
+    private final DynamicScore<String> hostNavigator; // a counter for the appearance of the host hash
+    private final Map<String, String> hostResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash
    private final DynamicScore<String> authorNavigator;
    private final DynamicScore<String> namespaceNavigator;
    private final ReferenceOrder order;
@ -406,10 +406,10 @@ public final class RankingProcess extends Thread {
                
                // in case that we do not have e catchall filter for urls
                // we must also construct the domain navigator here
-                if (query.sitehash == null) {
-                    this.hostNavigator.inc(new String(urlhash, 6, 6));
-                    this.hostResolver.put(new String(urlhash, 6, 6), new String(urlhash));
-                }
+                //if (query.sitehash == null) {
+                //    this.hostNavigator.inc(new String(urlhash, 6, 6));
+                //    this.hostResolver.put(new String(urlhash, 6, 6), new String(urlhash));
+                //}
            }
            
            // check for more errors
@ -433,7 +433,9 @@ public final class RankingProcess extends Thread {
                (query.constraint.get(Condenser.flag_cat_indexof)) &&
                (!(pagetitle.startsWith("index of")))) {
                final Iterator<byte[]> wi = query.queryHashes.iterator();
-                while (wi.hasNext()) try { this.query.getSegment().termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
+                while (wi.hasNext()) {
+                    this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
+                }
                continue;
            }
            
@ -526,7 +528,7 @@ public final class RankingProcess extends Thread {
    
    public int getRemoteResourceSize() {
        // the number of all hits in all the remote peers
-        return this.remote_resourceSize;
+        return Math.max(this.remote_resourceSize, this.remote_indexCount);
    }
    
    public int getRemotePeerCount() {
@ -560,7 +562,9 @@ public final class RankingProcess extends Thread {
            urlhash = this.hostResolver.get(domhash);
            row = urlhash == null ? null : this.query.getSegment().urlMetadata().load(urlhash.getBytes(), null, 0);
            hostname = row == null ? null : row.metadata().url().getHost();
-            if (hostname != null) result.set(hostname, this.hostNavigator.get(domhash));
+            if (hostname != null) {
+                result.set(hostname, this.hostNavigator.get(domhash));
+            }
        }
        if (result.size() < 2) result.clear(); // navigators with one entry are not useful
        return result;
--- a/source/de/anomic/search/ResultFetcher.java
+++ b/source/de/anomic/search/ResultFetcher.java
@ -45,6 +45,7 @@ import net.yacy.kelondro.util.EventTracker;
 import net.yacy.repository.LoaderDispatcher;

 import de.anomic.crawler.CrawlProfile;
+import de.anomic.data.WorkTables;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.graphics.ProfilingGraph;

@ -54,13 +55,13 @@ public class ResultFetcher {
    final RankingProcess  rankingProcess; // ordered search results, grows dynamically as all the query threads enrich this container
    QueryParams     query;
    private final yacySeedDB      peers;
+    private final WorkTables workTables;
    
    // result values
    protected final LoaderDispatcher        loader;
    protected       Worker[]                workerThreads;
    protected final WeakPriorityBlockingQueue<ResultEntry>  result;
    protected final WeakPriorityBlockingQueue<MediaSnippet> images; // container to sort images by size
-    protected final HandleSet               failedURLs; // a set of urlhashes that could not been verified during search
    protected final HandleSet               snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
    long urlRetrievalAllTime;
    long snippetComputationAllTime;
@ -71,19 +72,20 @@ public class ResultFetcher {
            RankingProcess rankedCache,
            final QueryParams query,
            final yacySeedDB peers,
+            final WorkTables workTables,
            final int taketimeout) {
    	assert query != null;
        this.loader = loader;
    	this.rankingProcess = rankedCache;
    	this.query = query;
        this.peers = peers;
+        this.workTables = workTables;
        this.taketimeout = taketimeout;
        
        this.urlRetrievalAllTime = 0;
        this.snippetComputationAllTime = 0;
        this.result = new WeakPriorityBlockingQueue<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
        this.images = new WeakPriorityBlockingQueue<MediaSnippet>(-1);
-        this.failedURLs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed.
        
        // snippets do not need to match with the complete query hashes,
        // only with the query minus the stopwords which had not been used for the search
@ -331,7 +333,7 @@ public class ResultFetcher {
                        System.out.println("page == null");
                        break; // no more available
                    }
-                    if (failedURLs.has(page.hash())) continue;
+                    if (workTables.failURLsContains(page.hash())) continue;

                    loops++;
                    final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0
@ -408,7 +410,9 @@ public class ResultFetcher {
                return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
            } else {
                // problems with snippet fetch
-                registerFailure(page.hash(), "no text snippet for URL " + metadata.url() + "; errorCode = " + snippet.getErrorCode());
+                String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
+                this.workTables.failURLsRegisterMissingWord(query.getSegment().termIndex(), metadata.url(), query.queryHashes, reason);
+                Log.logInfo("SEARCH", "sorted out url " + metadata.url().toNormalform(true, false) + " during search: " + reason);
                return null;
            }
        } else {
@ -425,19 +429,12 @@ public class ResultFetcher {
                return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
            } else {
                // problems with snippet fetch
-                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
+                String reason = "no media snippet";
+                this.workTables.failURLsRegisterMissingWord(query.getSegment().termIndex(), metadata.url(), query.queryHashes, reason);
+                Log.logInfo("SEARCH", "sorted out url " + metadata.url().toNormalform(true, false) + " during search: " + reason);
                return null;
            }
        }
        // finished, no more actions possible here
    }
-    
-    private void registerFailure(final byte[] urlhash, final String reason) {
-        try {
-            this.failedURLs.put(urlhash);
-        } catch (RowSpaceExceededException e) {
-            Log.logException(e);
-        }
-        Log.logInfo("SEARCH", "sorted out urlhash " + new String(urlhash) + " during search: " + reason);
-    }
 }
--- a/source/de/anomic/search/SearchEvent.java
+++ b/source/de/anomic/search/SearchEvent.java
@ -26,7 +26,6 @@

 package de.anomic.search;

-import java.io.IOException;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.SortedMap;
@ -39,8 +38,6 @@ import java.util.concurrent.TimeUnit;
 import net.yacy.cora.storage.StaticScore;
 import net.yacy.document.LargeNumberCache;
 import net.yacy.kelondro.data.word.WordReference;
-import net.yacy.kelondro.index.HandleSet;
-import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.rwi.ReferenceContainer;
@ -50,6 +47,7 @@ import net.yacy.kelondro.util.SetTools;
 import net.yacy.repository.LoaderDispatcher;

 import de.anomic.crawler.ResultURLs;
+import de.anomic.data.WorkTables;
 import de.anomic.yacy.yacySearch;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.dht.FlatWordPartitionScheme;
@ -68,6 +66,7 @@ public final class SearchEvent {
    private long eventTime;
    private QueryParams query;
    private final yacySeedDB peers;
+    private final WorkTables workTables;
    private RankingProcess rankingProcess; // ordered search results, grows dynamically as all the query threads enrich this container
    private ResultFetcher resultFetcher;
    
@ -86,12 +85,14 @@ public final class SearchEvent {
    
    public SearchEvent(final QueryParams query,
                             final yacySeedDB peers,
+                             final WorkTables workTables,
                             final ResultURLs crawlResults,
                             final SortedMap<byte[], String> preselectedPeerHashes,
                             final boolean generateAbstracts,
                             final LoaderDispatcher loader) {
        this.eventTime = System.currentTimeMillis(); // for lifetime check
        this.peers = peers;
+        this.workTables = workTables;
        this.crawlResults = crawlResults;
        this.query = query;
        this.secondarySearchSuperviser = (query.queryHashes.size() > 1) ? new SecondarySearchSuperviser() : null; // generate abstracts only for combined searches
@ -153,7 +154,7 @@ public final class SearchEvent {
            }
            
            // start worker threads to fetch urls and snippets
-            this.resultFetcher = new ResultFetcher(loader, this.rankingProcess, query, peers, 3000);
+            this.resultFetcher = new ResultFetcher(loader, this.rankingProcess, query, this.peers, this.workTables, 3000);
        } else {
            // do a local search
            this.rankingProcess = new RankingProcess(this.query, this.order, max_results_preparation);
@ -197,7 +198,7 @@ public final class SearchEvent {
            }
            
            // start worker threads to fetch urls and snippets
-            this.resultFetcher = new ResultFetcher(loader, this.rankingProcess, query, peers, 500);
+            this.resultFetcher = new ResultFetcher(loader, this.rankingProcess, query, this.peers, this.workTables, 500);
        }
         
        // clean up events
@ -253,28 +254,6 @@ public final class SearchEvent {
       if (this.IACount != null) this.IACount.clear();
       if (this.IAResults != null) this.IAResults.clear();
       if (this.heuristics != null) this.heuristics.clear();
-       
-       // execute deletion of failed words
-       int rw = this.resultFetcher.failedURLs.size();
-       if (rw > 0) {
-           long start = System.currentTimeMillis();
-           final HandleSet removeWords = query.queryHashes;
-           try {
-               removeWords.putAll(query.excludeHashes);
-           } catch (RowSpaceExceededException e1) {
-               Log.logException(e1);
-           }
-           try {
-               final Iterator<byte[]> j = removeWords.iterator();
-               // remove the same url hashes for multiple words
-               while (j.hasNext()) {
-                   this.query.getSegment().termIndex().remove(j.next(), this.resultFetcher.failedURLs);
-               }                    
-           } catch (IOException e) {
-               Log.logException(e);
-           }
-           Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words in " + (System.currentTimeMillis() - start) + " milliseconds");
-       }
   }
   
   public Iterator<Map.Entry<byte[], String>> abstractsString() {
--- a/source/de/anomic/search/SearchEventCache.java
+++ b/source/de/anomic/search/SearchEventCache.java
@ -38,6 +38,7 @@ import net.yacy.kelondro.util.MemoryControl;
 import net.yacy.repository.LoaderDispatcher;

 import de.anomic.crawler.ResultURLs;
+import de.anomic.data.WorkTables;
 import de.anomic.yacy.yacySeedDB;

 public class SearchEventCache {
@ -101,6 +102,7 @@ public class SearchEventCache {
    public static SearchEvent getEvent(
            final QueryParams query,
            final yacySeedDB peers,
+            final WorkTables workTables,
            final ResultURLs crawlResults,
            final SortedMap<byte[], String> preselectedPeerHashes,
            final boolean generateAbstracts,
@ -126,7 +128,7 @@ public class SearchEventCache {
        }
        if (event == null) {
            // start a new event
-            event = new SearchEvent(query, peers, crawlResults, preselectedPeerHashes, generateAbstracts, loader);
+            event = new SearchEvent(query, peers, workTables, crawlResults, preselectedPeerHashes, generateAbstracts, loader);
        }
    
        return event;
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@ -230,7 +230,7 @@ public final class Switchboard extends serverSwitch {
    public  LinkedBlockingQueue<String>    trail;
    public  yacySeedDB                     peers;
    public  WorkTables                     tables;
-    public  SortedMap<byte[], DigestURI>     intranetURLs = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
+    public  SortedMap<byte[], DigestURI>   intranetURLs = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
    
    public WorkflowProcessor<indexingQueueEntry> indexingDocumentProcessor;
    public WorkflowProcessor<indexingQueueEntry> indexingCondensementProcessor;
--- a/source/net/yacy/kelondro/index/HandleSet.java
+++ b/source/net/yacy/kelondro/index/HandleSet.java
@ -64,10 +64,19 @@ public final class HandleSet implements Iterable<byte[]>, Cloneable {
        this.index = index;
    }

+    /**
+     * Create a HandleSet from a serialized row set.
+     *
+     * @param rowdef the row definition that is used to import the data
+     * @param b      the serialized content, handed to RowSet.importRowSet;
+     *               presumably the byte form produced by export() - TODO confirm
+     */
+    public HandleSet(Row rowdef, byte[] b) {
+        this.rowdef = rowdef;
+        this.index = RowSet.importRowSet(b, this.rowdef);
+    }
+
    public HandleSet clone() {
        return new HandleSet(this.rowdef, this.index.clone());
    }
    
+    /**
+     * Serialize the content of this set.
+     *
+     * @return the byte array produced by exportCollection() of the underlying index
+     */
+    public byte[] export() {
+        return index.exportCollection();
+    }
+    
    /**
     * initialize a HandleSet with the content of a dump
     * @param keylength
--- a/source/net/yacy/kelondro/rwi/AbstractIndex.java
+++ b/source/net/yacy/kelondro/rwi/AbstractIndex.java
@ -46,6 +46,15 @@ public abstract class AbstractIndex <ReferenceType extends Reference> implements
        this.factory = factory;
    }
    
+    /**
+     * Schedule the delayed removal of one url reference from several terms.
+     * This is mainly used when correcting an index after a search.
+     *
+     * @param termHashes   the term hashes from which the reference shall be removed
+     * @param urlHashBytes the url hash of the reference
+     * @throws IOException if one of the single-term removals fails
+     */
+    public void removeDelayed(final HandleSet termHashes, final byte[] urlHashBytes) throws IOException {
+        // delegate to the single-term variant, once per term hash
+        for (final byte[] termHash : termHashes) {
+            removeDelayed(termHash, urlHashBytes);
+        }
+    }
+    
    public int remove(final HandleSet termHashes, final byte[] urlHashBytes) throws IOException {
        // remove the same url hashes for multiple words
        // this is mainly used when correcting a index after a search
--- a/source/net/yacy/kelondro/rwi/Index.java
+++ b/source/net/yacy/kelondro/rwi/Index.java
@ -106,6 +106,7 @@ public interface Index <ReferenceType extends Reference> {
 	 * @throws IOException
 	 */
    public boolean remove(byte[] termHash, byte[] referenceHash) throws IOException;
+    public void removeDelayed(byte[] termHash, byte[] referenceHash) throws IOException;
    
    /**
     * remove a set of reference entries for a given word
@ -115,8 +116,11 @@ public interface Index <ReferenceType extends Reference> {
     * @throws IOException
     */
    public int remove(final byte[] termHash, HandleSet referenceHashes) throws IOException;
+    public void removeDelayed(final byte[] termHash, HandleSet referenceHashes) throws IOException;
    public int remove(final HandleSet termHashes, final byte[] urlHashBytes) throws IOException;
-    
+    public void removeDelayed(final HandleSet termHashes, final byte[] urlHashBytes) throws IOException;
+
+    public void removeDelayed() throws IOException;
    /**
     * iterate all references from the beginning of a specific word hash
     * @param startHash
--- a/source/net/yacy/kelondro/rwi/IndexCell.java
+++ b/source/net/yacy/kelondro/rwi/IndexCell.java
@ -28,6 +28,8 @@ package net.yacy.kelondro.rwi;

 import java.io.File;
 import java.io.IOException;
+import java.util.Map;
+import java.util.TreeMap;
 import java.util.concurrent.Semaphore;

 import net.yacy.cora.storage.ComparableARC;
@ -71,6 +73,8 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
    private final int                                    writeBufferSize;
    private       Semaphore                              dumperSemaphore = new Semaphore(1);
    private       Semaphore                              cleanerSemaphore = new Semaphore(1);
+    private final Map<byte[], HandleSet>                 failedURLs; // mapping from word hashes to a list of url hashes
+    
    
    public IndexCell(
            final File cellPath,
@ -96,7 +100,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
        this.targetFileSize = targetFileSize;
        this.maxFileSize = maxFileSize;
        this.writeBufferSize = writeBufferSize;
-        //cleanCache();
+        this.failedURLs = new TreeMap<byte[], HandleSet>(URIMetadataRow.rowdef.objectOrder);
    }

    
@ -169,6 +173,13 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
        ReferenceContainer<ReferenceType> countRam = this.ram.get(termHash, null);
        assert countRam == null || countRam.size() >= 0;
        int c = countRam == null ? countFile : countFile + countRam.size();
+        // exclude entries from delayed remove
+        synchronized (this.failedURLs) {
+            HandleSet s = this.failedURLs.get(termHash);
+            if (s != null) c -= s.size();
+            if (c < 0) c = 0;
+        }
+        // put count result into cache
        this.countCache.put(termHash, c);
        return c;
    }
@ -188,22 +199,31 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
        } catch (RowSpaceExceededException e2) {
            Log.logException(e2);
        }
-        if (c1 == null) {
-            if (c0 == null) return null;
-            return c0;
-        }
-        if (c0 == null) return c1;
-        try {
-            return c1.merge(c0);
-        } catch (RowSpaceExceededException e) {
-            // try to free some ram
+        ReferenceContainer<ReferenceType> result = null;
+        if (c0 != null && c1 != null) {
            try {
-                return c1.merge(c0);
-            } catch (RowSpaceExceededException e1) {
-                // go silently over the problem
-                return (c1.size() > c0.size()) ? c1: c0;
+                result = c1.merge(c0);
+            } catch (RowSpaceExceededException e) {
+                // try to free some ram
+                try {
+                    result = c1.merge(c0);
+                } catch (RowSpaceExceededException e1) {
+                    // go silently over the problem
+                    result = (c1.size() > c0.size()) ? c1: c0;
+                }
            }
+        } else if (c0 != null) {
+            result = c0;
+        } else if (c1 != null) {
+            result = c1;
+        }
+        if (result == null) return null;
+        // remove the failed urls
+        synchronized (this.failedURLs) {
+            HandleSet s = this.failedURLs.get(termHash);
+            if (s != null) result.removeEntries(s);
        }
+        return result;
    }

    /**
@ -212,6 +232,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
     * @throws IOException 
     */
    public ReferenceContainer<ReferenceType> delete(byte[] termHash) throws IOException {
+        removeDelayed();
        ReferenceContainer<ReferenceType> c1 = null;
        try {
            c1 = this.array.get(termHash);
@ -238,6 +259,60 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
        }
    }
    
+    /**
+     * Schedule the removal of a set of url references from a term.
+     * The url hashes are only added to the failedURLs buffer; the removal
+     * itself is executed later by removeDelayed().
+     * If the buffer cannot take the hashes (RowSpaceExceededException), the
+     * references are removed immediately instead.
+     *
+     * @param termHash  the term from which the references shall be removed
+     * @param urlHashes the url hashes of the references
+     */
+    public void removeDelayed(byte[] termHash, HandleSet urlHashes) {
+        boolean bufferFailed = false;
+        // look-up, modification and registration must happen under one lock:
+        // otherwise two threads that both find no set for the term would each
+        // create one, and the later put would discard the other's entries
+        synchronized (failedURLs) {
+            HandleSet r = this.failedURLs.get(termHash);
+            final boolean isNew = r == null;
+            if (isNew) {
+                r = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+            }
+            try {
+                r.putAll(urlHashes);
+                if (isNew) this.failedURLs.put(termHash, r);
+            } catch (RowSpaceExceededException e) {
+                bufferFailed = true;
+            }
+        }
+        if (bufferFailed) {
+            // not enough space to buffer the request: remove right now,
+            // outside of the lock because remove() works on the index itself
+            try {
+                remove(termHash, urlHashes);
+            } catch (IOException e1) {
+                Log.logException(e1);
+            }
+        }
+    }
+    
+    /**
+     * Schedule the removal of a single url reference from a term.
+     * The url hash is only added to the failedURLs buffer; the removal
+     * itself is executed later by removeDelayed().
+     * If the buffer cannot take the hash (RowSpaceExceededException), the
+     * reference is removed immediately instead.
+     *
+     * @param termHash     the term from which the reference shall be removed
+     * @param urlHashBytes the url hash of the reference
+     */
+    public void removeDelayed(byte[] termHash, byte[] urlHashBytes) {
+        boolean bufferFailed = false;
+        // look-up, modification and registration must happen under one lock:
+        // otherwise two threads that both find no set for the term would each
+        // create one, and the later put would discard the other's entry
+        synchronized (failedURLs) {
+            HandleSet r = this.failedURLs.get(termHash);
+            final boolean isNew = r == null;
+            if (isNew) {
+                r = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+            }
+            try {
+                r.put(urlHashBytes);
+                if (isNew) this.failedURLs.put(termHash, r);
+            } catch (RowSpaceExceededException e) {
+                bufferFailed = true;
+            }
+        }
+        if (bufferFailed) {
+            // not enough space to buffer the request: remove right now,
+            // outside of the lock because remove() works on the index itself
+            try {
+                remove(termHash, urlHashBytes);
+            } catch (IOException e1) {
+                Log.logException(e1);
+            }
+        }
+    }
+    
+    /**
+     * Execute all buffered (delayed) removals.
+     * Every term that has entries in the failedURLs buffer is taken out of
+     * the buffer and its url references are removed with remove(). The count
+     * cache is cleared afterwards.
+     *
+     * @throws IOException if one of the remove operations fails
+     */
+    public void removeDelayed() throws IOException {
+        // snapshot of the term hashes that have pending removals, so that the
+        // buffer is not locked while the removals are executed
+        HandleSet words = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+        synchronized (failedURLs) {
+            try {
+                for (byte[] b: failedURLs.keySet()) words.put(b);
+            } catch (RowSpaceExceededException e) {
+                // no space left for the snapshot: process what was collected so far,
+                // the remaining terms stay buffered until the next call
+                Log.logException(e);
+            }
+        }
+        
+        for (byte[] b: words) {
+            HandleSet urls;
+            synchronized (failedURLs) {
+                urls = failedURLs.remove(b);
+            }
+            // a concurrent call may have taken this term already;
+            // remove() dereferences the set and must not get null
+            if (urls == null) continue;
+            remove(b, urls);
+        }
+        this.countCache.clear();
+    }
+    
    /**
     * remove url references from a selected word hash. this deletes also in the BLOB
     * files, which means that there exists new gap entries after the deletion
@ -246,6 +321,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
     * @throws IOException 
     */
    public int remove(byte[] termHash, HandleSet urlHashes) throws IOException {
+        this.countCache.remove(termHash);
        int removed = this.ram.remove(termHash, urlHashes);
        int reduced;
        //final long am = this.array.mem();
@ -260,6 +336,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
    }

    public boolean remove(byte[] termHash, byte[] urlHashBytes) throws IOException {
+        this.countCache.remove(termHash);
        boolean removed = this.ram.remove(termHash, urlHashBytes);
        int reduced;
        //final long am = this.array.mem();
@ -333,6 +410,8 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
     * @throws IOException 
     */
    public synchronized void clear() throws IOException {
+        this.countCache.clear();
+        this.failedURLs.clear();
        this.ram.clear();
        this.array.clear();
    }
@ -343,6 +422,8 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
     * and is composed of the current date and the cell salt
     */
    public synchronized void close() {
+        this.countCache.clear();
+        try {removeDelayed();} catch (IOException e) {}
        if (!this.ram.isEmpty()) this.ram.dump(this.array.newContainerBLOBFile(), (int) Math.min(MemoryControl.available() / 3, writeBufferSize), true);
        // close all
        this.ram.close();
@ -395,6 +476,8 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
                    (this.ram.size() > 3000 && !MemoryControl.request(80L * 1024L * 1024L, false)) ||
                    (this.ram.size() > 0 && this.lastDump + dumpCycle < t)) try {
                    this.lastDump = System.currentTimeMillis();
+                    // removed delayed
+                    try {removeDelayed();} catch (IOException e) {}
                    // dump the ram
                    File dumpFile = this.array.newContainerBLOBFile();
                    // a critical point: when the ram is handed to the dump job,
--- a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java
+++ b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java
@ -338,6 +338,9 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
        return cache.remove(new ByteArray(termHash));
    }

+    /**
+     * Remove a single url reference from a term.
+     * There is no delay in this class: the call is passed on to remove() directly
+     * and the result of remove() is discarded.
+     */
+    public void removeDelayed(final byte[] termHash, final byte[] urlHashBytes) {
+        remove(termHash, urlHashBytes);
+    }
    public boolean remove(final byte[] termHash, final byte[] urlHashBytes) {
        assert this.cache != null;
        ByteArray tha = new ByteArray(termHash);
@ -355,7 +358,11 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
        }
        return false;
    }
- 
+
+    /**
+     * Remove a set of url references from a term.
+     * There is no delay in this class: the call is passed on to remove() directly
+     * and the result of remove() is discarded.
+     */
+    public void removeDelayed(final byte[] termHash, final HandleSet urlHashes) {
+        remove(termHash, urlHashes);
+    }
+
    public int remove(final byte[] termHash, final HandleSet urlHashes) {
        assert this.cache != null;
        if (urlHashes.isEmpty()) return 0;
@ -376,6 +383,8 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
        return 0;
    }

+    /**
+     * Does nothing: the removeDelayed variants of this class execute the
+     * removal immediately, so there is nothing buffered to process here.
+     */
+    public void removeDelayed() {}
+
    public void add(final ReferenceContainer<ReferenceType> container) throws RowSpaceExceededException {
        // this puts the entries into the cache
    	assert this.cache != null;