refactoring of parsing-condensing-indexing process:

- separated parts - removed storagePeer function next step will be parallelization of processes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4600 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · 8d6a13bc07
parent d3b06913ec
commit 8d6a13bc07
4 changed files with 244 additions and 347 deletions
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@ -767,9 +767,6 @@ CRDist1Method  = 9
 CRDist1Percent = 30
 CRDist1Target  = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080
 # Hash of the peer, you would like to store to the data your installation collected.
 storagePeerHash =
 # Search sequence settings
 # collection:
 # time = time to get a RWI out of RAM cache, assortments and WORDS files
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -117,14 +117,11 @@ import de.anomic.data.messageBoard;
 import de.anomic.data.userDB;
 import de.anomic.data.wikiBoard;
 import de.anomic.data.wiki.wikiParser;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.http.httpHeader;
 import de.anomic.http.httpRemoteProxyConfig;
 import de.anomic.http.httpc;
 import de.anomic.http.httpd;
 import de.anomic.http.httpdRobotsTxtConfig;
 import de.anomic.index.indexContainer;
 import de.anomic.index.indexRWIRowEntry;
 import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroCache;
@ -137,7 +134,6 @@ import de.anomic.plasma.crawler.plasmaCrawlQueues;
 import de.anomic.plasma.crawler.plasmaProtocolLoader;
 import de.anomic.plasma.dbImport.dbImportManager;
 import de.anomic.plasma.parser.ParserException;
 import de.anomic.plasma.plasmaCondenser.wordStatProp;
 import de.anomic.plasma.urlPattern.defaultURLPattern;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverAbstractSwitch;
@ -175,9 +171,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
    // 5) local prefetch/crawling (initiator is own seedHash)
    // 6) local fetching for global crawling (other known or unknown initiator)
    public static final int PROCESSCASE_0_UNKNOWN = 0;
    public static final int PROCESSCASE_1_GLOBAL_CRAWLING = 1;
    public static final int PROCESSCASE_2_SEARCH_QUERY_RESULT = 2;
    public static final int PROCESSCASE_3_INDEX_TRANSFER_RESULT = 3;
    public static final int PROCESSCASE_4_PROXY_LOAD = 4;
    public static final int PROCESSCASE_5_LOCAL_CRAWLING = 5;
    public static final int PROCESSCASE_6_GLOBAL_CRAWLING = 6;
@ -653,14 +646,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
    public static final String CRAWLER_THREADS_ACTIVE_MAX       = "crawler.MaxActiveThreads";
    public static final String OWN_SEED_FILE                    = "yacyOwnSeedFile";
    /**
     * <p><code>public static final String <strong>STORAGE_PEER_HASH</strong> = "storagePeerHash"</code></p>
     * <p>Name of the setting holding the Peer-Hash where indexes shall be transferred after indexing a webpage. If this setting
     * is empty, the Storage Peer function is disabled</p>
     */
    public static final String STORAGE_PEER_HASH                = "storagePeerHash";
    public static final String YACY_MODE_DEBUG                  = "yacyDebugMode";
    public static final String WORDCACHE_INIT_COUNT             = "wordCacheInitCount";
    /**
     * <p><code>public static final String <strong>WORDCACHE_MAX_COUNT</strong> = "wordCacheMaxCount"</code></p>
@ -1825,15 +1811,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            }
            // generate a dht chunk
-            if (
+            if ((dhtShallTransfer() == null) && (
                    (dhtShallTransfer() == null) &&
                    (
                    (this.dhtTransferChunk == null) ||
                    (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_UNDEFINED) ||
                    // (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE) ||
                    (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_FAILED)
-                    )
+               )) {
            ) {
                // generate new chunk
                int minChunkSize = (int) getConfigLong(INDEX_DIST_CHUNK_SIZE_MIN, 30);
                dhtTransferChunk = new plasmaDHTChunk(this.log, wordIndex, minChunkSize, dhtTransferIndexCount, 5000);
@ -1867,7 +1850,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            }
            // parse and index the resource
-            processResourceStack(nextentry);
+            plasmaParserDocument document = parseDocument(nextentry);
            if (document != null) {
                plasmaCondenser condensement = condenseDocument(nextentry, document);
                if (condensement != null) {
                    indexDocument(nextentry, document, condensement);
                }
            }
            return true;
        } catch (InterruptedException e) {
            log.logInfo("DEQUEUE: Shutdown detected.");
@ -2078,7 +2067,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
        }
    }
-    private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException {
+    private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry) throws InterruptedException, ParserException {
        // the mimetype of this entry
        String mimeType = entry.getMimeType();
@ -2097,71 +2086,43 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
        return doc;
    }
-    private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
+    private plasmaParserDocument parseDocument(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
        plasmaParserDocument document = null;
        try {
            // work off one stack entry with a fresh resource
            long stackStartTime = 0, stackEndTime = 0,
-            parsingStartTime = 0, parsingEndTime = 0,
+            parsingStartTime = 0, parsingEndTime = 0;
-            indexingStartTime = 0, indexingEndTime = 0,
+            int processCase = entry.processCase();
            storageStartTime = 0, storageEndTime = 0;
            // we must distinguish the following cases: resource-load was initiated by
            // 1) global crawling: the index is extern, not here (not possible here)
            // 2) result of search queries, some indexes are here (not possible here)
            // 3) result of index transfer, some of them are here (not possible here)
            // 4) proxy-load (initiator is "------------")
            // 5) local prefetch/crawling (initiator is own seedHash)
            // 6) local fetching for global crawling (other known or unknwon initiator)
            int processCase = PROCESSCASE_0_UNKNOWN;
            yacySeed initiatorPeer = null;
            String initiatorPeerHash = (entry.proxy()) ? yacyURL.dummyHash : entry.initiator();
            if (initiatorPeerHash.equals(yacyURL.dummyHash)) {
                // proxy-load
                processCase = PROCESSCASE_4_PROXY_LOAD;
            } else if (initiatorPeerHash.equals(yacyCore.seedDB.mySeed().hash)) {
                // normal crawling
                processCase = PROCESSCASE_5_LOCAL_CRAWLING;
            } else {
                // this was done for remote peer (a global crawl)
                initiatorPeer = yacyCore.seedDB.getConnected(initiatorPeerHash);
                processCase = PROCESSCASE_6_GLOBAL_CRAWLING;
            }
            log.logFine("processResourceStack processCase=" + processCase +
                    ", depth=" + entry.depth() +
                    ", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().generalDepth())) +
                    ", filter=" + ((entry.profile() == null) ? "null" : entry.profile().generalFilter()) +
-                    ", initiatorHash=" + initiatorPeerHash +
+                    ", initiatorHash=" + entry.initiator() +
                    //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
                    ", url=" + entry.url()); // DEBUG
-            /* =========================================================================
+            // PARSE CONTENT
             * PARSE CONTENT
             * ========================================================================= */
            parsingStartTime = System.currentTimeMillis();
            try {
-                document = this.parseResource(entry, initiatorPeerHash);
+                document = this.parseResource(entry);
-                if (document == null) return;
+                if (document == null) return null;
            } catch (ParserException e) {
                this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
-                addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new kelondroBitfield());
+                addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.anchorName(), e.getErrorCode(), new kelondroBitfield());
                if (document != null) {
                    document.close();
                    document = null;
                }
-                return;
+                return null;
            }
            parsingEndTime = System.currentTimeMillis();            
-            // getting the document date
+            // get the document date
            Date docDate = entry.getModificationDate();
-            /* =========================================================================
+            // put anchors on crawl stack
             * put anchors on crawl stack
             * ========================================================================= */            
            stackStartTime = System.currentTimeMillis();
            if (
                    ((processCase == PROCESSCASE_4_PROXY_LOAD) || (processCase == PROCESSCASE_5_LOCAL_CRAWLING)) &&
@ -2179,18 +2140,47 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
                    nextEntry = i.next();
                    nextUrl = nextEntry.getKey();
                    // enqueue the hyperlink into the pre-notice-url db
-                    crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
+                    crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), entry.initiator(), nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
                }
-                log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
+                if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
-                        ", NEW CRAWL STACK SIZE IS " + crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
+                        ", NEW CRAWL STACK SIZE IS " + crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
                        ", STACKING TIME = " + (stackEndTime-stackStartTime) +
                        ", PARSING TIME = " + (parsingEndTime-parsingStartTime));
            }
            stackEndTime = System.currentTimeMillis();
        } catch (Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException)e;
            this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
        } catch (Error e) {
            this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
        } finally {
            checkInterruption();
-            /* =========================================================================
+            // The following code must be into the finally block, otherwise it will not be executed
-             * CREATE INDEX
+            // on errors!
-             * ========================================================================= */  
+
            // removing current entry from in process list
            synchronized (this.indexingTasksInProcess) {
                this.indexingTasksInProcess.remove(entry.urlHash());
            }
            // explicit delete/free resources
            if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
                plasmaHTCache.filesInUse.remove(entry.cacheFile());
                //plasmaHTCache.deleteURLfromCache(entry.url());
            }
            entry = null;
            if (document != null) try { document.close(); } catch (Exception e) {}
        }
        return document;
    }
    private plasmaCondenser condenseDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document) throws InterruptedException {
        // CREATE INDEX
        String dc_title = document.dc_title();
        yacyURL referrerURL = entry.referrerURL();
        int processCase = entry.processCase();
        String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
        if (processCase == PROCESSCASE_4_PROXY_LOAD) {
@ -2201,18 +2191,48 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            noIndexReason = entry.shallIndexCacheForCrawler();
        }
-            if (noIndexReason == null) {
+        if (noIndexReason != null) {
-                // strip out words
+            // check for interruption
-                indexingStartTime = System.currentTimeMillis();
+            checkInterruption();
            log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
            addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), entry.initiator(), dc_title, noIndexReason, new kelondroBitfield());
            /*
            if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
                if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
                yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
            }
            */
            document.close();
            document = null;
            return null;
        }
        // strip out words
        checkInterruption();
        log.logFine("Condensing for '" + entry.url().toNormalform(false, true) + "'");
-                plasmaCondenser condenser = new plasmaCondenser(document, entry.profile().indexText(), entry.profile().indexMedia());
+        plasmaCondenser condenser;
        try {
            condenser = new plasmaCondenser(document, entry.profile().indexText(), entry.profile().indexMedia());
        } catch (UnsupportedEncodingException e) {
            return null;
        }
        return condenser;
    }
    private void indexDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws InterruptedException {
        long indexingStartTime = 0, indexingEndTime = 0,
        storageStartTime = 0, storageEndTime = 0;
        // CREATE INDEX
        String dc_title = document.dc_title();
        yacyURL referrerURL = entry.referrerURL();
        Date docDate = entry.getModificationDate();
        int processCase = entry.processCase();
        // generate citation reference
        Integer[] ioLinks = webStructure.generateCitationReference(entry.url(), entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
                try {        
        // check for interruption
        checkInterruption();
@ -2241,13 +2261,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
                document.getVideolinks().size(),           // lvideo
                document.getApplinks().size()              // lapp
        );
-                    /* ========================================================================
+        
-                     * STORE URL TO LOADED-URL-DB
+        // STORE URL TO LOADED-URL-DB
-                     * ======================================================================== */
+        try {
            wordIndex.loadedURL.store(newEntry);
        } catch (IOException e) {
            log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
            addURLtoErrorDB(entry.url(), referrerURL.hash(), entry.initiator(), dc_title, "error storing url: " + e.getMessage(), new kelondroBitfield());
            return;
        }
        wordIndex.loadedURL.stack(
                newEntry,                      // loaded url db entry
-                            initiatorPeerHash,              // initiator peer hash
+                entry.initiator(),             // initiator peer hash
                yacyCore.seedDB.mySeed().hash, // executor peer hash
                processCase                    // process case
        );                    
@ -2255,18 +2281,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
        // check for interruption
        checkInterruption();
-                    /* ========================================================================
+        // STORE WORD INDEX
-                     * STORE WORD INDEX
+        if ((!entry.profile().indexText()) && (!entry.profile().indexMedia())) {
-                     * ======================================================================== */
+            log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
-                    if (
+            addURLtoErrorDB(entry.url(), referrerURL.hash(), entry.initiator(), dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
-                            (
+            return;
-                                    (processCase == PROCESSCASE_4_PROXY_LOAD) || 
+        }
                                    (processCase == PROCESSCASE_5_LOCAL_CRAWLING) || 
                                    (processCase == PROCESSCASE_6_GLOBAL_CRAWLING)
                            ) && 
                            ((entry.profile().indexText()) || (entry.profile().indexMedia()))
                    ) {
                        String urlHash = newEntry.hash();
        // remove stopwords                        
        log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
@ -2274,18 +2294,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
        storageStartTime = System.currentTimeMillis();
        int words = 0;
                        String storagePeerHash;
                        yacySeed seed;
-                        if (
+        // STORE PAGE INDEX INTO WORD INDEX DB
                                ((storagePeerHash = getConfig(STORAGE_PEER_HASH, null)) == null) ||
                                (storagePeerHash.trim().length() == 0) ||
                                ((seed = yacyCore.seedDB.getConnected(storagePeerHash)) == null)
                        ){
                            /* ========================================================================
                             * STORE PAGE INDEX INTO WORD INDEX DB
                             * ======================================================================== */
        words = wordIndex.addPageIndex(
                entry.url(),                                  // document url
                docDate,                                      // document mod date
@ -2297,86 +2307,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
                ioLinks[0].intValue(),                        // outlinkSame
                ioLinks[1].intValue()                         // outlinkOthers
        );
                        } else {
                            /* ========================================================================
                             * SEND PAGE INDEX TO STORAGE PEER
                             * ======================================================================== */                            
                            HashMap<String, indexURLEntry> urlCache = new HashMap<String, indexURLEntry>(1);
                            urlCache.put(newEntry.hash(), newEntry);
                            ArrayList<indexContainer> tmpContainers = new ArrayList<indexContainer>(condenser.words().size());
                            String language = yacyURL.language(entry.url());                            
                            char doctype = plasmaHTCache.docType(document.dc_format());
                            indexURLEntry.Components comp = newEntry.comp();
                            int urlLength = comp.url().toNormalform(true, true).length();
                            int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform(true, true)).length;
                            // iterate over all words
                            Iterator<Map.Entry<String, wordStatProp>> i = condenser.words().entrySet().iterator();
                            Map.Entry<String, wordStatProp> wentry;
                            plasmaCondenser.wordStatProp wordStat;
                            while (i.hasNext()) {
                                wentry = i.next();
                                String word = wentry.getKey();
                                wordStat = wentry.getValue();
                                String wordHash = plasmaCondenser.word2hash(word);
                                indexRWIRowEntry wordIdxEntry = new indexRWIRowEntry(
                                            urlHash,
                                            urlLength, urlComps,
                                            wordStat.count,
                                            document.dc_title().length(),
                                            condenser.words().size(),
                                            condenser.sentences().size(),
                                            wordStat.posInText,
                                            wordStat.posInPhrase,
                                            wordStat.numOfPhrase,
                                            docDate.getTime(),
                                            System.currentTimeMillis(),
                                            language,
                                            doctype,
                                            ioLinks[0].intValue(),
                                            ioLinks[1].intValue(),
                                            condenser.RESULT_FLAGS
                                        );
                                indexContainer wordIdxContainer = plasmaWordIndex.emptyContainer(wordHash, 1);
                                wordIdxContainer.add(wordIdxEntry);
                                tmpContainers.add(wordIdxContainer);
                            }
                            //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
                            words = condenser.words().size();
                            // transfering the index to the storage peer
                            indexContainer[] indexData = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]);
                            HashMap<String, Object> resultObj = yacyClient.transferIndex(
                                    seed,       // target seed
                                    indexData,  // word index data
                                    urlCache,   // urls
                                    true,       // gzip body
                                    120000      // transfer timeout 
                            );
                            // check for interruption
                            checkInterruption();
                            // if the transfer failed we try to store the index locally
                            String error = (String) resultObj.get("result");
                            if (error != null) {
                                words = wordIndex.addPageIndex(
                                        entry.url(),
                                        docDate, 
                                        (int) entry.size(),
                                        document, 
                                        condenser,
                                        yacyURL.language(entry.url()),
                                        plasmaHTCache.docType(document.dc_format()),
                                        ioLinks[0].intValue(), 
                                        ioLinks[1].intValue()
                                );
                            }
                            tmpContainers = null;
                        } //end: SEND PAGE INDEX TO STORAGE PEER
        storageEndTime = System.currentTimeMillis();
@ -2392,9 +2322,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
                    "\n\tMimeType: "  + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
                    "Size: " + document.getTextLength() + " bytes | " +
                    "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
-                                    "\n\tStackingTime:  " + (stackEndTime-stackStartTime) + " ms | " +
+                    "\n\tIndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
                                    "ParsingTime:  " + (parsingEndTime-parsingStartTime) + " ms | " +
                                    "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
                    "StorageTime: " + (storageEndTime-storageStartTime) + " ms");
        }
@ -2403,6 +2331,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
        // check for interruption
        checkInterruption();
        yacySeed initiatorPeer = entry.initiatorPeer();
        // if this was performed for a remote crawl request, notify requester
        if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
@ -2410,70 +2339,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
            yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
        }
                    } else {
                        log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
                        addURLtoErrorDB(entry.url(), referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
                    }
                } catch (Exception ee) {
                    if (ee instanceof InterruptedException) throw (InterruptedException)ee;
                    // check for interruption
                    checkInterruption();
                    log.logSevere("Could not index URL " + entry.url() + ": " + ee.getMessage(), ee);
                    if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
                        if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
                        yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
                    }
                    addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new kelondroBitfield());
                }
            } else {
                // check for interruption
                checkInterruption();
                log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
                addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, noIndexReason, new kelondroBitfield());
                if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
                    if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
                    yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
                }
            }
            document.close();
            document = null;
        } catch (Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException)e;
            this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
        } catch (Error e) {
            this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
        } finally {
            checkInterruption();
            // The following code must be into the finally block, otherwise it will not be executed
            // on errors!
            // removing current entry from in process list
            synchronized (this.indexingTasksInProcess) {
                this.indexingTasksInProcess.remove(entry.urlHash());
            }
            // removing current entry from notice URL queue
            /*
            boolean removed = noticeURL.remove(entry.urlHash()); // worked-off
            if (!removed) {
                log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
            }
            */
            // explicit delete/free resources
            if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
                plasmaHTCache.filesInUse.remove(entry.cacheFile());
                //plasmaHTCache.deleteURLfromCache(entry.url());
            }
            entry = null;
            if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ }
        }
    }
    private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@ -57,6 +57,8 @@ import de.anomic.kelondro.kelondroRow;
 import de.anomic.kelondro.kelondroStack;
 import de.anomic.plasma.cache.IResourceInfo;
 import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacyCore;
 import de.anomic.yacy.yacySeed;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.yacyURL;
@ -287,6 +289,17 @@ public class plasmaSwitchboardQueue {
            return initiator;
        }
        public yacySeed initiatorPeer() {
            if ((initiator == null) || (initiator.equals(yacyURL.dummyHash))) return null;
            if (initiator.equals(yacyCore.seedDB.mySeed().hash)) {
                // normal crawling
                return null;
            } else {
                // this was done for remote peer (a global crawl)
                return yacyCore.seedDB.getConnected(initiator);
            }
        }
        public int depth() {
            return depth;
        }
@ -342,6 +355,28 @@ public class plasmaSwitchboardQueue {
            return anchorName;
        }
        public int processCase() {
            // we must distinguish the following cases: resource-load was initiated by
            // 1) global crawling: the index is extern, not here (not possible here)
            // 2) result of search queries, some indexes are here (not possible here)
            // 3) result of index transfer, some of them are here (not possible here)
            // 4) proxy-load (initiator is "------------")
            // 5) local prefetch/crawling (initiator is own seedHash)
            // 6) local fetching for global crawling (other known or unknwon initiator)
            int processCase = plasmaSwitchboard.PROCESSCASE_0_UNKNOWN;
            if ((initiator == null) || (initiator.equals(yacyURL.dummyHash))) {
                // proxy-load
                processCase = plasmaSwitchboard.PROCESSCASE_4_PROXY_LOAD;
            } else if ((initiator != null) && (initiator.equals(yacyCore.seedDB.mySeed().hash))) {
                // normal crawling
                processCase = plasmaSwitchboard.PROCESSCASE_5_LOCAL_CRAWLING;
            } else {
                // this was done for remote peer (a global crawl)
                processCase = plasmaSwitchboard.PROCESSCASE_6_GLOBAL_CRAWLING;
            }
            return processCase;
        }
        /**
         * decide upon header information if a specific file should be indexed
         * this method returns null if the answer is 'YES'!