refactoring of parsing-condensing-indexing process:

- separated parts - removed storagePeer function next step will be parallelization of processes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4600 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · 8d6a13bc07
parent d3b06913ec
commit 8d6a13bc07
4 changed files with 244 additions and 347 deletions
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@ -767,9 +767,6 @@ CRDist1Method  = 9
 CRDist1Percent = 30
 CRDist1Target  = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080

-# Hash of the peer, you would like to store to the data your installation collected.
-storagePeerHash =
-
 # Search sequence settings
 # collection:
 # time = time to get a RWI out of RAM cache, assortments and WORDS files
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -67,17 +67,17 @@ import de.anomic.plasma.parser.Parser;

 public class plasmaParserDocument {
    
-    private yacyURL source;       // the source url
-    private String mimeType;        // mimeType as taken from http header
-    private String charset;         // the charset of the document
-    private List<String> keywords;  // most resources provide a keyword field
-    private StringBuffer title;     // a document title, taken from title or h1 tag; shall appear as headline of search result
-    private StringBuffer creator;    // author or copyright
-    private List<String> sections;  // if present: more titles/headlines appearing in the document
-    private StringBuffer description;   // an abstract, if present: short content description
-    private Object text;            // the clear text, all that is visible
-    private Map<yacyURL, String> anchors;    // all links embedded as clickeable entities (anchor tags)
-    private HashMap<String, htmlFilterImageEntry> images;         // all visible pictures in document
+    private yacyURL source;               // the source url
+    private String mimeType;              // mimeType as taken from http header
+    private String charset;               // the charset of the document
+    private List<String> keywords;        // most resources provide a keyword field
+    private StringBuffer title;           // a document title, taken from title or h1 tag; shall appear as headline of search result
+    private StringBuffer creator;         // author or copyright
+    private List<String> sections;        // if present: more titles/headlines appearing in the document
+    private StringBuffer description;     // an abstract, if present: short content description
+    private Object text;                  // the clear text, all that is visible
+    private Map<yacyURL, String> anchors; // all links embedded as clickeable entities (anchor tags)
+    private HashMap<String, htmlFilterImageEntry> images; // all visible pictures in document
    // the anchors and images - Maps are URL-to-EntityDescription mappings.
    // The EntityDescription appear either as visible text in anchors or as alternative
    // text in image tags.
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -117,14 +117,11 @@ import de.anomic.data.messageBoard;
 import de.anomic.data.userDB;
 import de.anomic.data.wikiBoard;
 import de.anomic.data.wiki.wikiParser;
-import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.http.httpHeader;
 import de.anomic.http.httpRemoteProxyConfig;
 import de.anomic.http.httpc;
 import de.anomic.http.httpd;
 import de.anomic.http.httpdRobotsTxtConfig;
-import de.anomic.index.indexContainer;
-import de.anomic.index.indexRWIRowEntry;
 import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroCache;
@ -137,7 +134,6 @@ import de.anomic.plasma.crawler.plasmaCrawlQueues;
 import de.anomic.plasma.crawler.plasmaProtocolLoader;
 import de.anomic.plasma.dbImport.dbImportManager;
 import de.anomic.plasma.parser.ParserException;
-import de.anomic.plasma.plasmaCondenser.wordStatProp;
 import de.anomic.plasma.urlPattern.defaultURLPattern;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverAbstractSwitch;
@ -175,9 +171,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
    // 5) local prefetch/crawling (initiator is own seedHash)
    // 6) local fetching for global crawling (other known or unknown initiator)
    public static final int PROCESSCASE_0_UNKNOWN = 0;
-    public static final int PROCESSCASE_1_GLOBAL_CRAWLING = 1;
-    public static final int PROCESSCASE_2_SEARCH_QUERY_RESULT = 2;
-    public static final int PROCESSCASE_3_INDEX_TRANSFER_RESULT = 3;
    public static final int PROCESSCASE_4_PROXY_LOAD = 4;
    public static final int PROCESSCASE_5_LOCAL_CRAWLING = 5;
    public static final int PROCESSCASE_6_GLOBAL_CRAWLING = 6;
@ -651,16 +644,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
     * <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
     */
    public static final String CRAWLER_THREADS_ACTIVE_MAX       = "crawler.MaxActiveThreads";
-    
+
    public static final String OWN_SEED_FILE                    = "yacyOwnSeedFile";
-    /**
-     * <p><code>public static final String <strong>STORAGE_PEER_HASH</strong> = "storagePeerHash"</code></p>
-     * <p>Name of the setting holding the Peer-Hash where indexes shall be transferred after indexing a webpage. If this setting
-     * is empty, the Storage Peer function is disabled</p>
-     */
-    public static final String STORAGE_PEER_HASH                = "storagePeerHash";
    public static final String YACY_MODE_DEBUG                  = "yacyDebugMode";
-    
    public static final String WORDCACHE_INIT_COUNT             = "wordCacheInitCount";
    /**
     * <p><code>public static final String <strong>WORDCACHE_MAX_COUNT</strong> = "wordCacheMaxCount"</code></p>
@ -1825,15 +1811,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            }

            // generate a dht chunk
-            if (
-                    (dhtShallTransfer() == null) &&
-                    (
-                            (this.dhtTransferChunk == null) ||
-                            (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_UNDEFINED) ||
-                            // (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE) ||
-                            (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_FAILED)
-                    )
-            ) {
+            if ((dhtShallTransfer() == null) && (
+                    (this.dhtTransferChunk == null) ||
+                    (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_UNDEFINED) ||
+                    // (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE) ||
+                    (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_FAILED)
+               )) {
                // generate new chunk
                int minChunkSize = (int) getConfigLong(INDEX_DIST_CHUNK_SIZE_MIN, 30);
                dhtTransferChunk = new plasmaDHTChunk(this.log, wordIndex, minChunkSize, dhtTransferIndexCount, 5000);
@ -1867,7 +1850,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            }

            // parse and index the resource
-            processResourceStack(nextentry);
+            plasmaParserDocument document = parseDocument(nextentry);
+            if (document != null) {
+                plasmaCondenser condensement = condenseDocument(nextentry, document);
+                if (condensement != null) {
+                    indexDocument(nextentry, document, condensement);
+                }
+            }
            return true;
        } catch (InterruptedException e) {
            log.logInfo("DEQUEUE: Shutdown detected.");
@ -2078,7 +2067,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
        }
    }
    
-    private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException {
+    private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry) throws InterruptedException, ParserException {
        
        // the mimetype of this entry
        String mimeType = entry.getMimeType();
@ -2097,71 +2086,43 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
        return doc;
    }
    
-    private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
+    private plasmaParserDocument parseDocument(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
        plasmaParserDocument document = null;
        try {
-            // work off one stack entry with a fresh resource
            long stackStartTime = 0, stackEndTime = 0,
-            parsingStartTime = 0, parsingEndTime = 0,
-            indexingStartTime = 0, indexingEndTime = 0,
-            storageStartTime = 0, storageEndTime = 0;
-            
-            // we must distinguish the following cases: resource-load was initiated by
-            // 1) global crawling: the index is extern, not here (not possible here)
-            // 2) result of search queries, some indexes are here (not possible here)
-            // 3) result of index transfer, some of them are here (not possible here)
-            // 4) proxy-load (initiator is "------------")
-            // 5) local prefetch/crawling (initiator is own seedHash)
-            // 6) local fetching for global crawling (other known or unknwon initiator)
-            int processCase = PROCESSCASE_0_UNKNOWN;
-            yacySeed initiatorPeer = null;
-            String initiatorPeerHash = (entry.proxy()) ? yacyURL.dummyHash : entry.initiator();
-            if (initiatorPeerHash.equals(yacyURL.dummyHash)) {
-                // proxy-load
-                processCase = PROCESSCASE_4_PROXY_LOAD;
-            } else if (initiatorPeerHash.equals(yacyCore.seedDB.mySeed().hash)) {
-                // normal crawling
-                processCase = PROCESSCASE_5_LOCAL_CRAWLING;
-            } else {
-                // this was done for remote peer (a global crawl)
-                initiatorPeer = yacyCore.seedDB.getConnected(initiatorPeerHash);
-                processCase = PROCESSCASE_6_GLOBAL_CRAWLING;
-            }
+            parsingStartTime = 0, parsingEndTime = 0;
+            int processCase = entry.processCase();
            
            log.logFine("processResourceStack processCase=" + processCase +
                    ", depth=" + entry.depth() +
                    ", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().generalDepth())) +
                    ", filter=" + ((entry.profile() == null) ? "null" : entry.profile().generalFilter()) +
-                    ", initiatorHash=" + initiatorPeerHash +
+                    ", initiatorHash=" + entry.initiator() +
                    //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
                    ", url=" + entry.url()); // DEBUG
            
-            /* =========================================================================
-             * PARSE CONTENT
-             * ========================================================================= */
+            // PARSE CONTENT
            parsingStartTime = System.currentTimeMillis();

            try {
-                document = this.parseResource(entry, initiatorPeerHash);
-                if (document == null) return;
+                document = this.parseResource(entry);
+                if (document == null) return null;
            } catch (ParserException e) {
                this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
-                addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new kelondroBitfield());
+                addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.anchorName(), e.getErrorCode(), new kelondroBitfield());
                if (document != null) {
                    document.close();
                    document = null;
                }
-                return;
+                return null;
            }
            
            parsingEndTime = System.currentTimeMillis();            
            
-            // getting the document date
+            // get the document date
            Date docDate = entry.getModificationDate();
            
-            /* =========================================================================
-             * put anchors on crawl stack
-             * ========================================================================= */            
+            // put anchors on crawl stack
            stackStartTime = System.currentTimeMillis();
            if (
                    ((processCase == PROCESSCASE_4_PROXY_LOAD) || (processCase == PROCESSCASE_5_LOCAL_CRAWLING)) &&
@ -2179,268 +2140,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
                    nextEntry = i.next();
                    nextUrl = nextEntry.getKey();
                    // enqueue the hyperlink into the pre-notice-url db
-                    crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
+                    crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), entry.initiator(), nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
                }
-                log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
-                        ", NEW CRAWL STACK SIZE IS " + crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
+                if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
+                        ", NEW CRAWL STACK SIZE IS " + crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
+                        ", STACKING TIME = " + (stackEndTime-stackStartTime) +
+                        ", PARSING TIME = " + (parsingEndTime-parsingStartTime));
            }
            stackEndTime = System.currentTimeMillis();
-            
-            /* =========================================================================
-             * CREATE INDEX
-             * ========================================================================= */  
-            String dc_title = document.dc_title();
-            yacyURL referrerURL = entry.referrerURL();
-
-            String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
-            if (processCase == PROCESSCASE_4_PROXY_LOAD) {
-                // proxy-load
-                noIndexReason = entry.shallIndexCacheForProxy();
-            } else {
-                // normal crawling
-                noIndexReason = entry.shallIndexCacheForCrawler();
-            }
-            
-            if (noIndexReason == null) {
-                // strip out words
-                indexingStartTime = System.currentTimeMillis();
-                
-                checkInterruption();
-                log.logFine("Condensing for '" + entry.url().toNormalform(false, true) + "'");
-                plasmaCondenser condenser = new plasmaCondenser(document, entry.profile().indexText(), entry.profile().indexMedia());
-                
-                // generate citation reference
-                Integer[] ioLinks = webStructure.generateCitationReference(entry.url(), entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
-                
-                try {        
-                    // check for interruption
-                    checkInterruption();
-                    
-                    // create a new loaded URL db entry
-                    long ldate = System.currentTimeMillis();
-                    indexURLEntry newEntry = new indexURLEntry(
-                            entry.url(),                               // URL
-                            dc_title,                            // document description
-                            document.dc_creator(),                     // author
-                            document.dc_subject(' '),                  // tags
-                            "",                                        // ETag
-                            docDate,                                   // modification date
-                            new Date(),                                // loaded date
-                            new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula 
-                            (referrerURL == null) ? null : referrerURL.hash(),                        // referer hash
-                            new byte[0],                               // md5
-                            (int) entry.size(),                        // size
-                            condenser.RESULT_NUMB_WORDS,               // word count
-                            plasmaHTCache.docType(document.dc_format()), // doctype
-                            condenser.RESULT_FLAGS,                    // flags
-                            yacyURL.language(entry.url()),             // language
-                            ioLinks[0].intValue(),                     // llocal
-                            ioLinks[1].intValue(),                     // lother
-                            document.getAudiolinks().size(),           // laudio
-                            document.getImages().size(),               // limage
-                            document.getVideolinks().size(),           // lvideo
-                            document.getApplinks().size()              // lapp
-                    );
-                    /* ========================================================================
-                     * STORE URL TO LOADED-URL-DB
-                     * ======================================================================== */
-                    wordIndex.loadedURL.store(newEntry);
-                    wordIndex.loadedURL.stack(
-                            newEntry,                       // loaded url db entry
-                            initiatorPeerHash,              // initiator peer hash
-                            yacyCore.seedDB.mySeed().hash,    // executor peer hash
-                            processCase                     // process case
-                    );                    
-                    
-                    // check for interruption
-                    checkInterruption();
-                    
-                    /* ========================================================================
-                     * STORE WORD INDEX
-                     * ======================================================================== */
-                    if (
-                            (
-                                    (processCase == PROCESSCASE_4_PROXY_LOAD) || 
-                                    (processCase == PROCESSCASE_5_LOCAL_CRAWLING) || 
-                                    (processCase == PROCESSCASE_6_GLOBAL_CRAWLING)
-                            ) && 
-                            ((entry.profile().indexText()) || (entry.profile().indexMedia()))
-                    ) {
-                        String urlHash = newEntry.hash();
-                        
-                        // remove stopwords                        
-                        log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
-                        indexingEndTime = System.currentTimeMillis();
-                        
-                        storageStartTime = System.currentTimeMillis();
-                        int words = 0;
-                        String storagePeerHash;
-                        yacySeed seed;
-                                                
-                        if (
-                                ((storagePeerHash = getConfig(STORAGE_PEER_HASH, null)) == null) ||
-                                (storagePeerHash.trim().length() == 0) ||
-                                ((seed = yacyCore.seedDB.getConnected(storagePeerHash)) == null)
-                        ){
-                            
-                            /* ========================================================================
-                             * STORE PAGE INDEX INTO WORD INDEX DB
-                             * ======================================================================== */
-                            words = wordIndex.addPageIndex(
-                                    entry.url(),                                  // document url
-                                    docDate,                                      // document mod date
-                                    (int) entry.size(),                           // document size
-                                    document,                                     // document content
-                                    condenser,                                    // document condenser
-                                    yacyURL.language(entry.url()),                // document language
-                                    plasmaHTCache.docType(document.dc_format()),// document type
-                                    ioLinks[0].intValue(),                        // outlinkSame
-                                    ioLinks[1].intValue()                         // outlinkOthers
-                            );
-                        } else {
-                            /* ========================================================================
-                             * SEND PAGE INDEX TO STORAGE PEER
-                             * ======================================================================== */                            
-                            HashMap<String, indexURLEntry> urlCache = new HashMap<String, indexURLEntry>(1);
-                            urlCache.put(newEntry.hash(), newEntry);
-                            
-                            ArrayList<indexContainer> tmpContainers = new ArrayList<indexContainer>(condenser.words().size());
-                            
-                            String language = yacyURL.language(entry.url());                            
-                            char doctype = plasmaHTCache.docType(document.dc_format());
-                            indexURLEntry.Components comp = newEntry.comp();
-                            int urlLength = comp.url().toNormalform(true, true).length();
-                            int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform(true, true)).length;
-
-                            // iterate over all words
-                            Iterator<Map.Entry<String, wordStatProp>> i = condenser.words().entrySet().iterator();
-                            Map.Entry<String, wordStatProp> wentry;
-                            plasmaCondenser.wordStatProp wordStat;
-                            while (i.hasNext()) {
-                                wentry = i.next();
-                                String word = wentry.getKey();
-                                wordStat = wentry.getValue();
-                                String wordHash = plasmaCondenser.word2hash(word);
-                                indexRWIRowEntry wordIdxEntry = new indexRWIRowEntry(
-                                            urlHash,
-                                            urlLength, urlComps,
-                                            wordStat.count,
-                                            document.dc_title().length(),
-                                            condenser.words().size(),
-                                            condenser.sentences().size(),
-                                            wordStat.posInText,
-                                            wordStat.posInPhrase,
-                                            wordStat.numOfPhrase,
-                                            docDate.getTime(),
-                                            System.currentTimeMillis(),
-                                            language,
-                                            doctype,
-                                            ioLinks[0].intValue(),
-                                            ioLinks[1].intValue(),
-                                            condenser.RESULT_FLAGS
-                                        );
-                                indexContainer wordIdxContainer = plasmaWordIndex.emptyContainer(wordHash, 1);
-                                wordIdxContainer.add(wordIdxEntry);
-                                tmpContainers.add(wordIdxContainer);
-                            }
-                            //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
-                            words = condenser.words().size();
-                            
-                            // transfering the index to the storage peer
-                            indexContainer[] indexData = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]);
-                            HashMap<String, Object> resultObj = yacyClient.transferIndex(
-                                    seed,       // target seed
-                                    indexData,  // word index data
-                                    urlCache,   // urls
-                                    true,       // gzip body
-                                    120000      // transfer timeout 
-                            );
-                            
-                            // check for interruption
-                            checkInterruption();
-                            
-                            // if the transfer failed we try to store the index locally
-                            String error = (String) resultObj.get("result");
-                            if (error != null) {
-                                words = wordIndex.addPageIndex(
-                                        entry.url(),
-                                        docDate, 
-                                        (int) entry.size(),
-                                        document, 
-                                        condenser,
-                                        yacyURL.language(entry.url()),
-                                        plasmaHTCache.docType(document.dc_format()),
-                                        ioLinks[0].intValue(), 
-                                        ioLinks[1].intValue()
-                                );
-                            }
-                            
-                            tmpContainers = null;
-                        } //end: SEND PAGE INDEX TO STORAGE PEER
-                        
-                        storageEndTime = System.currentTimeMillis();
-                        
-                        //increment number of indexed urls
-                		indexedPages++;
-                        
-                        if (log.isInfo()) {
-                            // TODO: UTF-8 docDescription seems not to be displayed correctly because
-                            // of string concatenation
-                            log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
-                                    " [" + entry.urlHash() + "]" +
-                                    "\n\tDescription:  " + dc_title +
-                                    "\n\tMimeType: "  + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
-                                    "Size: " + document.getTextLength() + " bytes | " +
-                                    "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
-                                    "\n\tStackingTime:  " + (stackEndTime-stackStartTime) + " ms | " +
-                                    "ParsingTime:  " + (parsingEndTime-parsingStartTime) + " ms | " +
-                                    "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
-                                    "StorageTime: " + (storageEndTime-storageStartTime) + " ms");
-                        }
-
-                        // update profiling info
-                        plasmaProfiling.updateIndexedPage(entry);
-                        
-                        // check for interruption
-                        checkInterruption();
-                        
-                        // if this was performed for a remote crawl request, notify requester
-                        if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
-                            log.logInfo("Sending crawl receipt for '" + entry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
-                            if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
-                            yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
-                        }
-                    } else {
-                        log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
-                        addURLtoErrorDB(entry.url(), referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
-                    }
-                } catch (Exception ee) {
-                    if (ee instanceof InterruptedException) throw (InterruptedException)ee;
-                    
-                    // check for interruption
-                    checkInterruption();
-                    
-                    log.logSevere("Could not index URL " + entry.url() + ": " + ee.getMessage(), ee);
-                    if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
-                        if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
-                        yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
-                    }
-                    addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new kelondroBitfield());
-                }
-                
-            } else {
-                // check for interruption
-                checkInterruption();
-                
-                log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
-                addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, noIndexReason, new kelondroBitfield());
-                if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
-                    if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
-                    yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
-                }
-            }
-            document.close();
-            document = null;
        } catch (Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException)e;
            this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
@ -2456,14 +2163,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            synchronized (this.indexingTasksInProcess) {
                this.indexingTasksInProcess.remove(entry.urlHash());
            }
-
-            // removing current entry from notice URL queue
-            /*
-            boolean removed = noticeURL.remove(entry.urlHash()); // worked-off
-            if (!removed) {
-                log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
-            }
-            */
            
            // explicit delete/free resources
            if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
@ -2472,7 +2171,173 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            }
            entry = null;
            
-            if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ }
+            if (document != null) try { document.close(); } catch (Exception e) {}
+        }
+        return document;
+    }
+    
+    private plasmaCondenser condenseDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document) throws InterruptedException {
+        // CREATE INDEX
+        String dc_title = document.dc_title();
+        yacyURL referrerURL = entry.referrerURL();
+        int processCase = entry.processCase();
+        
+        String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
+        if (processCase == PROCESSCASE_4_PROXY_LOAD) {
+            // proxy-load
+            noIndexReason = entry.shallIndexCacheForProxy();
+        } else {
+            // normal crawling
+            noIndexReason = entry.shallIndexCacheForCrawler();
+        }
+        
+        if (noIndexReason != null) {
+            // check for interruption
+            checkInterruption();
+            
+            log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
+            addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), entry.initiator(), dc_title, noIndexReason, new kelondroBitfield());
+            /*
+            if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
+                if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
+                yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
+            }
+            */
+            document.close();
+            document = null;
+            return null;
+        }
+    
+        // strip out words
+        checkInterruption();
+        log.logFine("Condensing for '" + entry.url().toNormalform(false, true) + "'");
+        plasmaCondenser condenser;
+        try {
+            condenser = new plasmaCondenser(document, entry.profile().indexText(), entry.profile().indexMedia());
+        } catch (UnsupportedEncodingException e) {
+            return null;
+        }
+        return condenser;
+    }
+    
+    private void indexDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws InterruptedException {
+        long indexingStartTime = 0, indexingEndTime = 0,
+        storageStartTime = 0, storageEndTime = 0;
+
+        // CREATE INDEX
+        String dc_title = document.dc_title();
+        yacyURL referrerURL = entry.referrerURL();
+        Date docDate = entry.getModificationDate();
+        int processCase = entry.processCase();
+
+        // generate citation reference
+        Integer[] ioLinks = webStructure.generateCitationReference(entry.url(), entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
+           
+        // check for interruption
+        checkInterruption();
+        
+        // create a new loaded URL db entry
+        long ldate = System.currentTimeMillis();
+        indexURLEntry newEntry = new indexURLEntry(
+                entry.url(),                               // URL
+                dc_title,                                  // document description
+                document.dc_creator(),                     // author
+                document.dc_subject(' '),                  // tags
+                "",                                        // ETag
+                docDate,                                   // modification date
+                new Date(),                                // loaded date
+                new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula 
+                (referrerURL == null) ? null : referrerURL.hash(),            // referer hash
+                new byte[0],                               // md5
+                (int) entry.size(),                        // size
+                condenser.RESULT_NUMB_WORDS,               // word count
+                plasmaHTCache.docType(document.dc_format()), // doctype
+                condenser.RESULT_FLAGS,                    // flags
+                yacyURL.language(entry.url()),             // language
+                ioLinks[0].intValue(),                     // llocal
+                ioLinks[1].intValue(),                     // lother
+                document.getAudiolinks().size(),           // laudio
+                document.getImages().size(),               // limage
+                document.getVideolinks().size(),           // lvideo
+                document.getApplinks().size()              // lapp
+        );
+        
+        // STORE URL TO LOADED-URL-DB
+        try {
+            wordIndex.loadedURL.store(newEntry);
+        } catch (IOException e) {
+            log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
+            addURLtoErrorDB(entry.url(), referrerURL.hash(), entry.initiator(), dc_title, "error storing url: " + e.getMessage(), new kelondroBitfield());
+            return;
+        }
+        
+        wordIndex.loadedURL.stack(
+                newEntry,                      // loaded url db entry
+                entry.initiator(),             // initiator peer hash
+                yacyCore.seedDB.mySeed().hash, // executor peer hash
+                processCase                    // process case
+        );                    
+        
+        // check for interruption
+        checkInterruption();
+        
+        // STORE WORD INDEX
+        if ((!entry.profile().indexText()) && (!entry.profile().indexMedia())) {
+            log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
+            addURLtoErrorDB(entry.url(), referrerURL.hash(), entry.initiator(), dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
+            return;
+        }
+            
+        // remove stopwords                        
+        log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
+        indexingEndTime = System.currentTimeMillis();
+        
+        storageStartTime = System.currentTimeMillis();
+        int words = 0;
+             
+        // STORE PAGE INDEX INTO WORD INDEX DB
+        words = wordIndex.addPageIndex(
+                entry.url(),                                  // document url
+                docDate,                                      // document mod date
+                (int) entry.size(),                           // document size
+                document,                                     // document content
+                condenser,                                    // document condenser
+                yacyURL.language(entry.url()),                // document language
+                plasmaHTCache.docType(document.dc_format()),// document type
+                ioLinks[0].intValue(),                        // outlinkSame
+                ioLinks[1].intValue()                         // outlinkOthers
+        );
+            
+        storageEndTime = System.currentTimeMillis();
+        
+        //increment number of indexed urls
+        indexedPages++;
+        
+        if (log.isInfo()) {
+            // TODO: UTF-8 docDescription seems not to be displayed correctly because
+            // of string concatenation
+            log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
+                    " [" + entry.urlHash() + "]" +
+                    "\n\tDescription:  " + dc_title +
+                    "\n\tMimeType: "  + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
+                    "Size: " + document.getTextLength() + " bytes | " +
+                    "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
+                    "\n\tIndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
+                    "StorageTime: " + (storageEndTime-storageStartTime) + " ms");
+        }
+
+        // update profiling info
+        plasmaProfiling.updateIndexedPage(entry);
+        
+        // check for interruption
+        checkInterruption();
+        yacySeed initiatorPeer = entry.initiatorPeer();
+        
+        // if this was performed for a remote crawl request, notify requester
+        if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
+            log.logInfo("Sending crawl receipt for '" + entry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
+            if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
+            yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
        }
    }
    
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@ -57,6 +57,8 @@ import de.anomic.kelondro.kelondroRow;
 import de.anomic.kelondro.kelondroStack;
 import de.anomic.plasma.cache.IResourceInfo;
 import de.anomic.server.logging.serverLog;
+import de.anomic.yacy.yacyCore;
+import de.anomic.yacy.yacySeed;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.yacyURL;

@ -286,6 +288,17 @@ public class plasmaSwitchboardQueue {
        public String initiator() {
            return initiator;
        }
+        
+        public yacySeed initiatorPeer() {
+            if ((initiator == null) || (initiator.equals(yacyURL.dummyHash))) return null;
+            if (initiator.equals(yacyCore.seedDB.mySeed().hash)) {
+                // normal crawling
+                return null;
+            } else {
+                // this was done for remote peer (a global crawl)
+                return yacyCore.seedDB.getConnected(initiator);
+            }
+        }

        public int depth() {
            return depth;
@ -342,6 +355,28 @@ public class plasmaSwitchboardQueue {
            return anchorName;
        }

+        public int processCase() {
+            // we must distinguish the following cases: resource-load was initiated by
+            // 1) global crawling: the index is extern, not here (not possible here)
+            // 2) result of search queries, some indexes are here (not possible here)
+            // 3) result of index transfer, some of them are here (not possible here)
+            // 4) proxy-load (initiator is "------------")
+            // 5) local prefetch/crawling (initiator is own seedHash)
+            // 6) local fetching for global crawling (other known or unknwon initiator)
+            int processCase = plasmaSwitchboard.PROCESSCASE_0_UNKNOWN;
+            if ((initiator == null) || (initiator.equals(yacyURL.dummyHash))) {
+                // proxy-load
+                processCase = plasmaSwitchboard.PROCESSCASE_4_PROXY_LOAD;
+            } else if ((initiator != null) && (initiator.equals(yacyCore.seedDB.mySeed().hash))) {
+                // normal crawling
+                processCase = plasmaSwitchboard.PROCESSCASE_5_LOCAL_CRAWLING;
+            } else {
+                // this was done for remote peer (a global crawl)
+                processCase = plasmaSwitchboard.PROCESSCASE_6_GLOBAL_CRAWLING;
+            }
+            return processCase;
+        }
+        
        /**
         * decide upon header information if a specific file should be indexed
         * this method returns null if the answer is 'YES'!