- refactoring of CrawlEntry and CrawlStacker

- introduced blocking queues in CrawlStacker to prepare it for concurrent use
- added a second busy thread for the CrawlStacker

The CrawlStacker is now multithreaded; it shall be transformed into a BlockingThread in a later step.
The added concurrency of the stacker should help in cases where DNS lookups block.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5395 6c8d7289-2bf4-0310-a012-ef5d649a1542
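The diff below removes the old hash-list/row-cache stacking and replaces it with two java.util.concurrent queues: entries whose host already resolved go into an unbounded fast queue, everything else into a bounded slow queue, and the worker drains the fast queue first. The following is a minimal standalone sketch of that dispatch idea only; class and method names are simplified assumptions, not the YaCy API.

```java
import java.net.InetAddress;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Sketch of the fast/slow dispatch used by the new CrawlStacker: DNS hits are
// stacked quickly, DNS misses are deferred so a blocking lookup cannot stall
// the whole stacker.
public class TwoQueueStackerSketch {
    private final BlockingQueue<String> fastQueue = new LinkedBlockingQueue<String>();
    private final BlockingQueue<String> slowQueue = new ArrayBlockingQueue<String>(1000);

    public void enqueue(String host) throws InterruptedException {
        if (resolves(host)) fastQueue.put(host);   // DNS hit: process soon
        else slowQueue.put(host);                  // DNS miss: may block later, defer
    }

    public boolean job() {                         // called by a busy thread
        String next = fastQueue.poll();
        if (next == null) next = slowQueue.poll();
        if (next == null) return false;            // nothing to do: thread goes idle
        process(next);
        return true;
    }

    private boolean resolves(String host) {
        try { InetAddress.getByName(host); return true; }
        catch (Exception e) { return false; }
    }

    private void process(String host) { /* robots.txt check, double-occurrence check, ... */ }
}
```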
orbiter 16 years ago
parent 6569cbbec1
commit 7535fd7447

@ -568,9 +568,12 @@ performanceSpeed=100
80_indexing_idlesleep=1000
80_indexing_busysleep=10
80_indexing_memprereq=6291456
82_crawlstack_idlesleep=5000
82_crawlstack_idlesleep=1000
82_crawlstack_busysleep=0
82_crawlstack_memprereq=1048576
83_crawlstack_idlesleep=1200
83_crawlstack_busysleep=0
83_crawlstack_memprereq=1048576
90_cleanup_idlesleep=300000
90_cleanup_busysleep=300000
90_cleanup_memprereq=0
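The config change above lowers the idle sleep of the existing 82_crawlstack thread from 5000 ms to 1000 ms and adds a second 83_crawlstack thread. The sketch below shows how these idlesleep/busysleep/memprereq values are assumed to drive each stacker thread; it is a conceptual illustration only, not the actual serverInstantBusyThread implementation.

```java
// Conceptual sketch (assumed semantics): a busy thread runs its job while memory
// allows, sleeping busysleep after productive runs and idlesleep otherwise.
public final class BusyLoopSketch {
    interface Job { boolean run(); }                         // e.g. CrawlStacker.job()

    static void loop(Job job, long idlesleep, long busysleep, long memprereq)
            throws InterruptedException {
        while (!Thread.currentThread().isInterrupted()) {
            if (Runtime.getRuntime().freeMemory() < memprereq) {
                Thread.sleep(idlesleep);                     // not enough free memory: wait
                continue;
            }
            boolean busy = job.run();                        // true = an entry was processed
            Thread.sleep(busy ? busysleep : idlesleep);      // stackers: busysleep=0, idlesleep=1000/1200 ms
        }
    }
}
```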
@ -818,12 +821,6 @@ svnRevision=0
currentSkin=default
# temporary flag for new database structure. set only true for testing
# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
# table-types: RAM = 0, TREE = 1, FLEX = 2;
# if you set this to a non-RAM value, you should increase the stacker.slots value
tableTypeForPreNURL=0
# flag to show if pages shall be usable for non-admin users
# this can be applied to the Surftips.html and yacysearch.html page
publicSurftips = true

@ -109,11 +109,7 @@ public class IndexControlRWIs_p {
if (post.containsKey("deletecomplete") && post.containsKey("confirmDelete")) {
sb.webIndex.clear();
sb.crawlQueues.clear();
try {
sb.crawlStacker.clear();
} catch (final IOException e) {
e.printStackTrace();
}
sb.crawlStacker.clear();
try {
sb.robots.clear();
} catch (final IOException e) {

@ -29,7 +29,8 @@ public class PeerLoadPicture {
final CircleThreadPiece misc = new CircleThreadPiece("Misc.", new Color(190, 50, 180));
final HashMap<String, CircleThreadPiece> pieces = new HashMap<String, CircleThreadPiece>();
pieces.put(null, idle);
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK, new CircleThreadPiece("Stacking", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK0, new CircleThreadPiece("Stacking0", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK1, new CircleThreadPiece("Stacking1", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.INDEXER, new CircleThreadPiece("Parsing/Indexing", new Color(255, 130, 0)));
pieces.put(plasmaSwitchboardConstants.INDEX_DIST, new CircleThreadPiece("DHT-Distribution", new Color(119, 136, 153)));
pieces.put(plasmaSwitchboardConstants.PEER_PING, new CircleThreadPiece("YaCy Core", new Color(255, 230, 160)));

@ -34,6 +34,7 @@ import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
@ -158,15 +159,18 @@ public class QuickCrawlLink_p {
// stack URL
String reasonString = null;
reasonString = sb.crawlStacker.stackCrawl(
crawlingStartURL,
null,
sb.webIndex.seedDB.mySeed().hash,
(title==null)?"CRAWLING-ROOT":title,
new Date(),
0,
pe
);
reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
null,
(title==null)?"CRAWLING-ROOT":title,
new Date(),
null,
pe.handle(),
0,
0,
0
));
// validate rejection reason
if (reasonString == null) {

@ -211,7 +211,18 @@ public class WatchCrawler_p {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
final String reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
url,
null,
"CRAWLING-ROOT",
new Date(),
null,
pe.handle(),
0,
0,
0
));
if (reasonString == null) {
// create a bookmark from crawl start url
@ -260,6 +271,7 @@ public class WatchCrawler_p {
"",
"",
new Date(),
null,
pe.handle(),
0,
0,
@ -338,14 +350,18 @@ public class WatchCrawler_p {
if (nexturl == null) continue;
// enqueuing the url for crawling
sb.crawlStacker.enqueueEntry(
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
nexturl,
"",
sb.webIndex.seedDB.mySeed().hash,
e.getValue(),
new Date(),
0,
profile);
new Date(),
null,
profile.handle(),
0,
0,
0
));
}
} catch (final PatternSyntaxException e) {

@ -30,6 +30,7 @@ import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import de.anomic.crawler.CrawlEntry;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
@ -76,7 +77,18 @@ public class rct_p {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
sb.crawlStacker.enqueueEntry(new CrawlEntry(
peerhash,
url,
(referrer == null) ? null : referrer.hash(),
"REMOTE-CRAWLING",
null,
loaddate,
sb.webIndex.defaultRemoteProfile.handle(),
0,
0,
0
));
} else {
env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}

@ -36,10 +36,11 @@ import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public class CrawlEntry {
public class CrawlEntry extends serverProcessorJob {
// row definition for balancer-related NURL-entries
public final static kelondroRow rowdef = new kelondroRow(
@ -80,7 +81,7 @@ public class CrawlEntry {
private int forkfactor; // sum of anchors of all ancestors
private kelondroBitfield flags;
private int handle;
private String status;
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
public static class domaccess {
@ -116,38 +117,38 @@ public class CrawlEntry {
* @param forkfactor sum of anchors of all ancestors
*/
public CrawlEntry(
final String initiator,
final yacyURL url,
final String referrerhash,
final String name,
final Date appdate,
final String profileHandle,
final int depth,
final int anchors,
final int forkfactor
final String initiator,
final yacyURL url,
final String referrerhash,
final String name,
final Date appdate,
final Date loaddate,
final String profileHandle,
final int depth,
final int anchors,
final int forkfactor
) {
// create new entry and store it into database
assert appdate != null;
assert url != null;
assert initiator != null;
assert referrerhash != null;
assert profileHandle.length() == yacySeedDB.commonHashLength : profileHandle + " != " + yacySeedDB.commonHashLength;
this.initiator = initiator;
this.url = url;
this.refhash = referrerhash;
this.refhash = (referrerhash == null) ? "" : referrerhash;
this.name = (name == null) ? "" : name;
this.appdate = (appdate == null) ? 0 : appdate.getTime();
this.loaddate = (loaddate == null) ? 0 : loaddate.getTime();
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new kelondroBitfield(rowdef.width(10));
this.handle = 0;
this.loaddate = 0;
this.serverdate = 0;
this.imsdate = 0;
this.status = "loaded(args)";
this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode();
this.status = serverProcessorJob.STATUS_INITIATED;
}
public CrawlEntry(final kelondroRow.Entry entry) throws IOException {
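For reference, this is the calling convention of the new 10-argument constructor as distilled from the call sites updated in this diff (it is an excerpt, not standalone code; sb, crawlingStartURL and pe are the surrounding servlet variables). The loaddate argument is new, and both the referrer hash and the loaddate may now be null.

```java
// Call pattern taken from the updated call sites in this commit:
CrawlEntry e = new CrawlEntry(
        sb.webIndex.seedDB.mySeed().hash,  // initiator peer hash
        crawlingStartURL,                  // url
        null,                              // referrer hash (normalized to "" internally)
        "CRAWLING-ROOT",                   // name / anchor text
        new Date(),                        // appdate
        null,                              // loaddate (new parameter)
        pe.handle(),                       // profile handle, must not be null
        0, 0, 0);                          // depth, anchors, forkfactor
String rejectReason = sb.crawlStacker.stackCrawl(e);  // null means the URL was accepted
```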
@ -172,7 +173,7 @@ public class CrawlEntry {
this.loaddate = entry.getColLong(12);
this.serverdate = entry.getColLong(13);
this.imsdate = entry.getColLong(14);
this.status = "loaded(kelondroRow.Entry)";
this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = url.hashCode();
return;
}
@ -182,12 +183,13 @@ public class CrawlEntry {
return this.initialHash;
}
public void setStatus(final String s) {
this.status = s;
public void setStatus(final String s, int code) {
this.statusMessage = s;
this.status = code;
}
public String getStatus() {
return this.status;
return this.statusMessage;
}
private static String normalizeHandle(final int h) {
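The single status String is split here into a human-readable statusMessage plus a numeric serverProcessorJob state code. Collected from the CrawlQueues hunks further down, the transitions look like this (excerpt, not standalone code):

```java
// Status transitions of a crawl worker: free-text message for display,
// numeric code mapping onto the serverProcessorJob lifecycle.
entry.setStatus("worker-initialized",    serverProcessorJob.STATUS_INITIATED);
entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
entry.setStatus("worker-loading",        serverProcessorJob.STATUS_RUNNING);
entry.setStatus("worker-processed",      serverProcessorJob.STATUS_FINISHED);
```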

@ -42,6 +42,7 @@ import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.server.serverDate;
import de.anomic.server.serverProcessorJob;
import de.anomic.server.logging.serverLog;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
@ -397,7 +398,18 @@ public class CrawlQueues {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
sb.crawlStacker.enqueueEntry(new CrawlEntry(
hash,
url,
(referrer == null) ? null : referrer.hash(),
item.getDescription(),
null,
loaddate,
sb.webIndex.defaultRemoteProfile.handle(),
0,
0,
0
));
} else {
log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}
@ -474,6 +486,7 @@ public class CrawlQueues {
"",
"",
new Date(),
new Date(),
(forText) ?
((global) ?
sb.webIndex.defaultTextSnippetGlobalProfile.handle() :
@ -500,7 +513,7 @@ public class CrawlQueues {
public crawlWorker(final CrawlEntry entry) {
this.entry = entry;
this.entry.setStatus("worker-initialized");
this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
if (!workers.containsKey(code)) {
workers.put(code, this);
@ -511,7 +524,7 @@ public class CrawlQueues {
public void run() {
try {
// checking robots.txt for http(s) resources
this.entry.setStatus("worker-checkingrobots");
this.entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https")) && sb.robots.isDisallowed(entry.url())) {
if (log.isFine()) log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
final ZURL.Entry eentry = errorURL.newEntry(
@ -524,7 +537,7 @@ public class CrawlQueues {
errorURL.push(eentry);
} else {
// starting a load from the internet
this.entry.setStatus("worker-loading");
this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
final String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER);
if (result != null) {
final ZURL.Entry eentry = errorURL.newEntry(
@ -536,7 +549,7 @@ public class CrawlQueues {
eentry.store();
errorURL.push(eentry);
} else {
this.entry.setStatus("worker-processed");
this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
}
}
} catch (final Exception e) {
@ -551,7 +564,7 @@ public class CrawlQueues {
e.printStackTrace();
} finally {
workers.remove(code);
this.entry.setStatus("worker-finalized");
this.entry.setStatus("worker-finalized", serverProcessorJob.STATUS_FINISHED);
}
}

@ -28,17 +28,15 @@
package de.anomic.crawler;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.index.indexURLReference;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDomains;
@ -49,13 +47,11 @@ public final class CrawlStacker {
final serverLog log = new serverLog("STACKCRAWL");
private final LinkedList<String> urlEntryHashCache; // the order how this queue is processed; entries with known DNS entries go first
private kelondroIndex urlEntryCache; // the entries in the queue
private long dnsHit, dnsMiss;
private int alternateCount;
private CrawlQueues nextQueue;
private plasmaWordIndex wordIndex;
private boolean acceptLocalURLs, acceptGlobalURLs;
private BlockingQueue<CrawlEntry> fastQueue, slowQueue;
private long dnsHit, dnsMiss;
private CrawlQueues nextQueue;
private plasmaWordIndex wordIndex;
private boolean acceptLocalURLs, acceptGlobalURLs;
// objects for the prefetch task
private final ArrayList<String> dnsfetchHosts = new ArrayList<String>();
@ -68,26 +64,21 @@ public final class CrawlStacker {
this.wordIndex = wordIndex;
this.dnsHit = 0;
this.dnsMiss = 0;
this.alternateCount = 0;
this.acceptLocalURLs = acceptLocalURLs;
this.acceptGlobalURLs = acceptGlobalURLs;
// init the message list
this.urlEntryHashCache = new LinkedList<String>();
this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
this.fastQueue = new LinkedBlockingQueue<CrawlEntry>();
this.slowQueue = new ArrayBlockingQueue<CrawlEntry>(1000);
this.log.logInfo("STACKCRAWL thread initialized.");
}
public int size() {
synchronized (this.urlEntryHashCache) {
return this.urlEntryHashCache.size();
}
return this.fastQueue.size() + this.slowQueue.size();
}
public void clear() throws IOException {
this.urlEntryHashCache.clear();
this.urlEntryCache.clear();
public void clear() {
this.fastQueue.clear();
this.slowQueue.clear();
}
public void close() {
@ -98,11 +89,7 @@ public final class CrawlStacker {
this.log.logInfo("Shutdown. Closing stackCrawl queue.");
// closing the db
this.urlEntryCache.close();
// clearing the hash list
this.urlEntryHashCache.clear();
clear();
}
private boolean prefetchHost(final String host) {
@ -121,41 +108,17 @@ public final class CrawlStacker {
}
public boolean job() {
if (this.fastQueue.size() > 0 && job(this.fastQueue)) return true;
if (this.slowQueue.size() == 0) return false;
return job(this.slowQueue);
}
private boolean job(BlockingQueue<CrawlEntry> queue) {
// this is the method that is called by the busy thread from outside
if (this.urlEntryHashCache.size() == 0) return false;
if (queue.size() == 0) return false;
// get the next entry from the queue
String urlHash = null;
kelondroRow.Entry ec = null;
synchronized (this.urlEntryHashCache) {
urlHash = this.urlEntryHashCache.removeFirst();
if (urlHash == null) {
urlEntryHashCache.clear();
try {
urlEntryCache.clear();
} catch (IOException e) {
e.printStackTrace();
}
return false;
}
try {
ec = this.urlEntryCache.remove(urlHash.getBytes());
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
if (urlHash == null || ec == null) return false;
// make a crawl Entry out of it
CrawlEntry entry = null;
try {
entry = new CrawlEntry(ec);
} catch (IOException e1) {
e1.printStackTrace();
return false;
}
CrawlEntry entry = queue.poll();
if (entry == null) return false;
try {
@ -173,95 +136,30 @@ public final class CrawlStacker {
}
return true;
}
public String stackCrawl(
final yacyURL url,
final String referrerhash,
final String initiatorHash,
final String name,
final Date loadDate,
final int currentdepth,
final CrawlProfile.entry profile) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
// add the url into the crawling queue
final CrawlEntry entry = new CrawlEntry(
initiatorHash, // initiator, needed for p2p-feedback
url, // url clear text string
(referrerhash == null) ? "" : referrerhash, // last url in crawling queue
name, // load date
loadDate, // the anchor name
(profile == null) ? null : profile.handle(), // profile must not be null!
currentdepth, // depth so far
0, // anchors, default value
0 // forkfactor, default value
);
return stackCrawl(entry);
}
public void enqueueEntry(
final yacyURL nexturl,
final String referrerhash,
final String initiatorHash,
final String name,
final Date loadDate,
final int currentdepth,
final CrawlProfile.entry profile) {
if (profile == null) return;
public void enqueueEntry(final CrawlEntry entry) {
// DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE "+ nexturl +", referer="+referrerhash +", initiator="+initiatorHash +", name="+name +", load="+loadDate +", depth="+currentdepth);
// check first before we create a big object
if (this.urlEntryCache.has(nexturl.hash().getBytes())) return;
if (log.isFinest()) log.logFinest("ENQUEUE "+ entry.url() +", referer="+entry.referrerhash() +", initiator="+entry.initiator() +", name="+entry.name() +", load="+entry.loaddate() +", depth="+entry.depth());
// now create the big object before we enter the synchronized block
final CrawlEntry newEntry = new CrawlEntry(
initiatorHash,
nexturl,
referrerhash,
name,
loadDate,
profile.handle(),
currentdepth,
0,
0
);
if (newEntry == null) return;
final kelondroRow.Entry newEntryRow = newEntry.toRow();
synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue;
if (prefetchHost(entry.url().getHost())) {
try {
oldValue = this.urlEntryCache.put(newEntryRow);
} catch (final IOException e) {
oldValue = null;
}
if (oldValue == null) {
//System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
if (prefetchHost(nexturl.getHost())) {
this.alternateCount++;
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.dnsHit++;
} else {
if ((this.dnsMiss > 0) && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) {
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.alternateCount = 0;
//System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
} else {
this.urlEntryHashCache.addLast(newEntry.url().hash());
}
this.dnsMiss++;
}
this.fastQueue.put(entry);
this.dnsHit++;
} catch (InterruptedException e) {
e.printStackTrace();
}
} else {
try {
this.slowQueue.put(entry);
this.dnsMiss++;
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
private String stackCrawl(final CrawlEntry entry) {
public String stackCrawl(final CrawlEntry entry) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.index.indexDocumentMetadata;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverProcessorJob;
import de.anomic.server.logging.serverLog;
public final class ProtocolLoader {
@ -111,14 +112,15 @@ public final class ProtocolLoader {
// returns null if everything went fine, a fail reason string if a problem occurred
indexDocumentMetadata h;
try {
entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
h = load(entry, parserMode);
assert h != null;
entry.setStatus("loaded");
entry.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
final boolean stored = sb.htEntryStoreProcess(h);
entry.setStatus("stored-" + ((stored) ? "ok" : "fail"));
entry.setStatus("stored-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
return (stored) ? null : "not stored";
} catch (IOException e) {
entry.setStatus("error");
entry.setStatus("error", serverProcessorJob.STATUS_FINISHED);
log.logWarning("problem loading " + entry.url().toString());
return "load error - " + e.getMessage();
}

@ -41,7 +41,6 @@ import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.ZURL;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpRequestHeader;
@ -272,42 +271,20 @@ public class SitemapParser extends DefaultHandler {
}
// URL needs to crawled
String error = null;
error = this.sb.crawlStacker.stackCrawl(url,
null, // this.siteMapURL.toString(),
this.sb.webIndex.seedDB.mySeed().hash, this.nextURL, new Date(),
0, this.crawlingProfile);
if (error != null) {
try {
this.logger.logInfo("The URL '" + this.nextURL + "' can not be crawled. Reason: " + error);
// insert URL into the error DB
final ZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
new yacyURL(this.nextURL, null),
"",
"",
new Date(),
null,
0,
0,
0),
this.sb.webIndex.seedDB.mySeed().hash,
new Date(),
1,
error);
ee.store();
this.sb.crawlQueues.errorURL.push(ee);
} catch (final MalformedURLException e) {/* ignore this */
}
} else {
this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
// count successfully added URLs
this.urlCounter++;
}
this.sb.crawlStacker.enqueueEntry(new CrawlEntry(
this.sb.webIndex.seedDB.mySeed().hash,
url,
null, // this.siteMapURL.toString(),
this.nextURL,
new Date(),
null,
this.crawlingProfile.handle(),
0,
0,
0
));
this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
this.urlCounter++;
}
}

@ -62,7 +62,6 @@ import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.ZURL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.index.indexWord;
@ -259,49 +258,37 @@ public class bookmarksDB {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = sb.crawlStacker.stackCrawl(crawlingStartURL, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
// serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", "Automatic ReCrawl!");
sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
}
} else {
serverLog.logInfo("BOOKMARKS", "autoReCrawl - error adding crawl profile: " + crawlingStart + "- " + reasonString);
ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
"",
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.webIndex.seedDB.mySeed().hash,
new Date(),
1,
reasonString);
ee.store();
sb.crawlQueues.errorURL.push(ee);
}
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
null,
"CRAWLING-ROOT",
new Date(),
null,
pe.handle(),
0,
0,
0
));
serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
// serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", "Automatic ReCrawl!");
sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
}
} catch (MalformedURLException e1) {}
} // if
} // while(bit.hasNext())

@ -462,6 +462,7 @@ public class indexURLReference {
comp().url(),
referrerHash(),
comp().dc_title(),
null,
loaddate(),
null,
0,

@ -578,8 +578,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
deployThread(plasmaSwitchboardConstants.CLEANUP, "Cleanup", "simple cleaning process for monitoring information", null,
new serverInstantBusyThread(this, plasmaSwitchboardConstants.CLEANUP_METHOD_START, plasmaSwitchboardConstants.CLEANUP_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CLEANUP_METHOD_FREEMEM), 600000); // all 5 Minutes, wait 10 minutes until first run
deployThread(plasmaSwitchboardConstants.CRAWLSTACK, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
deployThread(plasmaSwitchboardConstants.CRAWLSTACK0, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
new serverInstantBusyThread(crawlStacker, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_START, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_FREEMEM), 8000);
deployThread(plasmaSwitchboardConstants.CRAWLSTACK1, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
new serverInstantBusyThread(crawlStacker, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_START, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_FREEMEM), 8000);
deployThread(plasmaSwitchboardConstants.INDEXER, "Indexing", "thread that either initiates a parsing/indexing queue, distributes the index into the DHT, stores parsed documents or flushes the index cache", "/IndexCreateIndexingQueue_p.html",
new serverInstantBusyThread(this, plasmaSwitchboardConstants.INDEXER_METHOD_START, plasmaSwitchboardConstants.INDEXER_METHOD_JOBCOUNT, plasmaSwitchboardConstants.INDEXER_METHOD_FREEMEM), 10000);
deployThread(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL, "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer", null,
@ -716,6 +718,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
synchronized (this.webIndex) {
this.webIndex.close();
}
// TODO: restart CrawlStacker
setConfig("network.unit.definition", networkDefinition);
overwriteNetworkDefinition();
final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
@ -1557,7 +1560,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
nextEntry = i.next();
nextUrl = nextEntry.getKey();
// enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), entry.initiator(), nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
crawlStacker.enqueueEntry(new CrawlEntry(
entry.initiator(),
nextUrl,
entry.urlHash(),
nextEntry.getValue(),
null,
docDate,
entry.profile().handle(),
entry.depth() + 1,
0,
0
));
}
final long stackEndTime = System.currentTimeMillis();
if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
@ -2049,7 +2063,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
url,
referrerHash,
(name == null) ? "" : name,
new Date(),
new Date(),
null,
null,
0,
0,

@ -140,7 +140,8 @@ public final class plasmaSwitchboardConstants {
* <p><code>public static final String <strong>CRAWLSTACK</strong> = "82_crawlstack"</code></p>
* <p>Name of the crawl stacker thread, performing several checks on new URLs to crawl, i.e. double-check</p>
*/
public static final String CRAWLSTACK = "82_crawlstack";
public static final String CRAWLSTACK0 = "82_crawlstack";
public static final String CRAWLSTACK1 = "83_crawlstack";
public static final String CRAWLSTACK_METHOD_START = "job";
public static final String CRAWLSTACK_METHOD_JOBCOUNT = "size";
public static final String CRAWLSTACK_METHOD_FREEMEM = null;

@ -389,6 +389,7 @@ public class serverDomains {
public static final int TLD_NorthAmericaOceania_ID = 4; // english-speaking countries
public static final int TLD_Africa_ID = 5; // africa
public static final int TLD_Generic_ID = 6; // anything else, also raw ip numbers
public static final int TLD_Local_ID = 7; // a local address
static {
// assign TLD-ids and names
@ -552,7 +553,7 @@ public class serverDomains {
}
final Integer i = TLDID.get(tld);
if (i == null) {
return (isLocal(host)) ? 7 : TLD_Generic_ID;
return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID;
}
return i.intValue();
}

@ -76,7 +76,7 @@ public class serverInstantBlockingThread<J extends serverProcessorJob> extends s
@SuppressWarnings("unchecked")
public J job(final J next) throws Exception {
if (next == null) return null; // poison pill: shutdown
if (next == null || next == serverProcessorJob.poisonPill) return null; // poison pill: shutdown
instantThreadCounter++;
//System.out.println("started job " + this.handle + ": " + this.getName());
jobs.put(this.handle, this.getName());
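The change above lets job() recognize serverProcessorJob.poisonPill in addition to null as a shutdown marker. This is the standard poison-pill pattern for terminating a consumer of a BlockingQueue; the sketch below illustrates the pattern generically and does not use YaCy's classes.

```java
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Generic poison-pill shutdown: the consumer loops on take() and stops when it
// receives the sentinel object instead of a real job.
public final class PoisonPillSketch {
    static final Object POISON = new Object();

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<Object> queue = new LinkedBlockingQueue<Object>();
        Thread consumer = new Thread(new Runnable() {
            public void run() {
                try {
                    while (true) {
                        Object job = queue.take();
                        if (job == POISON) return;   // shutdown marker: leave the loop
                        System.out.println("processing " + job);
                    }
                } catch (InterruptedException e) { /* interrupted: shut down */ }
            }
        });
        consumer.start();
        queue.put("job-1");
        queue.put(POISON);                           // tells the consumer to terminate
        consumer.join();
    }
}
```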

@ -32,7 +32,7 @@ public class serverProcessorJob {
public final static int STATUS_FINISHED = 3;
public final static int STATUS_POISON = 99;
public int status = 0;
public int status = STATUS_INITIATED;
public serverProcessorJob() {
this.status = STATUS_INITIATED;

@ -7,6 +7,7 @@ import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.userDB;
import de.anomic.http.HttpClient;
@ -195,15 +196,18 @@ public class urlRedirectord implements serverHandler, Cloneable {
sb.crawlQueues.errorURL.remove(urlhash);
// enqueuing URL for crawling
sb.crawlStacker.enqueueEntry(
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
reqURL,
null,
sb.webIndex.seedDB.mySeed().hash,
"URL Redirector",
new Date(),
null,
profile.handle(),
0,
profile
);
0,
0
));
} else {
reasonString = "Unsupporte file extension";
}
