@@ -28,151 +28,73 @@
package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.index.indexURLReference;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroEcoTable;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.kelondro.kelondroTree;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDomains;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
public final class CrawlStacker extends Thread {
    private static final int EcoFSBufferSize = 20;
    private static String stackfile = "urlNoticeStacker9.db";
    // keys for different database types
    public static final int QUEUE_DB_TYPE_RAM = 0;
    public static final int QUEUE_DB_TYPE_TREE = 1;
    public static final int QUEUE_DB_TYPE_ECO = 2;
public final class CrawlStacker {
    final serverLog log = new serverLog("STACKCRAWL");
    private final plasmaSwitchboard sb;
    private final LinkedList<String> urlEntryHashCache;
    private kelondroIndex urlEntryCache;
    private final File cacheStacksPath;
    private final int dbtype;
    private final boolean prequeue;
    private long dnsHit, dnsMiss;
    private int alternateCount;
    private final LinkedList<String> urlEntryHashCache; // the order in which this queue is processed; entries with known DNS entries go first
    private kelondroIndex urlEntryCache;                // the entries in the queue
    private long dnsHit, dnsMiss;
    private int alternateCount;
    private CrawlQueues nextQueue;
    private plasmaWordIndex wordIndex;
    private boolean acceptLocalURLs, acceptGlobalURLs;
    // objects for the prefetch task
    private final ArrayList<String> dnsfetchHosts = new ArrayList<String>();
    public CrawlStacker(final plasmaSwitchboard sb, final File dbPath, final int dbtype, final boolean prequeue) {
        this.sb = sb;
        this.prequeue = prequeue;
    // this is the process that checks urls for double-occurrences and for allowance/disallowance by robots.txt
    public CrawlStacker(CrawlQueues cq, plasmaWordIndex wordIndex, boolean acceptLocalURLs, boolean acceptGlobalURLs) {
        this.nextQueue = cq;
        this.wordIndex = wordIndex;
        this.dnsHit = 0;
        this.dnsMiss = 0;
        this.alternateCount = 0;
        this.acceptLocalURLs = acceptLocalURLs;
        this.acceptGlobalURLs = acceptGlobalURLs;
        // init the message list
        this.urlEntryHashCache = new LinkedList<String>();
        // create a stack for newly entered entries
        this.cacheStacksPath = dbPath;
        this.dbtype = dbtype;
        openDB();
        try {
            // loop through the list and fill the messageList with url hashes
            final Iterator<kelondroRow.Entry> rows = this.urlEntryCache.rows(true, null);
            kelondroRow.Entry entry;
            while (rows.hasNext()) {
                entry = rows.next();
                if (entry == null) {
                    System.out.println("ERROR! null element found");
                    continue;
                }
                this.urlEntryHashCache.add(entry.getColString(0, null));
            }
        } catch (final kelondroException e) {
            /* if we have an error, we start with a fresh database */
            CrawlStacker.this.log.logSevere("Unable to initialize crawl stacker queue, kelondroException:" + e.getMessage() + ". Resetting DB.\n", e);
            // deleting old db and creating a new db
            try { this.urlEntryCache.close(); } catch (final Exception ex) { }
            deleteDB();
            openDB();
        } catch (final IOException e) {
            /* if we have an error, we start with a fresh database */
            CrawlStacker.this.log.logSevere("Unable to initialize crawl stacker queue, IOException:" + e.getMessage() + ". Resetting DB.\n", e);
            // deleting old db and creating a new db
            try { this.urlEntryCache.close(); } catch (final Exception ex) { }
            deleteDB();
            openDB();
        }
        this.log.logInfo(size() + " entries in the stackCrawl queue.");
        this.start(); // start the prefetcher thread
        this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
        this.log.logInfo("STACKCRAWL thread initialized.");
    }
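    // A minimal usage sketch for the new constructor, assuming a CrawlQueues instance `queues`
    // and a plasmaWordIndex `index` are already available (both names are illustrative):
    //
    //     CrawlStacker stacker = new CrawlStacker(queues, index, true, true);
    //     while (stacker.size() > 0 && stacker.job()) { } // drain the queue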
    public void run() {
        String nextHost;
        try {
            while (!Thread.currentThread().isInterrupted()) { // action loop
                if (dnsfetchHosts.size() == 0) synchronized (this) { wait(); }
                synchronized (dnsfetchHosts) {
                    nextHost = dnsfetchHosts.remove(dnsfetchHosts.size() - 1);
                }
                try {
                    serverDomains.dnsResolve(nextHost);
                } catch (final Exception e) { }
            }
        } catch (final InterruptedException e) { }
    }
    public boolean prefetchHost(final String host) {
        // returns true when the host was known in the dns cache.
        // If not, the host is stacked on the fetch stack and false is returned
        try {
            serverDomains.dnsResolveFromCache(host);
            return true;
        } catch (final UnknownHostException e) {
            synchronized (this) {
                dnsfetchHosts.add(host);
                notifyAll();
            }
            return false;
        }
    }
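    // Sketch of the DNS prefetch handshake: prefetchHost() is the producer (it parks hosts that
    // miss the DNS cache on dnsfetchHosts and calls notifyAll()), run() is the consumer (it
    // wait()s for work, then resolves off the caller's thread). An illustrative call site:
    //
    //     if (!prefetchHost(url.getHost())) {
    //         // host not in the DNS cache yet; the prefetcher thread resolves it asynchronously
    //     }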
    public void terminateDNSPrefetcher() {
        synchronized (this) {
            interrupt();
    public int size() {
        synchronized (this.urlEntryHashCache) {
            return this.urlEntryHashCache.size();
        }
    }
    public void clear() throws IOException {
        this.urlEntryHashCache.clear();
        this.urlEntryCache.clear();
    }
    public void close() {
        if (this.dbtype == QUEUE_DB_TYPE_RAM) {
            this.log.logInfo("Shutdown. Flushing remaining " + size() + " crawl stacker job entries. please wait.");
            while (size() > 0) {
                if (!job()) break;
            }
        this.log.logInfo("Shutdown. Flushing remaining " + size() + " crawl stacker job entries. please wait.");
        while (size() > 0) {
            if (!job()) break;
        }
        terminateDNSPrefetcher();
        this.log.logInfo("Shutdown. Closing stackCrawl queue.");
@@ -182,26 +104,68 @@ public final class CrawlStacker extends Thread {
        // clearing the hash list
        this.urlEntryHashCache.clear();
    }
    private boolean prefetchHost(final String host) {
        // returns true when the host was known in the dns cache.
        // If not, the host is stacked on the fetch stack and false is returned
        try {
            serverDomains.dnsResolveFromCache(host);
            return true;
        } catch (final UnknownHostException e) {
            synchronized (this) {
                dnsfetchHosts.add(host);
                notifyAll();
            }
            return false;
        }
    }
    public boolean job() {
        CrawlEntry entry;
        // this is the method that is called by the busy thread from outside
        if (this.urlEntryHashCache.size() == 0) return false;
        // get the next entry from the queue
        String urlHash = null;
        kelondroRow.Entry ec = null;
        synchronized (this.urlEntryHashCache) {
            urlHash = this.urlEntryHashCache.removeFirst();
            if (urlHash == null) {
                urlEntryHashCache.clear();
                try {
                    urlEntryCache.clear();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                return false;
            }
            try {
                ec = this.urlEntryCache.remove(urlHash.getBytes());
            } catch (IOException e) {
                e.printStackTrace();
                return false;
            }
        }
        if (urlHash == null || ec == null) return false;
        // make a crawl entry out of it
        CrawlEntry entry = null;
        try {
            entry = dequeueEntry();
        } catch (final IOException e) {
            e.printStackTrace();
            entry = new CrawlEntry(ec);
        } catch (IOException e1) {
            e1.printStackTrace();
            return false;
        }
        if (entry == null) return false;
        try {
            final String rejectReason = sb.crawlStacker.stackCrawl(entry);
            final String rejectReason = stackCrawl(entry);
            // if the url was rejected we store it into the error URL db
            if (rejectReason != null) {
                final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, rejectReason);
                final ZURL.Entry ee = nextQueue.errorURL.newEntry(entry, wordIndex.seedDB.mySeed().hash, new Date(), 1, rejectReason);
                ee.store();
                sb.crawlQueues.errorURL.push(ee);
                nextQueue.errorURL.push(ee);
            }
        } catch (final Exception e) {
            CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
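        // job() in a nutshell: pop the oldest url hash, remove its row from urlEntryCache,
        // rebuild a CrawlEntry from that row, and hand it to stackCrawl(); a non-null return
        // value is a rejection reason, and the rejected entry is recorded in nextQueue.errorURL
        // as shown in the try-block above.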
@@ -270,8 +234,6 @@ public final class CrawlStacker extends Thread {
        synchronized (this.urlEntryHashCache) {
            kelondroRow.Entry oldValue;
            boolean hostknown = true;
            if (prequeue) hostknown = prefetchHost(nexturl.getHost());
            try {
                oldValue = this.urlEntryCache.put(newEntryRow);
            } catch (final IOException e) {
@@ -279,7 +241,7 @@ public final class CrawlStacker extends Thread {
            }
            if (oldValue == null) {
                //System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
                if (hostknown) {
                if (prefetchHost(nexturl.getHost())) {
                    this.alternateCount++;
                    this.urlEntryHashCache.addFirst(newEntry.url().hash());
                    this.dnsHit++;
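                    // Ordering rationale: entries whose host already has a cached DNS entry go to
                    // the head of urlEntryHashCache (addFirst, counted as dnsHit); hosts that miss
                    // the cache presumably go to the tail in the elided else-branch (dnsMiss), so
                    // job() prefers entries that will not block on DNS resolution.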
@@ -297,79 +259,9 @@ public final class CrawlStacker extends Thread {
        }
    }
    private void deleteDB() {
        if (this.dbtype == QUEUE_DB_TYPE_RAM) {
            // do nothing..
            return;
        }
        if (this.dbtype == QUEUE_DB_TYPE_ECO) {
            new File(cacheStacksPath, stackfile).delete();
            //kelondroFlexWidthArray.delete(cacheStacksPath, stackfile);
        }
        if (this.dbtype == QUEUE_DB_TYPE_TREE) {
            final File cacheFile = new File(cacheStacksPath, stackfile);
            cacheFile.delete();
        }
    }
    private void openDB() {
        if (!(cacheStacksPath.exists())) cacheStacksPath.mkdir(); // make the path
        if (this.dbtype == QUEUE_DB_TYPE_RAM) {
            this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
        }
        if (this.dbtype == QUEUE_DB_TYPE_ECO) {
            cacheStacksPath.mkdirs();
            final File f = new File(cacheStacksPath, stackfile);
            try {
                this.urlEntryCache = new kelondroEcoTable(f, CrawlEntry.rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
                //this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, CrawlEntry.rowdef, 0, true));
            } catch (final Exception e) {
                e.printStackTrace();
                // kill DB and try again
                f.delete();
                //kelondroFlexTable.delete(cacheStacksPath, newCacheName);
                try {
                    this.urlEntryCache = new kelondroEcoTable(f, CrawlEntry.rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
                    //this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, CrawlEntry.rowdef, 0, true));
                } catch (final Exception ee) {
                    ee.printStackTrace();
                    System.exit(-1);
                }
            }
        }
        if (this.dbtype == QUEUE_DB_TYPE_TREE) {
            final File cacheFile = new File(cacheStacksPath, stackfile);
            cacheFile.getParentFile().mkdirs();
            this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, 0, CrawlEntry.rowdef));
        }
    }
    public int size() {
        synchronized (this.urlEntryHashCache) {
            return this.urlEntryHashCache.size();
        }
    }
    public int getDBType() {
        return this.dbtype;
    }
    public CrawlEntry dequeueEntry() throws IOException {
        if (this.urlEntryHashCache.size() == 0) return null;
        String urlHash = null;
        kelondroRow.Entry entry = null;
        synchronized (this.urlEntryHashCache) {
            urlHash = this.urlEntryHashCache.removeFirst();
            if (urlHash == null) throw new IOException("urlHash is null");
            entry = this.urlEntryCache.remove(urlHash.getBytes());
        }
        if ((urlHash == null) || (entry == null)) return null;
        return new CrawlEntry(entry);
    }
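    // Invariant behind dequeueEntry() above and the inlined dequeue in job(): urlEntryHashCache
    // holds the processing order while urlEntryCache holds the serialized rows keyed by url hash;
    // both must be updated under the same lock, which is why removal from the two structures
    // happens inside a single synchronized block.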
    public String stackCrawl(final CrawlEntry entry) {
    private String stackCrawl(final CrawlEntry entry) {
        // stacks a crawl item. The position can also be remote
        // returns null if successful, a reason string if not successful
        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
@@ -379,7 +271,7 @@ public final class CrawlStacker extends Thread {
        // check if the protocol is supported
        final String urlProtocol = entry.url().getProtocol();
        if (!sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
        if (!nextQueue.isSupportedProtocol(urlProtocol)) {
            reason = "unsupported protocol";
            this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
@@ -387,9 +279,9 @@ public final class CrawlStacker extends Thread {
        }
        // check if ip is local ip address
        final String urlRejectReason = sb.acceptURL(entry.url());
        final String urlRejectReason = urlInAcceptedDomain(entry.url());
        if (urlRejectReason != null) {
            reason = "denied_(" + urlRejectReason + ")_domain=" + sb.getConfig("network.unit.domain", "unknown");
            reason = "denied_(" + urlRejectReason + ")";
            if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }
@@ -402,7 +294,7 @@ public final class CrawlStacker extends Thread {
            return reason;
        }
        final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
        final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
        if (profile == null) {
            final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
            log.logWarning(errorMsg);
@@ -443,7 +335,7 @@ public final class CrawlStacker extends Thread {
            return reason;
        }
        final yacyURL referrerURL = (entry.referrerhash() == null) ? null : sb.crawlQueues.getURL(entry.referrerhash());
        final yacyURL referrerURL = (entry.referrerhash() == null) ? null : nextQueue.getURL(entry.referrerhash());
        // add domain to profile domain list
        if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
@@ -467,8 +359,8 @@ public final class CrawlStacker extends Thread {
        }
        // check if the url is double registered
        final String dbocc = sb.crawlQueues.urlExists(entry.url().hash());
        final indexURLReference oldEntry = this.sb.webIndex.getURL(entry.url().hash(), null, 0);
        final String dbocc = nextQueue.urlExists(entry.url().hash());
        final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0);
        final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
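        // Recrawl semantics: recrawlIfOlder() yields a cut-off time (epoch milliseconds, the same
        // unit as Date.getTime()); a URL that is already known is stacked again only if it was
        // loaded before that cut-off, so a more recent cut-off forces more aggressive re-fetching.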
        // do double-check
        if ((dbocc != null) && (!recrawl)) {
@@ -489,16 +381,16 @@ public final class CrawlStacker extends Thread {
}
        // store information
        final boolean local = entry.initiator().equals(sb.webIndex.seedDB.mySeed().hash);
        final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(this.sb.webIndex.defaultProxyProfile.handle());
        final boolean remote = profile.handle().equals(this.sb.webIndex.defaultRemoteProfile.handle());
        final boolean local = entry.initiator().equals(wordIndex.seedDB.mySeed().hash);
        final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(wordIndex.defaultProxyProfile.handle());
        final boolean remote = profile.handle().equals(wordIndex.defaultRemoteProfile.handle());
        final boolean global =
            (profile.remoteIndexing()) /* granted */ &&
            (entry.depth() == profile.depth()) /* leaf node */ &&
            //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
            (
                (sb.webIndex.seedDB.mySeed().isSenior()) ||
                (sb.webIndex.seedDB.mySeed().isPrincipal())
                (wordIndex.seedDB.mySeed().isSenior()) ||
                (wordIndex.seedDB.mySeed().isPrincipal())
            ) /* qualified */;
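        // Classification overview: `local` marks entries initiated by this peer, `proxy` marks
        // entries without a real initiator that run under the default proxy profile, `remote`
        // marks entries under the default remote profile, and `global` additionally requires
        // granted remote indexing, a leaf-node depth, and a senior or principal peer. The flags
        // are not mutually exclusive, hence the conflict warnings below.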
        if (!local && !global && !remote && !proxy) {
@@ -508,23 +400,62 @@ public final class CrawlStacker extends Thread {
            // it may be possible that global == true and local == true, so do not check an error case against it
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
            sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
            nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
        }
        if (local) {
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
            sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
            nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
        }
        if (proxy) {
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
            sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
            nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
        }
        if (remote) {
            sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
            nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
        }
}
        return null;
}
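    // Caller-side sketch of the stackCrawl() contract (illustrative; mirrors the use in job()):
    //
    //     final String rejectReason = stackCrawl(entry);
    //     if (rejectReason == null) {
    //         // entry was pushed onto one of the noticeURL stacks (CORE, LIMIT or REMOTE)
    //     } else {
    //         // entry was rejected; callers record it, e.g. in nextQueue.errorURL
    //     }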
    /**
     * Test a url if it can be used for crawling/indexing
     * This mainly checks if the url is in the declared domain (local/global)
     * @param url
     * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
     */
    public String urlInAcceptedDomain(final yacyURL url) {
        // returns null if the url can be accepted according to network.unit.domain
        if (url == null) return "url is null";
        final String host = url.getHost();
        if (host == null) return "url.host is null";
        if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
        /*
        InetAddress hostAddress = serverDomains.dnsResolve(host);
        // if we don't know the host, we cannot load that resource anyway.
        // But in case we use a proxy, it is possible that we don't have a DNS service.
        final httpRemoteProxyConfig remoteProxyConfig = httpdProxyHandler.getRemoteProxyConfig();
        if (hostAddress == null) {
            if ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy())) return null; else return "the dns of the host '" + host + "' cannot be resolved";
        }
        */
        // check if this is a local address and we are allowed to index local pages:
        //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
        final boolean local = url.isLocal();
        //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
        if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null;
        return (local) ?
            ("the host '" + host + "' is local, but local addresses are not accepted") :
            ("the host '" + host + "' is global, but global addresses are not accepted");
    }
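    // Usage sketch for urlInAcceptedDomain(), assuming some yacyURL `url` (illustrative):
    //
    //     final String rejectReason = urlInAcceptedDomain(url);
    //     if (rejectReason != null) {
    //         // e.g. "the host 'localhost' is local, but local addresses are not accepted"
    //         log.logFine("denied: " + rejectReason);
    //     }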
    public boolean acceptLocalURLs() {
        return this.acceptLocalURLs;
    }
    public boolean acceptGlobalURLs() {
        return this.acceptGlobalURLs;
    }
}
}