From c25d7bcb80bd25007340c566831a3dbd8b856fb9 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 29 Oct 2012 21:08:45 +0100
Subject: [PATCH] - added concurrency for robots.txt loading
 - changed data model for domain counter

---
 htroot/CrawlProfileEditor_p.java              |   3 -
 source/net/yacy/crawler/Balancer.java         |  10 +-
 source/net/yacy/crawler/CrawlStacker.java     |  29 ++--
 .../net/yacy/crawler/data/CrawlProfile.java   | 127 ++++++------------
 source/net/yacy/crawler/data/NoticedURL.java  |  19 +--
 source/net/yacy/crawler/robots/RobotsTxt.java | 103 ++++++++++++++
 source/net/yacy/search/Switchboard.java       |   6 +-
 7 files changed, 172 insertions(+), 125 deletions(-)
diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java
index 3a5630921..7ab881a48 100644
--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@@ -75,9 +75,6 @@ public class CrawlProfileEditor_p {
         labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
         labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
         labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
     }

     public static serverObjects respond(
diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index be3ea0f28..4926d3ffa 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -256,7 +256,7 @@ public class Balancer {
      * @throws IOException
      * @throws SpaceExceededException
      */
-    public String push(final Request entry) throws IOException, SpaceExceededException {
+    public String push(final Request entry, final RobotsTxt robots) throws IOException, SpaceExceededException {
         assert entry != null;
         final byte[] hash = entry.url().hash();
         synchronized (this) {
@@ -275,8 +275,9 @@ public class Balancer {

             // add the hash to a queue
             pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
-            return null;
         }
+        robots.ensureExist(entry.url(), Balancer.this.myAgentIDs, true); // concurrently load all robots.txt
+        return null;
     }

     /**
@@ -319,8 +320,7 @@ public class Balancer {
      * @param crawlURL
      * @return
      */
-    private long getRobotsTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
-        if (profileEntry == null) return 0;
+    private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL) {
         long sleeptime = Latency.waitingRobots(crawlURL, robots, this.myAgentIDs); // this uses the robots.txt database and may cause a loading of robots.txt from the server
         return sleeptime < 0 ? 0 : sleeptime;
     }
@@ -450,7 +450,7 @@ public class Balancer {
             }
             if (crawlEntry == null) return null;

-            long robotsTime = getRobotsTime(robots, profileEntry, crawlEntry.url());
+            long robotsTime = getRobotsTime(robots, crawlEntry.url());
             Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
             if (delay && sleeptime > 0) {
                 // force a busy waiting here
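Note on the Balancer change above: `return null` moves out of the `synchronized` block, and the new `robots.ensureExist(..., true)` call runs after the lock is released, so the robots.txt of a freshly stacked host is fetched in the background at push time instead of stalling the crawler at pop time. A minimal, self-contained sketch of that pattern, assuming nothing beyond the JDK; the names (`UrlQueue`, `ensureRobotsExist`, `fetchRobots`) are illustrative stand-ins, not YaCy API:

```java
import java.util.ArrayDeque;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;

public class UrlQueue {
    private final Queue<String> queue = new ArrayDeque<String>();
    // hosts whose robots.txt is already known (stand-in for the robots table)
    private final ConcurrentHashMap<String, Boolean> robotsKnown =
            new ConcurrentHashMap<String, Boolean>();

    public String push(final String url, final String hostPort) {
        synchronized (this) {
            if (this.queue.contains(url)) return "double occurrence";
            this.queue.add(url);
        } // lock released before any network work starts
        ensureRobotsExist(hostPort, true); // concurrently load robots.txt
        return null;
    }

    private void ensureRobotsExist(final String hostPort, final boolean concurrent) {
        if (this.robotsKnown.containsKey(hostPort)) return; // cheap fast path
        if (concurrent) {
            // Java 6 style anonymous thread, matching the idiom used in the patch
            new Thread() {
                @Override
                public void run() { fetchRobots(hostPort); }
            }.start();
        } else {
            fetchRobots(hostPort);
        }
    }

    private void fetchRobots(final String hostPort) {
        // stand-in for downloading http://<hostPort>/robots.txt
        this.robotsKnown.putIfAbsent(hostPort, Boolean.TRUE);
    }
}
```

The important property is that the slow network fetch never runs while the queue lock is held; by the time the URL is popped, the robots.txt lookup usually finds the entry already cached.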
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 1ffae79a9..2a918bfd8 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -37,6 +37,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
 import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;

 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.Classification.ContentDomain;
@@ -50,13 +51,13 @@ import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.data.ResultURLs;
 import net.yacy.crawler.data.ZURL;
-import net.yacy.crawler.data.CrawlProfile.DomProfile;
 import net.yacy.crawler.data.ResultURLs.EventOrigin;
 import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.FTPLoader;
 import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SMBLoader;
+import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -75,7 +76,7 @@ public final class CrawlStacker {

     private final Log log = new Log("STACKCRAWL");
-
+    private final RobotsTxt robots;
     private final WorkflowProcessor<Request> fastQueue, slowQueue;
     private final CrawlQueues nextQueue;
     private final CrawlSwitchboard crawler;
@@ -87,6 +88,7 @@ public final class CrawlStacker {

     // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
     public CrawlStacker(
+            final RobotsTxt robots,
             final CrawlQueues cq,
             final CrawlSwitchboard cs,
             final Segment indexSegment,
@@ -94,6 +96,7 @@ public final class CrawlStacker {
             final boolean acceptLocalURLs,
             final boolean acceptGlobalURLs,
             final FilterEngine domainList) {
+        this.robots = robots;
         this.nextQueue = cq;
         this.crawler = cs;
         this.indexSegment = indexSegment;
@@ -366,32 +369,30 @@ public final class CrawlStacker {
             entry.url().getContentDomain() == ContentDomain.AUDIO ||
             entry.url().getContentDomain() == ContentDomain.VIDEO ||
             entry.url().getContentDomain() == ContentDomain.CTRL) {
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, this.robots);
             //if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
             return null;
         }
-
-        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : this.nextQueue.getURL(entry.referrerhash());
-
+
         // add domain to profile domain list
         if (profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
-            profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
+            profile.domInc(entry.url().getHost());
         }

         if (global) {
             // it may be possible that global == true and local == true, so do not check an error case against it
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, this.robots);
         } else if (local) {
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
         } else if (proxy) {
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
         } else if (remote) {
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, this.robots);
         }
         if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed: " + warning);
@@ -479,13 +480,13 @@ public final class CrawlStacker {
         // deny urls that exceed allowed number of occurrences
         final int maxAllowedPagesPerDomain = profile.domMaxPages();
         if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) {
-            final DomProfile dp = profile.getDom(url.getHost());
-            if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
+            final AtomicInteger dp = profile.getCount(url.getHost());
+            if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                 return "crawl stack domain counter exceeded";
             }
-            if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= profile.domMaxPages()) {
+            if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                 return "result stack domain counter exceeded";
             }
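The acceptance check above now reads a plain `AtomicInteger` from the profile instead of the removed `DomProfile` record, and the second comparison is tightened to reuse the cached `maxAllowedPagesPerDomain` rather than re-reading `profile.domMaxPages()`. A minimal sketch of the consumer side of this check; `DomainQuota` and its methods are illustrative stand-ins, not YaCy API:

```java
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

public class DomainQuota {
    private final ConcurrentHashMap<String, AtomicInteger> doms =
            new ConcurrentHashMap<String, AtomicInteger>();
    private final int domMaxPages; // Integer.MAX_VALUE or <= 0 means "no limit"

    public DomainQuota(final int domMaxPages) {
        this.domMaxPages = domMaxPages;
    }

    /** @return null if the host may be stacked, otherwise a rejection reason */
    public String checkAcceptance(final String host) {
        if (this.domMaxPages < Integer.MAX_VALUE && this.domMaxPages > 0) {
            final AtomicInteger dp = this.doms.get(host);
            if (dp != null && dp.get() >= this.domMaxPages) {
                return "crawl stack domain counter exceeded";
            }
        }
        return null; // accepted
    }
}
```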
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 2f6aa3d92..6b8ade6be 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -31,6 +31,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;

@@ -53,55 +54,34 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
     public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);

     // this is a simple record structure that hold all properties of a single crawl start
-    public static final String HANDLE           = "handle";
-    public static final String NAME             = "name";
-    public static final String DEPTH            = "generalDepth";
-    public static final String DIRECT_DOC_BY_URL= "directDocByURL";
-    public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
-    public static final String DOM_MAX_PAGES    = "domMaxPages";
-    public static final String CRAWLING_Q       = "crawlingQ";
-    public static final String PUSH_SOLR        = "pushSolr";
-    public static final String INDEX_TEXT       = "indexText";
-    public static final String INDEX_MEDIA      = "indexMedia";
-    public static final String STORE_HTCACHE    = "storeHTCache";
-    public static final String REMOTE_INDEXING  = "remoteIndexing";
-    public static final String XSSTOPW          = "xsstopw";
-    public static final String XDSTOPW          = "xdstopw";
-    public static final String XPSTOPW          = "xpstopw";
-    public static final String CACHE_STRAGEGY   = "cacheStrategy";
-    public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
-    public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
-    public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
-    public static final String CRAWLER_IP_MUSTNOTMATCH = "crawlerIPMustNotMatch";
-    public static final String CRAWLER_COUNTRY_MUSTMATCH = "crawlerCountryMustMatch";
-    public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
-    public static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch";
-    public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
-    public static final String COLLECTIONS      = "collections";
+    private static final String HANDLE          = "handle";
+    public static final String NAME             = "name";
+    public static final String DEPTH            = "generalDepth";
+    private static final String DIRECT_DOC_BY_URL= "directDocByURL";
+    public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
+    public static final String DOM_MAX_PAGES    = "domMaxPages";
+    public static final String CRAWLING_Q       = "crawlingQ";
+    public static final String INDEX_TEXT       = "indexText";
+    public static final String INDEX_MEDIA      = "indexMedia";
+    public static final String STORE_HTCACHE    = "storeHTCache";
+    public static final String REMOTE_INDEXING  = "remoteIndexing";
+    private static final String CACHE_STRAGEGY  = "cacheStrategy";
+    public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
+    public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
+    private static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
+    private static final String CRAWLER_IP_MUSTNOTMATCH = "crawlerIPMustNotMatch";
+    private static final String CRAWLER_COUNTRY_MUSTMATCH = "crawlerCountryMustMatch";
+    private static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
+    private static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch";
+    private static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
+    private static final String COLLECTIONS     = "collections";

     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
     private Pattern crawlernodepthlimitmatch = null;
     private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;

-    public final static class DomProfile {
-
-        public String referrer;
-        public int depth, count;
-
-        public DomProfile(final String ref, final int d) {
-            this.referrer = ref;
-            this.depth = d;
-            this.count = 1;
-        }
-
-        public void inc() {
-            this.count++;
-        }
-
-    }
-
-    private final Map<String, DomProfile> doms;
+    private final Map<String, AtomicInteger> doms;

     /**
      * Constructor which creates CrawlPofile from parameters.
@@ -156,7 +136,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
             throw new NullPointerException("name must not be null or empty");
         }
         if (name.length() > 256) name = name.substring(256);
-        this.doms = new ConcurrentHashMap<String, DomProfile>();
+        this.doms = new ConcurrentHashMap<String, AtomicInteger>();
         final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength);
         put(HANDLE, handle);
         put(NAME, name);
@@ -177,9 +157,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         put(INDEX_MEDIA, indexMedia);
         put(STORE_HTCACHE, storeHTCache);
         put(REMOTE_INDEXING, remoteIndexing);
-        put(XSSTOPW, xsstopw); // exclude static stop-words
-        put(XDSTOPW, xdstopw); // exclude dynamic stop-word
-        put(XPSTOPW, xpstopw); // exclude parent stop-words
         put(CACHE_STRAGEGY, cacheStrategy.toString());
         put(COLLECTIONS, collections.trim().replaceAll(" ", ""));
     }
@@ -191,25 +168,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
     public CrawlProfile(final Map<String, String> ext) {
         super(ext == null ? 1 : ext.size());
         if (ext != null) putAll(ext);
-        this.doms = new ConcurrentHashMap<String, DomProfile>();
+        this.doms = new ConcurrentHashMap<String, AtomicInteger>();
     }

-    public void domInc(final String domain, final String referrer, final int depth) {
-        final DomProfile dp = this.doms.get(domain);
+    public void domInc(final String domain) {
+        final AtomicInteger dp = this.doms.get(domain);
         if (dp == null) {
             // new domain
-            this.doms.put(domain, new DomProfile(referrer, depth));
+            this.doms.put(domain, new AtomicInteger(1));
         } else {
             // increase counter
-            dp.inc();
+            dp.incrementAndGet();
         }
     }

-    public String domName(final boolean attr, final int index){
-        final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
+    private String domName(final boolean attr, final int index){
+        final Iterator<Map.Entry<String, AtomicInteger>> domnamesi = this.doms.entrySet().iterator();
         String domname="";
-        Map.Entry<String, DomProfile> ey;
-        DomProfile dp;
+        Map.Entry<String, AtomicInteger> ey;
+        AtomicInteger dp;
         int i = 0;
         while ((domnamesi.hasNext()) && (i < index)) {
             ey = domnamesi.next();
@@ -218,16 +195,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         if (domnamesi.hasNext()) {
             ey = domnamesi.next();
             dp = ey.getValue();
-            domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
+            domname = ey.getKey() + ((attr) ? ("/c=" + dp.get()) : " ");
         }
         return domname;
     }

-    public void clearDoms() {
-        this.doms.clear();
-    }
-
-    public DomProfile getDom(final String domain) {
+    public AtomicInteger getCount(final String domain) {
         return this.doms.get(domain);
     }
@@ -245,7 +218,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
      * @param key name of the parameter
      * @param value values if the parameter
      */
-    public final void put(final String key, final int value) {
+    private final void put(final String key, final int value) {
         super.put(key, Integer.toString(value));
     }

@@ -254,7 +227,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
      * @param key name of the parameter
      * @param value values if the parameter
      */
-    public final void put(final String key, final long value) {
+    private final void put(final String key, final long value) {
         super.put(key, Long.toString(value));
     }

@@ -476,12 +449,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         return (r.equals(Boolean.TRUE.toString()));
     }

-    public boolean pushSolr() {
-        final String r = get(PUSH_SOLR);
-        if (r == null) return true;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
     public boolean indexText() {
         final String r = get(INDEX_TEXT);
         if (r == null) return true;
@@ -505,24 +472,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         return (r.equals(Boolean.TRUE.toString()));
     }

-    public boolean excludeStaticStopwords() {
-        final String r = get(XSSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
-    public boolean excludeDynamicStopwords() {
-        final String r = get(XDSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
-    public boolean excludeParentStopwords() {
-        final String r = get(XPSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
     public static long getRecrawlDate(final long oldTimeMinutes) {
         return System.currentTimeMillis() - (60000L * oldTimeMinutes);
     }
@@ -535,7 +484,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
     }

-    public static String mustMatchSubpath(final MultiProtocolURI uri) {
+    private static String mustMatchSubpath(final MultiProtocolURI uri) {
         String u = uri.toNormalform(true);
         if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
         return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
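A remark on `domInc()` above: the get-then-put sequence is a check-then-act window, so two threads stacking the first two URLs of a host simultaneously can each install their own `AtomicInteger(1)`, losing one count. For a soft per-domain page limit that is acceptable; if exact counts mattered, a `putIfAbsent` variant would close the window. The sketch below is a suggested alternative under that assumption, not the patch's own code:

```java
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

public class DomainCounter {
    private final ConcurrentHashMap<String, AtomicInteger> doms =
            new ConcurrentHashMap<String, AtomicInteger>();

    /** race-free variant of CrawlProfile.domInc (Java 6 compatible) */
    public void domInc(final String domain) {
        AtomicInteger dp = this.doms.get(domain);
        if (dp == null) {
            // publish a zero counter; if another thread won the race, use its counter
            dp = new AtomicInteger(0);
            final AtomicInteger prev = this.doms.putIfAbsent(domain, dp);
            if (prev != null) dp = prev;
        }
        dp.incrementAndGet(); // every call is counted exactly once
    }

    public AtomicInteger getCount(final String domain) {
        return this.doms.get(domain);
    }
}
```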
("/c=" + dp.get()) : " "); } return domname; } - public void clearDoms() { - this.doms.clear(); - } - - public DomProfile getDom(final String domain) { + public AtomicInteger getCount(final String domain) { return this.doms.get(domain); } @@ -245,7 +218,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param key name of the parameter * @param value values if the parameter */ - public final void put(final String key, final int value) { + private final void put(final String key, final int value) { super.put(key, Integer.toString(value)); } @@ -254,7 +227,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param key name of the parameter * @param value values if the parameter */ - public final void put(final String key, final long value) { + private final void put(final String key, final long value) { super.put(key, Long.toString(value)); } @@ -476,12 +449,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } - public boolean pushSolr() { - final String r = get(PUSH_SOLR); - if (r == null) return true; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean indexText() { final String r = get(INDEX_TEXT); if (r == null) return true; @@ -505,24 +472,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } - public boolean excludeStaticStopwords() { - final String r = get(XSSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - - public boolean excludeDynamicStopwords() { - final String r = get(XDSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - - public boolean excludeParentStopwords() { - final String r = get(XPSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public static long getRecrawlDate(final long oldTimeMinutes) { return System.currentTimeMillis() - (60000L * oldTimeMinutes); } @@ -535,7 +484,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString(); } - public static String mustMatchSubpath(final MultiProtocolURI uri) { + private static String mustMatchSubpath(final MultiProtocolURI uri) { String u = uri.toNormalform(true); if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);} return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString(); diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java index 6052a4be0..b78315c11 100644 --- a/source/net/yacy/crawler/data/NoticedURL.java +++ b/source/net/yacy/crawler/data/NoticedURL.java @@ -173,19 +173,14 @@ public class NoticedURL { * @param entry * @return null if this was successful or a String explaining what went wrong in case of an error */ - public String push(final StackType stackType, final Request entry) { + public String push(final StackType stackType, final Request entry, final RobotsTxt robots) { try { switch (stackType) { - case LOCAL: - return this.coreStack.push(entry); - case GLOBAL: - return this.limitStack.push(entry); - case REMOTE: - return this.remoteStack.push(entry); - case NOLOAD: - return this.noloadStack.push(entry); - default: - return "stack type unknown"; + case LOCAL: return this.coreStack.push(entry, robots); + case GLOBAL: return this.limitStack.push(entry, robots); + 
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index b0817c2a3..aaeb188cd 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -234,6 +234,109 @@ public class RobotsTxt {
         return robotsTxt4Host;
     }

+    public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents, boolean concurrent) {
+        final String urlHostPort = getHostPort(theURL);
+        final BEncodedHeap robotsTable;
+        try {
+            robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
+        } catch (IOException e1) {
+            log.fatal("tables not available", e1);
+            return;
+        }
+        if (robotsTable == null || robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
+
+        if (concurrent)
+            new Thread() {public void run(){ensureExist(urlHostPort, robotsTable, thisAgents);}}.start();
+        else
+            ensureExist(urlHostPort, robotsTable, thisAgents);
+    }
+
+    private void ensureExist(final String urlHostPort, BEncodedHeap robotsTable, final Set<String> thisAgents) {
+
+        // make or get a synchronization object
+        DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
+        if (syncObj == null) {
+            syncObj = new DomSync();
+            RobotsTxt.this.syncObjects.put(urlHostPort, syncObj);
+        }
+        // we can now synchronize for each host separately
+        synchronized (syncObj) {
+            if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
+
+            // generating the proper url to download the robots txt
+            DigestURI robotsURL = null;
+            try {
+                robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
+            } catch (final MalformedURLException e) {
+                log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
+                robotsURL = null;
+            }
+
+            Response response = null;
+            if (robotsURL != null) {
+                if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+                Request request = new Request(robotsURL, null);
+                try {
+                    response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+                } catch (IOException e) {
+                    response = null;
+                }
+            }
+
+            RobotsTxtEntry robotsTxt4Host = null;
+            if (response == null) {
+                // no robots.txt available, make an entry to prevent that the robots loading is done twice
+                // generate artificial entry
+                robotsTxt4Host = new RobotsTxtEntry(
+                        robotsURL,
+                        new ArrayList<String>(),
+                        new ArrayList<String>(),
+                        new Date(),
+                        new Date(),
+                        null,
+                        null,
+                        Integer.valueOf(0),
+                        null);
+
+                // store the data into the robots DB
+                final int sz = robotsTable.size();
+                addEntry(robotsTxt4Host);
+                if (robotsTable.size() <= sz) {
+                    log.fatal("new entry in robots.txt table failed, resetting database");
+                    try {clear();} catch (IOException e) {}
+                    addEntry(robotsTxt4Host);
+                }
+            } else {
+                final byte[] robotsTxt = response.getContent();
+                //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
+                RobotsTxtParser parserResult;
+                ArrayList<String> denyPath;
+                if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
+                    parserResult = new RobotsTxtParser(thisAgents);
+                    // create virtual deny path
+                    denyPath = new ArrayList<String>();
+                    denyPath.add("/");
+                } else {
+                    parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
+                    denyPath = parserResult.denyList();
+                }
+
+                // store the data into the robots DB
+                String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
+                robotsTxt4Host = addEntry(
+                        robotsURL,
+                        parserResult.allowList(),
+                        denyPath,
+                        new Date(),
+                        response.getResponseHeader().lastModified(),
+                        etag,
+                        parserResult.sitemap(),
+                        parserResult.crawlDelayMillis(),
+                        parserResult.agentName());
+            }
+        }
+    }
+
     private RobotsTxtEntry addEntry(
             final MultiProtocolURI theURL,
             final ArrayList<String> allowPathList,
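The core of `ensureExist()` above is per-host serialization: one `DomSync` object per host:port, a cheap unsynchronized pre-check, and a re-check inside the lock, so a robots.txt is downloaded at most once per host no matter how many of its URLs are pushed concurrently. A self-contained sketch of that locking discipline; `RobotsStore` and `HostLock` are illustrative names, and the sketch uses `putIfAbsent` where the patch uses plain get/put on `syncObjects` (which can briefly create two lock objects for one host):

```java
import java.util.concurrent.ConcurrentHashMap;

public class RobotsStore {
    private static final class HostLock {} // plays the role of RobotsTxt.DomSync

    private final ConcurrentHashMap<String, HostLock> syncObjects =
            new ConcurrentHashMap<String, HostLock>();
    private final ConcurrentHashMap<String, byte[]> robotsTable =
            new ConcurrentHashMap<String, byte[]>();

    public void ensureExist(final String hostPort) {
        if (this.robotsTable.containsKey(hostPort)) return; // fast path, no lock

        // make or get the per-host synchronization object
        HostLock syncObj = this.syncObjects.get(hostPort);
        if (syncObj == null) {
            syncObj = new HostLock();
            final HostLock prev = this.syncObjects.putIfAbsent(hostPort, syncObj);
            if (prev != null) syncObj = prev;
        }
        // we can now synchronize for each host separately
        synchronized (syncObj) {
            // re-check: another thread may have loaded it while we waited
            if (this.robotsTable.containsKey(hostPort)) return;
            this.robotsTable.put(hostPort, download(hostPort)); // slow call, serialized per host
        }
    }

    private byte[] download(final String hostPort) {
        // stand-in for fetching http://<hostPort>/robots.txt; an empty array marks
        // "no robots.txt", mirroring the artificial entry the patch writes on failure
        return new byte[0];
    }
}
```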
"null" : UTF8.String(robotsTxt))); // debug TODO remove + RobotsTxtParser parserResult; + ArrayList denyPath; + if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) { + parserResult = new RobotsTxtParser(thisAgents); + // create virtual deny path + denyPath = new ArrayList(); + denyPath.add("/"); + } else { + parserResult = new RobotsTxtParser(thisAgents, robotsTxt); + denyPath = parserResult.denyList(); + } + + // store the data into the robots DB + String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null; + robotsTxt4Host = addEntry( + robotsURL, + parserResult.allowList(), + denyPath, + new Date(), + response.getResponseHeader().lastModified(), + etag, + parserResult.sitemap(), + parserResult.crawlDelayMillis(), + parserResult.agentName()); + } + } + } + private RobotsTxtEntry addEntry( final MultiProtocolURI theURL, final ArrayList allowPathList, diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 9c2f06b17..a0407fbe5 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -804,6 +804,7 @@ public final class Switchboard extends serverSwitch // initializing the stackCrawlThread this.crawlStacker = new CrawlStacker( + this.robots, this.crawlQueues, this.crawler, this.index, @@ -1318,6 +1319,7 @@ public final class Switchboard extends serverSwitch this.crawlStacker = new CrawlStacker( + this.robots, this.crawlQueues, this.crawler, this.index, @@ -2802,9 +2804,9 @@ public final class Switchboard extends serverSwitch } final String s; if (asglobal) { - s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request); + s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots); } else { - s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request); + s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots); } if (s != null) {