- added concurrency for robots.txt loading

- changed data model for domain counter
13 years ago · c25d7bcb80
parent a94c537afc
commit c25d7bcb80
7 changed files with 172 additions and 125 deletions
--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@ -75,9 +75,6 @@ public class CrawlProfileEditor_p {
        labels.add(new eentry(CrawlProfile.INDEX_MEDIA,         "Index Media",           false, eentry.BOOLEAN));
        labels.add(new eentry(CrawlProfile.STORE_HTCACHE,       "Store in HTCache",      false, eentry.BOOLEAN));
        labels.add(new eentry(CrawlProfile.REMOTE_INDEXING,     "Remote Indexing",       false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XSSTOPW,             "Static stop-words",     false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XDSTOPW,             "Dynamic stop-words",    false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XPSTOPW,             "Parent stop-words",     false, eentry.BOOLEAN));
    }

    public static serverObjects respond(
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@ -256,7 +256,7 @@ public class Balancer {
     * @throws IOException
     * @throws SpaceExceededException
     */
-    public String push(final Request entry) throws IOException, SpaceExceededException {
+    public String push(final Request entry, final RobotsTxt robots) throws IOException, SpaceExceededException {
        assert entry != null;
        final byte[] hash = entry.url().hash();
        synchronized (this) {
@ -275,8 +275,9 @@ public class Balancer {

 	        // add the hash to a queue
 	        pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
-	        return null;
        }
+        robots.ensureExist(entry.url(), Balancer.this.myAgentIDs, true); // concurrently load all robots.txt
+        return null;
    }

    /**
@ -319,8 +320,7 @@ public class Balancer {
     * @param crawlURL
     * @return
     */
-    private long getRobotsTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
-        if (profileEntry == null) return 0;
+    private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL) {
        long sleeptime = Latency.waitingRobots(crawlURL, robots, this.myAgentIDs); // this uses the robots.txt database and may cause a loading of robots.txt from the server
        return sleeptime < 0 ? 0 : sleeptime;
    }
@ -450,7 +450,7 @@ public class Balancer {
    	}
    	if (crawlEntry == null) return null;

-    	long robotsTime = getRobotsTime(robots, profileEntry, crawlEntry.url());
+    	long robotsTime = getRobotsTime(robots, crawlEntry.url());
        Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
        if (delay && sleeptime > 0) {
            // force a busy waiting here
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@ -37,6 +37,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
 import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;

 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.Classification.ContentDomain;
@ -50,13 +51,13 @@ import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.data.ResultURLs;
 import net.yacy.crawler.data.ZURL;
-import net.yacy.crawler.data.CrawlProfile.DomProfile;
 import net.yacy.crawler.data.ResultURLs.EventOrigin;
 import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.FTPLoader;
 import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SMBLoader;
+import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@ -75,7 +76,7 @@ public final class CrawlStacker {
    

    private final Log log = new Log("STACKCRAWL");
-
+    private final RobotsTxt robots;
    private final WorkflowProcessor<Request>  fastQueue, slowQueue;
    private final CrawlQueues       nextQueue;
    private final CrawlSwitchboard  crawler;
@ -87,6 +88,7 @@ public final class CrawlStacker {
    // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt

    public CrawlStacker(
+            final RobotsTxt robots,
            final CrawlQueues cq,
            final CrawlSwitchboard cs,
            final Segment indexSegment,
@ -94,6 +96,7 @@ public final class CrawlStacker {
            final boolean acceptLocalURLs,
            final boolean acceptGlobalURLs,
            final FilterEngine domainList) {
+        this.robots = robots;
        this.nextQueue = cq;
        this.crawler = cs;
        this.indexSegment = indexSegment;
@ -366,32 +369,30 @@ public final class CrawlStacker {
            entry.url().getContentDomain() == ContentDomain.AUDIO  ||
            entry.url().getContentDomain() == ContentDomain.VIDEO ||
            entry.url().getContentDomain() == ContentDomain.CTRL) {
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, this.robots);
            //if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
            return null;
        }
        
-        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : this.nextQueue.getURL(entry.referrerhash());
-
        // add domain to profile domain list
        if (profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
-            profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
+            profile.domInc(entry.url().getHost());
        }

        if (global) {
            // it may be possible that global == true and local == true, so do not check an error case against it
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, this.robots);
        } else if (local) {
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
        } else if (proxy) {
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
        } else if (remote) {
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, this.robots);
        }
        if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed: " + warning);

@ -479,13 +480,13 @@ public final class CrawlStacker {
        // deny urls that exceed allowed number of occurrences
        final int maxAllowedPagesPerDomain = profile.domMaxPages();
        if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) {
-            final DomProfile dp = profile.getDom(url.getHost());
-            if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
+            final AtomicInteger dp = profile.getCount(url.getHost());
+            if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
                if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                return "crawl stack domain counter exceeded";
            }

-            if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= profile.domMaxPages()) {
+            if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
                if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                return "result stack domain counter exceeded";
            }
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@ -31,6 +31,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;

@ -53,55 +54,34 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
    public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);

    // this is a simple record structure that hold all properties of a single crawl start
-    public static final String HANDLE           = "handle";
-    public static final String NAME             = "name";
-    public static final String DEPTH            = "generalDepth";
-    public static final String DIRECT_DOC_BY_URL= "directDocByURL";
-    public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
-    public static final String DOM_MAX_PAGES    = "domMaxPages";
-    public static final String CRAWLING_Q       = "crawlingQ";
-    public static final String PUSH_SOLR        = "pushSolr";
-    public static final String INDEX_TEXT       = "indexText";
-    public static final String INDEX_MEDIA      = "indexMedia";
-    public static final String STORE_HTCACHE    = "storeHTCache";
-    public static final String REMOTE_INDEXING  = "remoteIndexing";
-    public static final String XSSTOPW          = "xsstopw";
-    public static final String XDSTOPW          = "xdstopw";
-    public static final String XPSTOPW          = "xpstopw";
-    public static final String CACHE_STRAGEGY   = "cacheStrategy";
-    public static final String CRAWLER_URL_MUSTMATCH         = "crawlerURLMustMatch";
-    public static final String CRAWLER_URL_MUSTNOTMATCH      = "crawlerURLMustNotMatch";
-    public static final String CRAWLER_IP_MUSTMATCH          = "crawlerIPMustMatch";
-    public static final String CRAWLER_IP_MUSTNOTMATCH       = "crawlerIPMustNotMatch";
-    public static final String CRAWLER_COUNTRY_MUSTMATCH     = "crawlerCountryMustMatch";
-    public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
-    public static final String INDEXING_URL_MUSTMATCH        = "indexURLMustMatch";
-    public static final String INDEXING_URL_MUSTNOTMATCH     = "indexURLMustNotMatch";
-    public static final String COLLECTIONS = "collections";
+    private static final String HANDLE           = "handle";
+    public  static final String NAME             = "name";
+    public  static final String DEPTH            = "generalDepth";
+    private static final String DIRECT_DOC_BY_URL= "directDocByURL";
+    public  static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
+    public  static final String DOM_MAX_PAGES    = "domMaxPages";
+    public  static final String CRAWLING_Q       = "crawlingQ";
+    public  static final String INDEX_TEXT       = "indexText";
+    public  static final String INDEX_MEDIA      = "indexMedia";
+    public  static final String STORE_HTCACHE    = "storeHTCache";
+    public  static final String REMOTE_INDEXING  = "remoteIndexing";
+    private static final String CACHE_STRAGEGY   = "cacheStrategy";
+    public  static final String CRAWLER_URL_MUSTMATCH         = "crawlerURLMustMatch";
+    public  static final String CRAWLER_URL_MUSTNOTMATCH      = "crawlerURLMustNotMatch";
+    private static final String CRAWLER_IP_MUSTMATCH          = "crawlerIPMustMatch";
+    private static final String CRAWLER_IP_MUSTNOTMATCH       = "crawlerIPMustNotMatch";
+    private static final String CRAWLER_COUNTRY_MUSTMATCH     = "crawlerCountryMustMatch";
+    private static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
+    private static final String INDEXING_URL_MUSTMATCH        = "indexURLMustMatch";
+    private static final String INDEXING_URL_MUSTNOTMATCH     = "indexURLMustNotMatch";
+    private static final String COLLECTIONS = "collections";

    private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
    private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
    private Pattern crawlernodepthlimitmatch = null;
    private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;

-    public final static class DomProfile {
-
-        public String referrer;
-        public int depth, count;
-
-        public DomProfile(final String ref, final int d) {
-            this.referrer = ref;
-            this.depth = d;
-            this.count = 1;
-        }
-
-        public void inc() {
-            this.count++;
-        }
-
-    }
-
-    private final Map<String, DomProfile> doms;
+    private final Map<String, AtomicInteger> doms;

    /**
     * Constructor which creates CrawlPofile from parameters.
@ -156,7 +136,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            throw new NullPointerException("name must not be null or empty");
        }
        if (name.length() > 256) name = name.substring(256);
-        this.doms = new ConcurrentHashMap<String, DomProfile>();
+        this.doms = new ConcurrentHashMap<String, AtomicInteger>();
        final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength);
        put(HANDLE,           handle);
        put(NAME,             name);
@ -177,9 +157,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        put(INDEX_MEDIA,      indexMedia);
        put(STORE_HTCACHE,    storeHTCache);
        put(REMOTE_INDEXING,  remoteIndexing);
-        put(XSSTOPW,          xsstopw); // exclude static stop-words
-        put(XDSTOPW,          xdstopw); // exclude dynamic stop-word
-        put(XPSTOPW,          xpstopw); // exclude parent stop-words
        put(CACHE_STRAGEGY,   cacheStrategy.toString());
        put(COLLECTIONS,      collections.trim().replaceAll(" ", ""));
    }
@ -191,25 +168,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
    public CrawlProfile(final Map<String, String> ext) {
        super(ext == null ? 1 : ext.size());
        if (ext != null) putAll(ext);
-        this.doms = new ConcurrentHashMap<String, DomProfile>();
+        this.doms = new ConcurrentHashMap<String, AtomicInteger>();
    }

-    public void domInc(final String domain, final String referrer, final int depth) {
-        final DomProfile dp = this.doms.get(domain);
-        if (dp == null) {
-            // new domain
-            this.doms.put(domain, new DomProfile(referrer, depth));
-        } else {
-            // increase counter
-            dp.inc();
-        }
-    }
+    /**
+     * Increments the counter kept for the given domain; the counter is created
+     * with the value 1 when the domain is seen for the first time.
+     *
+     * @param domain the domain (host name) whose counter is increased
+     */
+    public void domInc(final String domain) {
+        AtomicInteger counter = this.doms.get(domain);
+        if (counter == null) {
+            // new domain: look up again and insert under one lock, otherwise two
+            // threads could both insert a fresh counter and one count would be lost
+            synchronized (this.doms) {
+                counter = this.doms.get(domain);
+                if (counter == null) {
+                    this.doms.put(domain, new AtomicInteger(1));
+                    return;
+                }
+            }
+        }
+        // known domain: increase counter
+        counter.incrementAndGet();
+    }

-    public String domName(final boolean attr, final int index){
-        final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
+    private String domName(final boolean attr, final int index){
+        final Iterator<Map.Entry<String, AtomicInteger>> domnamesi = this.doms.entrySet().iterator();
        String domname="";
-        Map.Entry<String, DomProfile> ey;
-        DomProfile dp;
+        Map.Entry<String, AtomicInteger> ey;
+        AtomicInteger dp;
        int i = 0;
        while ((domnamesi.hasNext()) && (i < index)) {
            ey = domnamesi.next();
@ -218,16 +195,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        if (domnamesi.hasNext()) {
            ey = domnamesi.next();
            dp = ey.getValue();
-            domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
+            domname = ey.getKey() + ((attr) ? ("/c=" + dp.get()) : " ");
        }
        return domname;
    }

-    public void clearDoms() {
-        this.doms.clear();
-    }
-
-    public DomProfile getDom(final String domain) {
+    public AtomicInteger getCount(final String domain) {
        return this.doms.get(domain);
    }

@ -245,7 +218,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     * @param key name of the parameter
     * @param value values if the parameter
     */
-    public final void put(final String key, final int value) {
+    private final void put(final String key, final int value) {
        super.put(key, Integer.toString(value));
    }

@ -254,7 +227,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     * @param key name of the parameter
     * @param value values if the parameter
     */
-    public final void put(final String key, final long value) {
+    private final void put(final String key, final long value) {
        super.put(key, Long.toString(value));
    }

@ -476,12 +449,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        return (r.equals(Boolean.TRUE.toString()));
    }

-    public boolean pushSolr() {
-        final String r = get(PUSH_SOLR);
-        if (r == null) return true;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
    public boolean indexText() {
        final String r = get(INDEX_TEXT);
        if (r == null) return true;
@ -505,24 +472,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        return (r.equals(Boolean.TRUE.toString()));
    }

-    public boolean excludeStaticStopwords() {
-        final String r = get(XSSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
-    public boolean excludeDynamicStopwords() {
-        final String r = get(XDSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
-    public boolean excludeParentStopwords() {
-        final String r = get(XPSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
    public static long getRecrawlDate(final long oldTimeMinutes) {
        return System.currentTimeMillis() - (60000L * oldTimeMinutes);
    }
@ -535,7 +484,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
    }

-    public static String mustMatchSubpath(final MultiProtocolURI uri) {
+    private static String mustMatchSubpath(final MultiProtocolURI uri) {
        String u = uri.toNormalform(true);
        if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
        return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
--- a/source/net/yacy/crawler/data/NoticedURL.java
+++ b/source/net/yacy/crawler/data/NoticedURL.java
@ -173,19 +173,14 @@ public class NoticedURL {
     * @param entry
     * @return null if this was successful or a String explaining what went wrong in case of an error
     */
-    public String push(final StackType stackType, final Request entry) {
+    public String push(final StackType stackType, final Request entry, final RobotsTxt robots) {
        try {
            switch (stackType) {
-                case LOCAL:
-                    return this.coreStack.push(entry);
-                case GLOBAL:
-                    return this.limitStack.push(entry);
-                case REMOTE:
-                    return this.remoteStack.push(entry);
-                case NOLOAD:
-                    return this.noloadStack.push(entry);
-                default:
-                    return "stack type unknown";
+                case LOCAL: return this.coreStack.push(entry, robots);
+                case GLOBAL: return this.limitStack.push(entry, robots);
+                case REMOTE: return this.remoteStack.push(entry, robots);
+                case NOLOAD: return this.noloadStack.push(entry, robots);
+                default: return "stack type unknown";
            }
        } catch (final Exception er) {
            Log.logException(er);
@ -277,7 +272,7 @@ public class NoticedURL {
        try {
            final Request entry = pop(fromStack, false, cs, robots);
            if (entry != null) {
-                final String warning = push(toStack, entry);
+                final String warning = push(toStack, entry, robots);
                if (warning != null) {
                    Log.logWarning("NoticedURL", "shift from " + fromStack + " to " + toStack + ": " + warning);
                }
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@ -234,6 +234,109 @@ public class RobotsTxt {
        return robotsTxt4Host;
    }

+    /**
+     * Makes sure the robots table holds an entry for the host:port of the given URL.
+     * Returns without doing anything when the table cannot be opened, is null, or
+     * already contains a key for that host:port. Otherwise the entry is created
+     * either in a newly started thread or in the calling thread.
+     *
+     * @param theURL     URL whose host:port (as computed by getHostPort) selects the entry
+     * @param thisAgents agent names handed on to the loading step
+     * @param concurrent true to load in a new thread, false to load in the calling thread
+     */
+    public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents, boolean concurrent) {
+        final String hostPort = getHostPort(theURL);
+
+        final BEncodedHeap table;
+        try {
+            table = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
+        } catch (IOException e1) {
+            log.fatal("tables not available", e1);
+            return;
+        }
+
+        // nothing to do without a table or when the host is already known
+        if (table == null) return;
+        if (table.containsKey(table.encodedKey(hostPort))) return;
+
+        if (!concurrent) {
+            ensureExist(hostPort, table, thisAgents);
+            return;
+        }
+
+        // load in the background so that the caller is not blocked
+        final Thread loader = new Thread() {
+            public void run() {
+                ensureExist(hostPort, table, thisAgents);
+            }
+        };
+        loader.start();
+    }
+    
+    /**
+     * Loads the robots.txt for one host:port and hands the result to addEntry,
+     * unless the robots table already has a key for that host:port.
+     * <p>
+     * Work for the same host:port is serialized on a per-host DomSync object. When
+     * nothing could be loaded, an entry with empty allow and deny lists is added so
+     * that the loading is not done twice. A 401 or 403 status is recorded as a deny
+     * of "/".
+     *
+     * @param urlHostPort host:port the robots.txt belongs to
+     * @param robotsTable table that is checked for an already existing entry
+     * @param thisAgents  agent names passed to the robots.txt parser
+     */
+    private void ensureExist(final String urlHostPort, BEncodedHeap robotsTable, final Set<String> thisAgents) {
+
+        // make or get a synchronization object; lookup and insert happen under one
+        // lock, otherwise two callers could each create a DomSync for the same host
+        // and then load the same robots.txt in parallel
+        // NOTE(review): other code that fills syncObjects should take the same lock - confirm
+        final DomSync syncObj;
+        synchronized (RobotsTxt.this.syncObjects) {
+            DomSync existing = RobotsTxt.this.syncObjects.get(urlHostPort);
+            if (existing == null) {
+                existing = new DomSync();
+                RobotsTxt.this.syncObjects.put(urlHostPort, existing);
+            }
+            syncObj = existing;
+        }
+
+        // we can now synchronize for each host separately
+        synchronized (syncObj) {
+            // another thread may have stored the entry while we waited for the lock
+            if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
+
+            // generating the proper url to download the robots txt
+            DigestURI robotsURL = null;
+            try {
+                robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
+            } catch (final MalformedURLException e) {
+                log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
+            }
+
+            Response response = null;
+            if (robotsURL != null) {
+                if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+                final Request request = new Request(robotsURL, null);
+                try {
+                    response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+                } catch (IOException e) {
+                    // a failed download is handled like a missing robots.txt
+                    if (log.isDebugEnabled()) log.debug("Loading '" + robotsURL + "' failed: " + e.getMessage());
+                    response = null;
+                }
+            }
+
+            if (response == null) {
+                // no robots.txt available, make an entry to prevent that the robots loading is done twice
+                // generate artificial entry
+                final RobotsTxtEntry placeholder = new RobotsTxtEntry(
+                        robotsURL,
+                        new ArrayList<String>(),
+                        new ArrayList<String>(),
+                        new Date(),
+                        new Date(),
+                        null,
+                        null,
+                        Integer.valueOf(0),
+                        null);
+
+                // store the data into the robots DB
+                final int sz = robotsTable.size();
+                addEntry(placeholder);
+                if (robotsTable.size() <= sz) {
+                    log.fatal("new entry in robots.txt table failed, resetting database");
+                    try {
+                        clear();
+                    } catch (IOException e) {
+                        // do not hide a failed reset; the second addEntry is still attempted
+                        log.fatal("resetting the robots.txt database failed", e);
+                    }
+                    addEntry(placeholder);
+                }
+                return;
+            }
+
+            final byte[] robotsTxt = response.getContent();
+            final int statusCode = response.getResponseHeader().getStatusCode();
+            final RobotsTxtParser parserResult;
+            final ArrayList<String> denyPath;
+            if (statusCode == 401 || statusCode == 403) {
+                parserResult = new RobotsTxtParser(thisAgents);
+                // create virtual deny path
+                denyPath = new ArrayList<String>();
+                denyPath.add("/");
+            } else {
+                parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
+                denyPath = parserResult.denyList();
+            }
+
+            // store the data into the robots DB
+            final String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
+            addEntry(
+                    robotsURL,
+                    parserResult.allowList(),
+                    denyPath,
+                    new Date(),
+                    response.getResponseHeader().lastModified(),
+                    etag,
+                    parserResult.sitemap(),
+                    parserResult.crawlDelayMillis(),
+                    parserResult.agentName());
+        }
+    }
+    
    private RobotsTxtEntry addEntry(
    		final MultiProtocolURI theURL,
    		final ArrayList<String> allowPathList,
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -804,6 +804,7 @@ public final class Switchboard extends serverSwitch
        // initializing the stackCrawlThread
        this.crawlStacker =
            new CrawlStacker(
+                this.robots,
                this.crawlQueues,
                this.crawler,
                this.index,
@ -1318,6 +1319,7 @@ public final class Switchboard extends serverSwitch

            this.crawlStacker =
                new CrawlStacker(
+                    this.robots,
                    this.crawlQueues,
                    this.crawler,
                    this.index,
@ -2802,9 +2804,9 @@ public final class Switchboard extends serverSwitch
        }
        final String s;
        if (asglobal) {
-            s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request);
+            s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
        } else {
-            s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request);
+            s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
        }

        if (s != null) {