replaced the custom robots.txt loader with the standard http loader

pull/1/head
Michael Peter Christen 13 years ago
parent 799d71bc67
commit 2d9e577ad0
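
Net effect of the hunks below: RobotsTxt is now constructed with a LoaderDispatcher and fetches robots.txt through the standard HTTP loader, and getEntry() no longer throws IOException, so the callers (CrawlCheck_p, getpageinfo, getpageinfo_p, Latency) drop their try/catch blocks. A minimal sketch of the new call pattern, assuming a Switchboard-like holder named sb (illustrative only, pieced together from the CrawlCheck_p and Switchboard hunks):

    // wiring, as in the Switchboard hunk: the loader must exist before the robots DB
    // this.robots = new RobotsTxt(this.tables, this.loader);

    void checkRobots(final Switchboard sb, final DigestURI url) {
        // getEntry() no longer declares IOException, so there is nothing to catch
        final RobotsTxtEntry entry = sb.robots.getEntry(url, sb.peers.myBotIDs());
        final boolean allowed = entry == null || !entry.isDisallowed(url);
        final long delayMillis = entry == null
                ? CrawlQueues.queuedMinLoadDelay
                : Math.max(CrawlQueues.queuedMinLoadDelay, entry.getCrawlDelayMillis());
        System.out.println(url + " allowed=" + allowed + ", crawl delay=" + delayMillis + " ms");
    }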

@@ -84,19 +84,16 @@ public class CrawlCheck_p {
// try to load the robots
RobotsTxtEntry robotsEntry;
boolean robotsAllowed = true;
try {
robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
if (robotsEntry == null) {
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
robotsAllowed = !robotsEntry.isDisallowed(u);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
}
} catch (final IOException e) {
robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
if (robotsEntry == null) {
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
robotsAllowed = !robotsEntry.isDisallowed(u);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
}
// try to load the url

@@ -148,13 +148,7 @@ public class getpageinfo {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
RobotsTxtEntry robotsEntry;
try {
robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (final IOException e) {
robotsEntry = null;
Log.logException(e);
}
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

@@ -148,13 +148,7 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
RobotsTxtEntry robotsEntry;
try {
robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (final IOException e) {
robotsEntry = null;
Log.logException(e);
}
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

@@ -464,10 +464,13 @@ public class Balancer {
rest = rest + 1000 * loops;
loops = 0;
}
if (rest > 0) {try {Thread.sleep(rest);} catch (final InterruptedException e) {}}
for (int i = 0; i < loops; i++) {
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {Thread.sleep(1000); } catch (final InterruptedException e) {}
synchronized(this) {
// must be synchronized here to avoid 'takeover' moves from other threads which then idle the same time which would not be enough
if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
for (int i = 0; i < loops; i++) {
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {this.wait(1000); } catch (final InterruptedException e) {}
}
}
Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
}
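
The Balancer hunk above replaces plain Thread.sleep with this.wait inside a synchronized(this) block; per the in-code comment, this avoids 'takeover' moves by other threads that would then idle the same time, which would not be enough. A stripped-down sketch of the pattern, minus the per-host log line (method and parameter names are illustrative):

    // wait out a crawl delay on the balancer's own monitor instead of sleeping unconditionally
    synchronized void waitOutCrawlDelay(final long restMillis, final int fullSeconds) {
        if (restMillis > 0) {
            try { this.wait(restMillis); } catch (final InterruptedException e) {}
        }
        for (int i = 0; i < fullSeconds; i++) {
            try { this.wait(1000); } catch (final InterruptedException e) {}
        }
    }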

@@ -45,8 +45,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
public final class CrawlSwitchboard
{
public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";

@@ -316,7 +316,6 @@ public class CrawlQueues {
* @param stats String for log prefixing
* @return
*/
@SuppressWarnings("unused")
private void load(final Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
if (profile != null) {
@@ -340,7 +339,16 @@ public class CrawlQueues {
if (urlEntry == null || urlEntry.url() == null) {
this.log.logInfo(stats + ": urlEntry = null");
} else {
new Loader(urlEntry);
if (!this.workers.containsKey(Integer.valueOf(urlEntry.hashCode()))) {
Loader loader = new Loader(urlEntry);
this.workers.put(loader.code, loader);
try {
loader.start();
} catch (final OutOfMemoryError e) {
Log.logWarning("CrawlQueues", "crawlWorker sequential fail-over: " + e.getMessage());
loader.run();
}
}
}
} else {
@@ -615,16 +623,7 @@ public class CrawlQueues {
this.request = entry;
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
if (!CrawlQueues.this.workers.containsKey(this.code)) {
CrawlQueues.this.workers.put(this.code, this);
try {
start();
} catch (final OutOfMemoryError e) {
Log.logWarning("CrawlQueues", "crawlWorker sequential fail-over: " + e.getMessage());
run();
}
}
setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
}
public long age() {
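
The two CrawlQueues hunks above move worker registration and start-up out of the Loader constructor into the caller, but keep the same fail-over idea: if the JVM cannot spawn another worker thread, the request is handled synchronously in the calling thread. The pattern in isolation, detached from the YaCy classes (the Runnable and thread name are illustrative):

    static void startOrRunInline(final Runnable work) {
        final Thread worker = new Thread(work, "crawlWorker");
        try {
            worker.start();   // preferred: handle the request in a parallel worker
        } catch (final OutOfMemoryError e) {
            work.run();       // sequential fail-over: no room for another thread, do the work inline
        }
    }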

@@ -23,7 +23,6 @@
package net.yacy.crawler.data;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@@ -100,12 +99,7 @@ public class Latency {
*/
public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
long robotsDelay = 0;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = robots.getEntry(url, thisAgents);
} catch (final IOException e) {
robotsEntry = null;
}
RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents);
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
return robotsDelay;

@@ -201,7 +201,7 @@ public final class HTTPLoader {
}
// create a new cache entry
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = request.profileHandle() == null ? null : this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
response = new Response(
request,
requestHeader,

@@ -292,9 +292,7 @@ public class Request extends WorkflowJob
public String profileHandle() {
// the handle of the crawl profile
assert this.profileHandle.length() == Word.commonHashLength : this.profileHandle
+ " != "
+ Word.commonHashLength;
assert this.profileHandle == null || this.profileHandle.length() == Word.commonHashLength : this.profileHandle + " != " + Word.commonHashLength;
return this.profileHandle;
}

@@ -36,17 +36,15 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;
import org.apache.log4j.Logger;
@@ -61,14 +59,16 @@ public class RobotsTxt {
private final ConcurrentHashMap<String, DomSync> syncObjects;
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
private final WorkTables tables;
private final LoaderDispatcher loader;
private static class DomSync {
private DomSync() {}
}
public RobotsTxt(final WorkTables worktables) {
public RobotsTxt(final WorkTables worktables, LoaderDispatcher loader) {
this.syncObjects = new ConcurrentHashMap<String, DomSync>();
this.tables = worktables;
this.loader = loader;
try {
this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
//log.info("initiated robots table: " + this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).getFile());
@@ -90,23 +90,31 @@ public class RobotsTxt {
return this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).size();
}
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
return getEntry(theURL, thisAgents, true);
}
private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
// this method will always return a non-null value
final String urlHostPort = getHostPort(theURL);
RobotsTxtEntry robotsTxt4Host = null;
Map<String, byte[]> record;
final BEncodedHeap robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
BEncodedHeap robotsTable = null;
try {
robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
} catch (IOException e1) {
log.fatal("tables not available", e1);
}
try {
record = robotsTable.get(robotsTable.encodedKey(urlHostPort));
} catch (final SpaceExceededException e) {
log.warn("memory exhausted", e);
record = null;
} catch (IOException e) {
log.warn("cannot get robotstxt from table", e);
record = null;
}
if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
@@ -135,6 +143,9 @@ public class RobotsTxt {
} catch (final SpaceExceededException e) {
log.warn("memory exhausted", e);
record = null;
} catch (IOException e) {
log.warn("cannot get robotstxt from table", e);
record = null;
}
if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
if (robotsTxt4Host != null &&
@@ -144,32 +155,26 @@ public class RobotsTxt {
}
// generating the proper url to download the robots txt
MultiProtocolURI robotsURL = null;
DigestURI robotsURL = null;
try {
robotsURL = new MultiProtocolURI("http://" + urlHostPort + "/robots.txt");
robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null;
}
Object[] result = null;
Response response = null;
if (robotsURL != null) {
if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
Request request = new Request(robotsURL, null);
try {
result = downloadRobotsTxt(robotsURL, 3, robotsTxt4Host);
} catch (final Exception e) {
result = null;
response = this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
} catch (IOException e) {
response = null;
}
}
/*
assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
"robots-url=" + robotsURL.toString() +
", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : UTF8.String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
loadedRobots.add(robotsURL.toNormalform(false, false));
*/
if (result == null) {
if (response == null) {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if (robotsTxt4Host == null) {
// generate artificial entry
@@ -192,15 +197,15 @@ public class RobotsTxt {
addEntry(robotsTxt4Host);
if (robotsTable.size() <= sz) {
log.fatal("new entry in robots.txt table failed, resetting database");
clear();
try {clear();} catch (IOException e) {}
addEntry(robotsTxt4Host);
}
} else {
final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
final byte[] robotsTxt = response.getContent();
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
RobotsTxtParser parserResult;
ArrayList<String> denyPath;
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
parserResult = new RobotsTxtParser(thisAgents);
// create virtual deny path
denyPath = new ArrayList<String>();
@@ -211,13 +216,14 @@ public class RobotsTxt {
}
// store the data into the robots DB
String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
robotsTxt4Host = addEntry(
robotsURL,
parserResult.allowList(),
denyPath,
new Date(),
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
response.getResponseHeader().lastModified(),
etag,
parserResult.sitemap(),
parserResult.crawlDelayMillis(),
parserResult.agentName());
@@ -259,13 +265,6 @@ public class RobotsTxt {
}
}
// methods that had been in robotsParser.java:
private static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
static final int DOWNLOAD_ROBOTS_TXT = 1;
private static final int DOWNLOAD_ETAG = 2;
private static final int DOWNLOAD_MODDATE = 3;
static final String getHostPort(final MultiProtocolURI theURL) {
final int port = getPort(theURL);
String host = theURL.getHost();
@@ -287,131 +286,4 @@ public class RobotsTxt {
return port;
}
protected static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsTxtEntry entry) throws Exception {
if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;
if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
redirectionCount--;
boolean accessCompletelyRestricted = false;
byte[] robotsTxt = null;
long downloadStart, downloadEnd;
String eTag=null, oldEtag = null;
Date lastMod=null;
downloadStart = System.currentTimeMillis();
// if we previously have downloaded this robots.txt then we can set the if-modified-since header
RequestHeader reqHeaders = new RequestHeader();
// add yacybot user agent
reqHeaders.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
// adding referer
reqHeaders.put(RequestHeader.REFERER, (MultiProtocolURI.newURL(robotsURL,"/")).toNormalform(true));
reqHeaders.put(HeaderFramework.ACCEPT, HTTPLoader.DEFAULT_ACCEPT);
if (entry != null) {
oldEtag = entry.getETag();
reqHeaders = new RequestHeader();
final Date modDate = entry.getModDate();
if (modDate != null) reqHeaders.put(RequestHeader.IF_MODIFIED_SINCE, HeaderFramework.formatRFC1123(entry.getModDate()));
}
// setup http-client
//TODO: adding Traffic statistic for robots download?
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
client.setHeader(reqHeaders.entrySet());
try {
// check for interruption
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress.");
// sending the get request
robotsTxt = client.GETbytes(robotsURL);
// statistics:
if (robotsTxt != null) {
ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length);
}
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
// check the response status
if (code > 199 && code < 300) {
if (!header.mime().startsWith("text/plain")) {
robotsTxt = null;
log.info("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'.");
} else {
// getting some metadata
eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null;
lastMod = header.lastModified();
// if the robots.txt file was not changed we break here
if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
if (log.isDebugEnabled()) log.debug("Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
return null;
}
downloadEnd = System.currentTimeMillis();
if (log.isDebugEnabled()) log.debug("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms.");
}
} else if (code == 304) {
return null;
} else if (code > 299 && code < 400) {
// getting redirection URL
String redirectionUrlString = header.get(HeaderFramework.LOCATION);
if (redirectionUrlString==null) {
if (log.isDebugEnabled())
log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + client.getHttpResponse().getStatusLine() + "].");
robotsTxt = null;
} else {
redirectionUrlString = redirectionUrlString.trim();
// generating the new URL object
final MultiProtocolURI redirectionUrl = MultiProtocolURI.newURL(robotsURL, redirectionUrlString);
// following the redirection
if (log.isDebugEnabled()) log.debug("Redirection detected for robots.txt with URL '" + robotsURL + "'." +
"\nRedirecting request to: " + redirectionUrl);
return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);
}
} else if (code == 401 || code == 403) {
accessCompletelyRestricted = true;
log.info("Access to Robots.txt not allowed on URL '" + robotsURL + "', redirectionCount = " + redirectionCount); // since this is a strange case we log it all the time
} else {
if (log.isDebugEnabled())
log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "].");
robotsTxt = null;
}
} catch (final Exception e) {
throw e;
}
return new Object[]{Boolean.valueOf(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
}
public final static void main(final String[] args) throws Exception {
final String url = "http://www.badelatschen.net/robots.txt";
final Object[] o = downloadRobotsTxt(new MultiProtocolURI(url), 0, null);
if (o == null) {
System.out.println("result: null");
} else {
System.out.println("not allowed = " + ((Boolean) o[0]).toString());
System.out.println("robots = " + ((o[1] == null) ? "null" : UTF8.String((byte[]) o[1])));
}
System.exit(0);
/*
final HttpClient httpclient = new DefaultHttpClient();
try {
final HttpGet httpget = new HttpGet(url);
final ResponseHandler<String> responseHandler = new BasicResponseHandler();
final String responseBody = httpclient.execute(httpget, responseHandler);
System.out.println(responseBody);
} finally {
httpclient.getConnectionManager().shutdown();
}
*/
}
}
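
Net effect of the RobotsTxt hunks above: the hand-rolled downloadRobotsTxt HTTP code (redirect handling, mime check, byte accounting) is deleted, and robots.txt is fetched through the same LoaderDispatcher as any other document. A condensed sketch of the new getEntry() download path, with database bookkeeping and the artificial "no robots.txt" entry omitted (imports as in RobotsTxt.java):

    DigestURI robotsURL = null;
    try {
        robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
    } catch (final MalformedURLException e) {
        robotsURL = null;
    }
    Response response = null;
    if (robotsURL != null) {
        try {
            response = this.loader.load(new Request(robotsURL, null), CacheStrategy.NOCACHE, null, 0);
        } catch (final IOException e) {
            response = null;   // handled like "no robots.txt available"
        }
    }
    if (response != null) {
        final int status = response.getResponseHeader().getStatusCode();
        final boolean accessRestricted = (status == 401 || status == 403);   // parsed as a virtual deny-all
        final byte[] robotsTxt = response.getContent();
        final String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG)
                ? response.getResponseHeader().get(HeaderFramework.ETAG).trim() : null;
        final Date lastMod = response.getResponseHeader().lastModified();
        // robotsTxt, etag and lastMod feed the RobotsTxtEntry stored in the robots table
    }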

@@ -35,15 +35,11 @@ import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.data.meta.DigestURI;
/*
* A class for Parsing robots.txt files.
@@ -100,35 +96,6 @@ public final class RobotsTxtParser {
}
}
public static RobotsTxtParser getRobots(String homepage) {
DigestURI theURL;
try {
theURL = new DigestURI(homepage);
} catch (MalformedURLException e1) {
return null;
}
final String urlHostPort = RobotsTxt.getHostPort(theURL);
MultiProtocolURI robotsURL = null;
try {
robotsURL = new MultiProtocolURI("http://" + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
return null;
}
Object[] result;
try {
result = RobotsTxt.downloadRobotsTxt(robotsURL, 0, null);
} catch (Exception e) {
return null;
}
final byte[] robotsTxt = (byte[]) result[RobotsTxt.DOWNLOAD_ROBOTS_TXT];
RobotsTxtParser parserResult = new RobotsTxtParser(new HashSet<String>(), robotsTxt);
return parserResult;
}
private void parse(final BufferedReader reader) {
final ArrayList<String> deny4AllAgents = new ArrayList<String>();
final ArrayList<String> deny4ThisAgents = new ArrayList<String>();

@@ -194,8 +194,8 @@ public final class LoaderDispatcher {
}
// check if we have the page in the cache
final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry

@@ -596,11 +596,6 @@ public final class Switchboard extends serverSwitch
}.start();
*/
// load the robots.txt db
this.log.logConfig("Initializing robots.txt DB");
this.robots = new RobotsTxt(this.tables);
this.log.logConfig("Loaded robots.txt DB: " + this.robots.size() + " entries");
// start a cache manager
this.log.logConfig("Starting HT Cache Manager");
@@ -718,6 +713,13 @@ public final class Switchboard extends serverSwitch
// start a loader
this.log.logConfig("Starting Crawl Loader");
this.loader = new LoaderDispatcher(this);
// load the robots.txt db
this.log.logConfig("Initializing robots.txt DB");
this.robots = new RobotsTxt(this.tables, this.loader);
this.log.logConfig("Loaded robots.txt DB: " + this.robots.size() + " entries");
// load oai tables
final Map<String, File> oaiFriends =
OAIListFriendsLoader.loadListFriendsSources(
new File("defaults/oaiListFriendsSource.xml"),
