diff --git a/htroot/CrawlCheck_p.java b/htroot/CrawlCheck_p.java
index a4f2052d9..3a7f4689a 100644
--- a/htroot/CrawlCheck_p.java
+++ b/htroot/CrawlCheck_p.java
@@ -84,19 +84,16 @@ public class CrawlCheck_p {
             // try to load the robots
             RobotsTxtEntry robotsEntry;
             boolean robotsAllowed = true;
-            try {
-                robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
-                if (robotsEntry == null) {
-                    prop.put("table_list_" + row + "_robots", "no robots");
-                    prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
-                    prop.put("table_list_" + row + "_sitemap", "");
-                } else {
-                    robotsAllowed = !robotsEntry.isDisallowed(u);
-                    prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
-                    prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
-                    prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
-                }
-            } catch (final IOException e) {
+            robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
+            if (robotsEntry == null) {
+                prop.put("table_list_" + row + "_robots", "no robots");
+                prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
+                prop.put("table_list_" + row + "_sitemap", "");
+            } else {
+                robotsAllowed = !robotsEntry.isDisallowed(u);
+                prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
+                prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
+                prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
             }
 
             // try to load the url
diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java
index 9c71e495a..b5c073c3c 100644
--- a/htroot/api/getpageinfo.java
+++ b/htroot/api/getpageinfo.java
@@ -148,13 +148,7 @@ public class getpageinfo {
         final DigestURI theURL = new DigestURI(url);
 
         // determine if crawling of the current URL is allowed
-        RobotsTxtEntry robotsEntry;
-        try {
-            robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
-        } catch (final IOException e) {
-            robotsEntry = null;
-            Log.logException(e);
-        }
+        RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
         prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
         prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 756571ba5..d3205a4d9 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -148,13 +148,7 @@ public class getpageinfo_p {
         final DigestURI theURL = new DigestURI(url);
 
         // determine if crawling of the current URL is allowed
-        RobotsTxtEntry robotsEntry;
-        try {
-            robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
-        } catch (final IOException e) {
-            robotsEntry = null;
-            Log.logException(e);
-        }
+        RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
         prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
         prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index 7e8f9eeec..be3ea0f28 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -464,10 +464,13 @@ public class Balancer {
                     rest = rest + 1000 * loops;
                     loops = 0;
                 }
-                if (rest > 0) {try {Thread.sleep(rest);} catch (final InterruptedException e) {}}
-                for (int i = 0; i < loops; i++) {
-                    Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
-                    try {Thread.sleep(1000); } catch (final InterruptedException e) {}
+                synchronized(this) {
+                    // must be synchronized here to avoid 'takeover' moves from other threads which then idle the same time which would not be enough
+                    if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
+                    for (int i = 0; i < loops; i++) {
+                        Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
+                        try {this.wait(1000); } catch (final InterruptedException e) {}
+                    }
                 }
                 Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
             }
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index f82269594..ae0749238 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -45,8 +45,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.kelondroException;
 
-public final class CrawlSwitchboard
-{
+public final class CrawlSwitchboard {
 
     public static final String CRAWL_PROFILE_PROXY = "proxy";
     public static final String CRAWL_PROFILE_REMOTE = "remote";
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index c63ecfbd4..5e1bcee60 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -316,7 +316,6 @@ public class CrawlQueues {
      * @param stats String for log prefixing
      * @return
      */
-    @SuppressWarnings("unused")
     private void load(final Request urlEntry, final String stats, final String profileHandle) {
         final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
         if (profile != null) {
@@ -340,7 +339,16 @@
             if (urlEntry == null || urlEntry.url() == null) {
                 this.log.logInfo(stats + ": urlEntry = null");
             } else {
-                new Loader(urlEntry);
+                if (!this.workers.containsKey(Integer.valueOf(urlEntry.hashCode()))) {
+                    Loader loader = new Loader(urlEntry);
+                    this.workers.put(loader.code, loader);
+                    try {
+                        loader.start();
+                    } catch (final OutOfMemoryError e) {
+                        Log.logWarning("CrawlQueues", "crawlWorker sequential fail-over: " + e.getMessage());
+                        loader.run();
+                    }
+                }
             }
 
         } else {
@@ -615,16 +623,7 @@
             this.request = entry;
             this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
             this.code = Integer.valueOf(entry.hashCode());
-            if (!CrawlQueues.this.workers.containsKey(this.code)) {
-                CrawlQueues.this.workers.put(this.code, this);
-                try {
-                    start();
-                } catch (final OutOfMemoryError e) {
-                    Log.logWarning("CrawlQueues", "crawlWorker sequential fail-over: " + e.getMessage());
-                    run();
-                }
-            }
-            setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
+            this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
         }
 
         public long age() {
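The two CrawlQueues hunks above move worker bookkeeping out of the Loader constructor: the caller now checks for an existing worker, registers the new Loader, and only then starts it, keeping the OutOfMemoryError fail-over to sequential execution. A minimal, self-contained sketch of that pattern follows (class and method names are invented for illustration; this is not YaCy code):

    import java.util.concurrent.ConcurrentHashMap;

    public class WorkerStartSketch {

        private final ConcurrentHashMap<Integer, Thread> workers = new ConcurrentHashMap<Integer, Thread>();

        public void submit(final String entry) {
            final Integer code = Integer.valueOf(entry.hashCode());
            if (this.workers.containsKey(code)) return;         // this entry is already being loaded
            final Thread worker = new Thread("loader-" + code) {
                @Override
                public void run() {
                    try {
                        System.out.println("loading " + entry); // stand-in for the actual crawl work
                    } finally {
                        workers.remove(code);                   // deregister when finished
                    }
                }
            };
            worker.setPriority(Thread.MIN_PRIORITY);            // crawler work should not starve other functions
            this.workers.put(code, worker);                     // register first ...
            try {
                worker.start();                                 // ... then start the thread
            } catch (final OutOfMemoryError e) {
                worker.run();                                   // sequential fail-over in the caller's thread
            }
        }
    }

Starting the thread from the caller rather than from the constructor also avoids publishing a not-yet-fully-constructed object to other threads.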
diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java
index 113ee976b..cde93d97b 100644
--- a/source/net/yacy/crawler/data/Latency.java
+++ b/source/net/yacy/crawler/data/Latency.java
@@ -23,7 +23,6 @@
 
 package net.yacy.crawler.data;
 
-import java.io.IOException;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
@@ -100,12 +99,7 @@ public class Latency {
      */
     public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set thisAgents) {
         long robotsDelay = 0;
-        RobotsTxtEntry robotsEntry;
-        try {
-            robotsEntry = robots.getEntry(url, thisAgents);
-        } catch (final IOException e) {
-            robotsEntry = null;
-        }
+        RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents);
         robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
         if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
         return robotsDelay;
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index dcf6ed3aa..769cf1d65 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -201,7 +201,7 @@ public final class HTTPLoader {
         }
 
         // create a new cache entry
-        final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+        final CrawlProfile profile = request.profileHandle() == null ? null : this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
         response = new Response(
                 request,
                 requestHeader,
diff --git a/source/net/yacy/crawler/retrieval/Request.java b/source/net/yacy/crawler/retrieval/Request.java
index 3559f2699..531531e55 100644
--- a/source/net/yacy/crawler/retrieval/Request.java
+++ b/source/net/yacy/crawler/retrieval/Request.java
@@ -292,9 +292,7 @@ public class Request extends WorkflowJob
 
     public String profileHandle() {
         // the handle of the crawl profile
-        assert this.profileHandle.length() == Word.commonHashLength : this.profileHandle
-            + " != "
-            + Word.commonHashLength;
+        assert this.profileHandle == null || this.profileHandle.length() == Word.commonHashLength : this.profileHandle + " != " + Word.commonHashLength;
         return this.profileHandle;
     }
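Request.profileHandle() above now accepts a null handle, and HTTPLoader (like LoaderDispatcher further down) guards the profile lookup accordingly; the RobotsTxt change in the next file relies on this because its robots.txt fetch builds a Request that carries no crawl profile handle. A small self-contained sketch of that contract, with an invented placeholder constant standing in for Word.commonHashLength:

    import java.nio.charset.StandardCharsets;

    public class ProfileHandleSketch {

        private static final int COMMON_HASH_LENGTH = 12;   // placeholder for Word.commonHashLength

        private final String profileHandle;                 // may be null for internal requests such as robots.txt

        public ProfileHandleSketch(final String profileHandle) {
            this.profileHandle = profileHandle;
        }

        public String profileHandle() {
            // a null handle is tolerated; a non-null handle must still have the expected length
            assert this.profileHandle == null || this.profileHandle.length() == COMMON_HASH_LENGTH
                    : this.profileHandle + " != " + COMMON_HASH_LENGTH;
            return this.profileHandle;
        }

        public static void main(final String[] args) {
            final ProfileHandleSketch robotsRequest = new ProfileHandleSketch(null);
            final String handle = robotsRequest.profileHandle();
            // guard before turning the handle into a lookup key, as the patched callers do
            final byte[] key = handle == null ? null : handle.getBytes(StandardCharsets.US_ASCII);
            System.out.println(key == null ? "no profile lookup for internal request" : "lookup key, " + key.length + " bytes");
        }
    }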
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index c30db0801..b0817c2a3 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -36,17 +36,15 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.cora.document.UTF8;
-import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.HeaderFramework;
-import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.cora.protocol.ResponseHeader;
-import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.retrieval.HTTPLoader;
+import net.yacy.crawler.retrieval.Request;
+import net.yacy.crawler.retrieval.Response;
 import net.yacy.data.WorkTables;
 import net.yacy.kelondro.blob.BEncodedHeap;
-import net.yacy.kelondro.io.ByteCount;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.repository.LoaderDispatcher;
 
 import org.apache.log4j.Logger;
@@ -61,14 +59,16 @@ public class RobotsTxt {
     private final ConcurrentHashMap syncObjects;
     //private static final HashSet loadedRobots = new HashSet(); // only for debugging
     private final WorkTables tables;
+    private final LoaderDispatcher loader;
 
     private static class DomSync {
         private DomSync() {}
     }
 
-    public RobotsTxt(final WorkTables worktables) {
+    public RobotsTxt(final WorkTables worktables, LoaderDispatcher loader) {
         this.syncObjects = new ConcurrentHashMap();
         this.tables = worktables;
+        this.loader = loader;
         try {
             this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
             //log.info("initiated robots table: " + this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).getFile());
@@ -90,23 +90,31 @@ public class RobotsTxt {
         return this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).size();
     }
 
-    public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents) throws IOException {
+    public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents) {
         if (theURL == null) throw new IllegalArgumentException();
         if (!theURL.getProtocol().startsWith("http")) return null;
         return getEntry(theURL, thisAgents, true);
     }
 
-    private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
+    private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
         // this method will always return a non-null value
         final String urlHostPort = getHostPort(theURL);
         RobotsTxtEntry robotsTxt4Host = null;
         Map record;
-        final BEncodedHeap robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
+        BEncodedHeap robotsTable = null;
+        try {
+            robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
+        } catch (IOException e1) {
+            log.fatal("tables not available", e1);
+        }
         try {
             record = robotsTable.get(robotsTable.encodedKey(urlHostPort));
         } catch (final SpaceExceededException e) {
             log.warn("memory exhausted", e);
             record = null;
+        } catch (IOException e) {
+            log.warn("cannot get robotstxt from table", e);
+            record = null;
         }
         if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
@@ -135,6 +143,9 @@
                 } catch (final SpaceExceededException e) {
                     log.warn("memory exhausted", e);
                     record = null;
+                } catch (IOException e) {
+                    log.warn("cannot get robotstxt from table", e);
+                    record = null;
                 }
                 if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
                 if (robotsTxt4Host != null &&
@@ -144,32 +155,26 @@
             }
 
             // generating the proper url to download the robots txt
-            MultiProtocolURI robotsURL = null;
+            DigestURI robotsURL = null;
             try {
-                robotsURL = new MultiProtocolURI("http://" + urlHostPort + "/robots.txt");
+                robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
             } catch (final MalformedURLException e) {
                 log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
                 robotsURL = null;
             }
 
-            Object[] result = null;
+            Response response = null;
             if (robotsURL != null) {
                 if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+                Request request = new Request(robotsURL, null);
                 try {
-                    result = downloadRobotsTxt(robotsURL, 3, robotsTxt4Host);
-                } catch (final Exception e) {
-                    result = null;
+                    response = this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+                } catch (IOException e) {
+                    response = null;
                 }
             }
-            /*
-            assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
-                "robots-url=" + robotsURL.toString() +
-                ", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : UTF8.String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
-                ", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
-            loadedRobots.add(robotsURL.toNormalform(false, false));
-            */
-
-            if (result == null) {
+
+            if (response == null) {
                 // no robots.txt available, make an entry to prevent that the robots loading is done twice
                 if (robotsTxt4Host == null) {
                     // generate artificial entry
@@ -192,15 +197,15 @@
                 addEntry(robotsTxt4Host);
                 if (robotsTable.size() <= sz) {
                     log.fatal("new entry in robots.txt table failed, resetting database");
-                    clear();
+                    try {clear();} catch (IOException e) {}
                     addEntry(robotsTxt4Host);
                 }
             } else {
-                final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
+                final byte[] robotsTxt = response.getContent();
                 //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
                 RobotsTxtParser parserResult;
                 ArrayList denyPath;
-                if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
+                if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
                     parserResult = new RobotsTxtParser(thisAgents);
                     // create virtual deny path
                     denyPath = new ArrayList();
@@ -211,13 +216,14 @@
                 }
 
                 // store the data into the robots DB
+                String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
                 robotsTxt4Host = addEntry(
                         robotsURL,
                         parserResult.allowList(),
                         denyPath,
                         new Date(),
-                        (Date) result[DOWNLOAD_MODDATE],
-                        (String) result[DOWNLOAD_ETAG],
+                        response.getResponseHeader().lastModified(),
+                        etag,
                         parserResult.sitemap(),
                         parserResult.crawlDelayMillis(),
                         parserResult.agentName());
@@ -259,13 +265,6 @@
         }
     }
 
-    // methods that had been in robotsParser.java:
-
-    private static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
-    static final int DOWNLOAD_ROBOTS_TXT = 1;
-    private static final int DOWNLOAD_ETAG = 2;
-    private static final int DOWNLOAD_MODDATE = 3;
-
     static final String getHostPort(final MultiProtocolURI theURL) {
         final int port = getPort(theURL);
         String host = theURL.getHost();
@@ -287,131 +286,4 @@
         return port;
     }
 
-    protected static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsTxtEntry entry) throws Exception {
-        if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;
-
-        if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
-        redirectionCount--;
-
-        boolean accessCompletelyRestricted = false;
-        byte[] robotsTxt = null;
-        long downloadStart, downloadEnd;
-        String eTag=null, oldEtag = null;
-        Date lastMod=null;
-        downloadStart = System.currentTimeMillis();
-
-        // if we previously have downloaded this robots.txt then we can set the if-modified-since header
-        RequestHeader reqHeaders = new RequestHeader();
-
-        // add yacybot user agent
-        reqHeaders.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
-
-        // adding referer
-        reqHeaders.put(RequestHeader.REFERER, (MultiProtocolURI.newURL(robotsURL,"/")).toNormalform(true));
-
-        reqHeaders.put(HeaderFramework.ACCEPT, HTTPLoader.DEFAULT_ACCEPT);
-        if (entry != null) {
-            oldEtag = entry.getETag();
-            reqHeaders = new RequestHeader();
-            final Date modDate = entry.getModDate();
-            if (modDate != null) reqHeaders.put(RequestHeader.IF_MODIFIED_SINCE, HeaderFramework.formatRFC1123(entry.getModDate()));
-
-        }
-
-        // setup http-client
-        //TODO: adding Traffic statistic for robots download?
-        final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
-        client.setHeader(reqHeaders.entrySet());
-        try {
-            // check for interruption
-            if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress.");
-
-            // sending the get request
-            robotsTxt = client.GETbytes(robotsURL);
-            // statistics:
-            if (robotsTxt != null) {
-                ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length);
-            }
-            final int code = client.getHttpResponse().getStatusLine().getStatusCode();
-            final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
-
-            // check the response status
-            if (code > 199 && code < 300) {
-                if (!header.mime().startsWith("text/plain")) {
-                    robotsTxt = null;
-                    log.info("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'.");
-                } else {
-
-                    // getting some metadata
-                    eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null;
-                    lastMod = header.lastModified();
-
-                    // if the robots.txt file was not changed we break here
-                    if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
-                        if (log.isDebugEnabled()) log.debug("Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
-                        return null;
-                    }
-
-                    downloadEnd = System.currentTimeMillis();
-                    if (log.isDebugEnabled()) log.debug("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms.");
-                }
-            } else if (code == 304) {
-                return null;
-            } else if (code > 299 && code < 400) {
-                // getting redirection URL
-                String redirectionUrlString = header.get(HeaderFramework.LOCATION);
-                if (redirectionUrlString==null) {
-                    if (log.isDebugEnabled())
-                        log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + client.getHttpResponse().getStatusLine() + "].");
-                    robotsTxt = null;
-                } else {
-
-                    redirectionUrlString = redirectionUrlString.trim();
-
-                    // generating the new URL object
-                    final MultiProtocolURI redirectionUrl = MultiProtocolURI.newURL(robotsURL, redirectionUrlString);
-
-                    // following the redirection
-                    if (log.isDebugEnabled()) log.debug("Redirection detected for robots.txt with URL '" + robotsURL + "'." +
-                            "\nRedirecting request to: " + redirectionUrl);
-                    return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);
-                }
-            } else if (code == 401 || code == 403) {
-                accessCompletelyRestricted = true;
-                log.info("Access to Robots.txt not allowed on URL '" + robotsURL + "', redirectionCount = " + redirectionCount); // since this is a strange case we log it all the time
-            } else {
-                if (log.isDebugEnabled())
-                    log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "].");
-                robotsTxt = null;
-            }
-        } catch (final Exception e) {
-            throw e;
-        }
-        return new Object[]{Boolean.valueOf(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
-    }
-
-    public final static void main(final String[] args) throws Exception {
-
-        final String url = "http://www.badelatschen.net/robots.txt";
-        final Object[] o = downloadRobotsTxt(new MultiProtocolURI(url), 0, null);
-        if (o == null) {
-            System.out.println("result: null");
-        } else {
-            System.out.println("not allowed = " + ((Boolean) o[0]).toString());
-            System.out.println("robots = " + ((o[1] == null) ? "null" : UTF8.String((byte[]) o[1])));
-        }
-        System.exit(0);
-/*
-        final HttpClient httpclient = new DefaultHttpClient();
-        try {
-            final HttpGet httpget = new HttpGet(url);
-            final ResponseHandler responseHandler = new BasicResponseHandler();
-            final String responseBody = httpclient.execute(httpget, responseHandler);
-            System.out.println(responseBody);
-        } finally {
-            httpclient.getConnectionManager().shutdown();
-        }
- */
-    }
-
 }
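The RobotsTxt diff above drops the hand-rolled downloadRobotsTxt()/main() code and fetches robots.txt through the LoaderDispatcher; HTTP status and headers are then read from the returned Response. A tiny self-contained sketch of the two response checks the new code performs, with a plain map standing in for ResponseHeader and all names invented:

    import java.util.HashMap;
    import java.util.Map;

    public class RobotsResponseSketch {

        static boolean accessRestricted(final int statusCode) {
            // a 401/403 answer for robots.txt itself leads to a virtual deny-all entry for the host
            return statusCode == 401 || statusCode == 403;
        }

        static String etagOf(final Map<String, String> responseHeader) {
            // the ETag is copied from the response header into the robots DB entry, trimmed
            final String etag = responseHeader.get("ETag");
            return etag == null ? null : etag.trim();
        }

        public static void main(final String[] args) {
            final Map<String, String> header = new HashMap<String, String>();
            header.put("ETag", " \"abc123\" ");
            System.out.println(accessRestricted(403));   // true -> whole host treated as disallowed
            System.out.println(etagOf(header));          // "abc123"
        }
    }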
diff --git a/source/net/yacy/crawler/robots/RobotsTxtParser.java b/source/net/yacy/crawler/robots/RobotsTxtParser.java
index 5f2368e5c..b002c30cc 100644
--- a/source/net/yacy/crawler/robots/RobotsTxtParser.java
+++ b/source/net/yacy/crawler/robots/RobotsTxtParser.java
@@ -35,15 +35,11 @@ import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.net.MalformedURLException;
 import java.util.ArrayList;
-import java.util.HashSet;
 import java.util.Set;
 import java.util.regex.Pattern;
 
-import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.document.UTF8;
-import net.yacy.kelondro.data.meta.DigestURI;
 
 /*
  * A class for Parsing robots.txt files.
@@ -100,35 +96,6 @@ public final class RobotsTxtParser {
         }
     }
 
-    public static RobotsTxtParser getRobots(String homepage) {
-        DigestURI theURL;
-        try {
-            theURL = new DigestURI(homepage);
-        } catch (MalformedURLException e1) {
-            return null;
-        }
-
-        final String urlHostPort = RobotsTxt.getHostPort(theURL);
-        MultiProtocolURI robotsURL = null;
-        try {
-            robotsURL = new MultiProtocolURI("http://" + urlHostPort + "/robots.txt");
-        } catch (final MalformedURLException e) {
-            return null;
-        }
-
-        Object[] result;
-        try {
-            result = RobotsTxt.downloadRobotsTxt(robotsURL, 0, null);
-        } catch (Exception e) {
-            return null;
-        }
-
-        final byte[] robotsTxt = (byte[]) result[RobotsTxt.DOWNLOAD_ROBOTS_TXT];
-        RobotsTxtParser parserResult = new RobotsTxtParser(new HashSet(), robotsTxt);
-        return parserResult;
-    }
-
     private void parse(final BufferedReader reader) {
         final ArrayList deny4AllAgents = new ArrayList();
         final ArrayList deny4ThisAgents = new ArrayList();
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 32a4b8e51..83c860682 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -194,8 +194,8 @@ public final class LoaderDispatcher {
         }
 
         // check if we have the page in the cache
-        final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
-        if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
+        final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
+        if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
             // we have passed a first test if caching is allowed
             // now see if there is a cache entry
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 67fc3d5af..9c2f06b17 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -596,11 +596,6 @@ public final class Switchboard extends serverSwitch
         }.start();
         */
 
-        // load the robots.txt db
-        this.log.logConfig("Initializing robots.txt DB");
-        this.robots = new RobotsTxt(this.tables);
-        this.log.logConfig("Loaded robots.txt DB: " + this.robots.size() + " entries");
-
         // start a cache manager
         this.log.logConfig("Starting HT Cache Manager");
 
@@ -718,6 +713,13 @@ public final class Switchboard extends serverSwitch
         // start a loader
         this.log.logConfig("Starting Crawl Loader");
         this.loader = new LoaderDispatcher(this);
+
+        // load the robots.txt db
+        this.log.logConfig("Initializing robots.txt DB");
+        this.robots = new RobotsTxt(this.tables, this.loader);
+        this.log.logConfig("Loaded robots.txt DB: " + this.robots.size() + " entries");
+
+        // load oai tables
         final Map oaiFriends = OAIListFriendsLoader.loadListFriendsSources(
                 new File("defaults/oaiListFriendsSource.xml"),
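The Switchboard hunks above reorder start-up: the robots.txt DB is now initialized after the crawl loader, because RobotsTxt receives the LoaderDispatcher in its constructor. A minimal stand-in sketch of that dependency (invented classes, not YaCy code):

    public class StartupOrderSketch {

        static class LoaderStandIn { }                      // stands in for LoaderDispatcher

        static class RobotsTxtStandIn {                     // stands in for RobotsTxt
            RobotsTxtStandIn(final LoaderStandIn loader) {
                if (loader == null) throw new IllegalArgumentException("crawl loader must be started first");
            }
        }

        public static void main(final String[] args) {
            final LoaderStandIn loader = new LoaderStandIn();              // "Starting Crawl Loader"
            final RobotsTxtStandIn robots = new RobotsTxtStandIn(loader);  // "Initializing robots.txt DB" now follows
            System.out.println("robots.txt DB wired to the crawl loader: " + (robots != null));
        }
    }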