From 001e05bb80bc00ebd85cde344fd4b931b25df182 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 1 Aug 2014 12:15:14 +0200 Subject: [PATCH] do not store failure of loading of robots.txt into the index as a fail document --- source/net/yacy/crawler/robots/RobotsTxt.java | 7 ++++++- source/net/yacy/search/index/ErrorCache.java | 13 +++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java index 06d3a536b..4e88dd6eb 100644 --- a/source/net/yacy/crawler/robots/RobotsTxt.java +++ b/source/net/yacy/crawler/robots/RobotsTxt.java @@ -56,6 +56,7 @@ public class RobotsTxt { private final static ConcurrentLog log = new ConcurrentLog(RobotsTxt.class.getName()); + protected static final String ROBOTS_TXT_PATH = "/robots.txt"; protected static final String ROBOTS_DB_PATH_SEPARATOR = ";"; protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR); @@ -338,6 +339,10 @@ public class RobotsTxt { return sb.toString(); } + public static boolean isRobotsURL(MultiProtocolURL url) { + return url.getPath().equals(ROBOTS_TXT_PATH); + } + /** * generate a robots.txt url. * @param urlHostPort a string of the form <host>':'<port> or just <host> @@ -347,7 +352,7 @@ public class RobotsTxt { if (urlHostPort.endsWith(":80")) urlHostPort = urlHostPort.substring(0, urlHostPort.length() - 3); DigestURL robotsURL = null; try { - robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt"); + robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? 
"https://" : "http://") + urlHostPort + ROBOTS_TXT_PATH); } catch (final MalformedURLException e) { log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e); robotsURL = null; diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 85235123a..2611ef564 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -41,6 +41,7 @@ import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; +import net.yacy.crawler.robots.RobotsTxt; import net.yacy.search.index.Fulltext; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; @@ -109,7 +110,7 @@ public class ErrorCache { url, profile == null ? null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode, crawldepth); - if (this.fulltext.getDefaultConnector() != null && failCategory.store) { + if (this.fulltext.getDefaultConnector() != null && failCategory.store && !RobotsTxt.isRobotsURL(url)) { // send the error to solr try { // do not overwrite error reports with error reports @@ -123,13 +124,9 @@ public class ErrorCache { } catch (final IOException e) { ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage()); } - synchronized (this.cache) { - this.cache.put(ASCII.String(url.hash()), null); - } - } else { - synchronized (this.cache) { - this.cache.put(ASCII.String(url.hash()), failDoc); - } + } + synchronized (this.cache) { + this.cache.put(ASCII.String(url.hash()), failDoc); } checkStackSize(); }