do not store failure of loading of robots.txt into the index as a fail document
pull/1/head
Michael Peter Christen 11 years ago
parent 05d58e4df0
commit 001e05bb80

@@ -56,6 +56,7 @@ public class RobotsTxt {
     private final static ConcurrentLog log = new ConcurrentLog(RobotsTxt.class.getName());
+    protected static final String ROBOTS_TXT_PATH = "/robots.txt";
     protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
     protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
@@ -338,6 +339,10 @@ public class RobotsTxt {
         return sb.toString();
     }
 
+    public static boolean isRobotsURL(MultiProtocolURL url) {
+        return url.getPath().equals(ROBOTS_TXT_PATH);
+    }
+
     /**
      * generate a robots.txt url.
      * @param urlHostPort a string of the form <host>':'<port> or just <host>
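The new helper simply compares the URL path with the ROBOTS_TXT_PATH constant. A rough usage sketch follows (not part of the commit: the host name is made up, and the DigestURL import path is assumed; DigestURL is the MultiProtocolURL subclass already constructed elsewhere in this class):

// Usage sketch only; example.org and the DigestURL package path are assumptions.
import java.net.MalformedURLException;
import net.yacy.cora.document.id.DigestURL;   // assumed import path
import net.yacy.crawler.robots.RobotsTxt;

public class IsRobotsURLSketch {
    public static void main(String[] args) throws MalformedURLException {
        // true: the path is exactly "/robots.txt"
        System.out.println(RobotsTxt.isRobotsURL(new DigestURL("http://example.org/robots.txt")));
        // false: any other path, even one that merely contains "robots.txt"
        System.out.println(RobotsTxt.isRobotsURL(new DigestURL("http://example.org/doc/robots.txt.html")));
    }
}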
@@ -347,7 +352,7 @@ public class RobotsTxt {
         if (urlHostPort.endsWith(":80")) urlHostPort = urlHostPort.substring(0, urlHostPort.length() - 3);
         DigestURL robotsURL = null;
         try {
-            robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
+            robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + ROBOTS_TXT_PATH);
         } catch (final MalformedURLException e) {
             log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
             robotsURL = null;
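For reference, the lines visible in this hunk strip an explicit default ':80' port and pick https for an explicit ':443' before appending the robots.txt path. A standalone sketch of that normalization, using a hypothetical helper name rather than the YaCy method itself, could look like this:

// Hypothetical helper mirroring only the lines visible above; not YaCy code itself.
public class RobotsURLSketch {
    static String robotsURLString(String urlHostPort) {
        // drop an explicit default http port, then choose the scheme from an explicit :443
        if (urlHostPort.endsWith(":80")) urlHostPort = urlHostPort.substring(0, urlHostPort.length() - 3);
        return (urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt";
    }

    public static void main(String[] args) {
        System.out.println(robotsURLString("example.org:80"));   // http://example.org/robots.txt
        System.out.println(robotsURLString("example.org:443"));  // https://example.org:443/robots.txt
        System.out.println(robotsURLString("example.org"));      // http://example.org/robots.txt
    }
}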

@@ -41,6 +41,7 @@ import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.search.schema.CollectionSchema;
@@ -109,7 +110,7 @@ public class ErrorCache {
                 url, profile == null ? null : profile.collections(),
                 failCategory.name() + " " + reason, failCategory.failType,
                 httpcode, crawldepth);
-        if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
+        if (this.fulltext.getDefaultConnector() != null && failCategory.store && !RobotsTxt.isRobotsURL(url)) {
             // send the error to solr
             try {
                 // do not overwrite error reports with error reports
@@ -123,13 +124,9 @@ public class ErrorCache {
             } catch (final IOException e) {
                 ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
             }
-            synchronized (this.cache) {
-                this.cache.put(ASCII.String(url.hash()), null);
-            }
-        } else {
-            synchronized (this.cache) {
-                this.cache.put(ASCII.String(url.hash()), failDoc);
-            }
-        }
+        }
+        synchronized (this.cache) {
+            this.cache.put(ASCII.String(url.hash()), failDoc);
+        }
         checkStackSize();
     }
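Taken together, the ErrorCache changes mean a robots.txt fetch failure is still remembered in the in-memory cache, but it is no longer persisted as a fail document in the Solr index. A condensed sketch of the resulting flow, with the Solr write abbreviated, is roughly:

// Condensed sketch only, not the literal method body; the Solr write is abbreviated.
if (this.fulltext.getDefaultConnector() != null
        && failCategory.store
        && !RobotsTxt.isRobotsURL(url)) {
    // ... build the error document and send it to Solr, logging any IOException ...
}
// regardless of the branch above, the failure document stays in the in-memory cache
synchronized (this.cache) {
    this.cache.put(ASCII.String(url.hash()), failDoc);
}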
