also delete the robots.txt file from the cache when a new crawl is started

pull/1/head
Michael Peter Christen 11 years ago
parent 1c21b3256d
commit 9e503b3376

@@ -43,8 +43,10 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.CrawlSwitchboard;
+import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.retrieval.SitemapImporter;
+import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.data.WorkTables;
 import net.yacy.document.Document;
 import net.yacy.document.parser.html.ContentScraper;
@@ -218,7 +220,10 @@ public class Crawler_p {
         if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
         // delete old robots entries
-        for (DigestURL ru: rootURLs) sb.robots.delete(ru);
+        for (DigestURL ru: rootURLs) {
+            sb.robots.delete(ru);
+            try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
+        }
         // set the crawl filter
         String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
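
For orientation, the eviction added above boils down to the sketch below. It only reuses names visible in this diff (DigestURL, Cache, RobotsTxt, getHostPort, robotsURL); the helper name evictCachedRobots and the surrounding crawl-start context are illustrative assumptions, not part of the commit:

    // Sketch, not the committed code: evict the cached robots.txt body for one root URL
    // so that a restarted crawl re-fetches a fresh copy.
    static void evictCachedRobots(final DigestURL rootURL) {
        try {
            // RobotsTxt.robotsURL(...) builds "http(s)://host:port/robots.txt" for the host of rootURL,
            // and Cache.delete(hash) removes the cached copy stored under that URL's hash.
            Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(rootURL)).hash());
        } catch (final IOException e) {
            // ignored, as in the commit: a missing cache entry is fine at crawl start
        }
        // Note: robotsURL(...) can return null for a malformed host:port; neither this
        // sketch nor the committed loop guards against that case.
    }

Together with the existing sb.robots.delete(ru), this removes both the parsed robots entry and the raw cached robots.txt file, so a new crawl cannot reuse a stale copy from either store.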

@@ -155,13 +155,7 @@ public class RobotsTxt {
             }
             // generating the proper url to download the robots txt
-            DigestURL robotsURL = null;
-            try {
-                robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
-            } catch (final MalformedURLException e) {
-                log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
-                robotsURL = null;
-            }
+            DigestURL robotsURL = robotsURL(urlHostPort);
             Response response = null;
             if (robotsURL != null) {
@@ -230,14 +224,8 @@ public class RobotsTxt {
             if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
             // generating the proper url to download the robots txt
-            DigestURL robotsURL = null;
-            try {
-                robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
-            } catch (final MalformedURLException e) {
-                log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
-                robotsURL = null;
-            }
+            DigestURL robotsURL = robotsURL(urlHostPort);
             Response response = null;
             if (robotsURL != null) {
                 if (log.isFine()) log.fine("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
@@ -332,7 +320,7 @@ public class RobotsTxt {
            }
        }
-    static final String getHostPort(final MultiProtocolURL theURL) {
+    public static final String getHostPort(final MultiProtocolURL theURL) {
        int port = theURL.getPort();
        if (port == -1) {
            if (theURL.getProtocol().equalsIgnoreCase("http")) {
@@ -349,7 +337,18 @@ public class RobotsTxt {
         sb.append(host).append(':').append(Integer.toString(port));
         return sb.toString();
     }
+
+    public static DigestURL robotsURL(final String urlHostPort) {
+        DigestURL robotsURL = null;
+        try {
+            robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
+        } catch (final MalformedURLException e) {
+            log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
+            robotsURL = null;
+        }
+        return robotsURL;
+    }
+
     public static class CheckEntry {
         public final DigestURL digestURL;
         public final RobotsTxtEntry robotsTxtEntry;
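
The extracted helper is now public, so the Crawler_p servlet can build the same robots.txt URL that the fetcher uses. A rough usage sketch, assuming only the ":443" switch shown above selects the protocol (host:port values are illustrative):

    // Sketch: expected results of the new helper for hypothetical host:port strings.
    DigestURL plain  = RobotsTxt.robotsURL("example.org:80");   // http://example.org:80/robots.txt
    DigestURL secure = RobotsTxt.robotsURL("example.org:443");  // https://example.org:443/robots.txt
    // On a malformed host:port the helper logs the error and returns null,
    // which is why the callers inside RobotsTxt guard with "if (robotsURL != null)".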
