diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 668599b8a..d3228267f 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -43,8 +43,10 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.CrawlSwitchboard;
+import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.retrieval.SitemapImporter;
+import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.data.WorkTables;
 import net.yacy.document.Document;
 import net.yacy.document.parser.html.ContentScraper;
@@ -218,7 +220,10 @@ public class Crawler_p {
             if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
 
             // delete old robots entries
-            for (DigestURL ru: rootURLs) sb.robots.delete(ru);
+            for (DigestURL ru: rootURLs) {
+                sb.robots.delete(ru);
+                try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
+            }
 
             // set the crawl filter
             String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index e648f68f2..e2acd8b81 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -155,13 +155,7 @@ public class RobotsTxt {
             }
 
             // generating the proper url to download the robots txt
-            DigestURL robotsURL = null;
-            try {
-                robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
-            } catch (final MalformedURLException e) {
-                log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
-                robotsURL = null;
-            }
+            DigestURL robotsURL = robotsURL(urlHostPort);
 
             Response response = null;
             if (robotsURL != null) {
@@ -230,14 +224,8 @@ public class RobotsTxt {
             if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
 
             // generating the proper url to download the robots txt
-            DigestURL robotsURL = null;
-            try {
-                robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
-            } catch (final MalformedURLException e) {
-                log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
-                robotsURL = null;
-            }
-
+            DigestURL robotsURL = robotsURL(urlHostPort);
+
             Response response = null;
             if (robotsURL != null) {
                 if (log.isFine()) log.fine("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
@@ -332,7 +320,7 @@ public class RobotsTxt {
         }
     }
 
-    static final String getHostPort(final MultiProtocolURL theURL) {
+    public static final String getHostPort(final MultiProtocolURL theURL) {
         int port = theURL.getPort();
         if (port == -1) {
             if (theURL.getProtocol().equalsIgnoreCase("http")) {
@@ -349,7 +337,18 @@ public class RobotsTxt {
         sb.append(host).append(':').append(Integer.toString(port));
         return sb.toString();
     }
-
+
+    public static DigestURL robotsURL(final String urlHostPort) {
+        DigestURL robotsURL = null;
+        try {
+            robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
+        } catch (final MalformedURLException e) {
+            log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
+            robotsURL = null;
+        }
+        return robotsURL;
+    }
+
     public static class CheckEntry {
         public final DigestURL digestURL;
         public final RobotsTxtEntry robotsTxtEntry;
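
The sketch below is not part of the patch; it only illustrates how the now-public RobotsTxt.robotsURL(...) and RobotsTxt.getHostPort(...) helpers combine with Cache.delete(...) to purge a stale robots.txt, mirroring the loop added to Crawler_p.java above. The class name RobotsCachePurge and the method purge(...) are hypothetical, the DigestURL import path is assumed from the surrounding YaCy code, and the explicit null check is an extra precaution because robotsURL(...) returns null when the host:port cannot form a valid URL.

import java.io.IOException;

import net.yacy.cora.document.id.DigestURL;   // assumed import path for DigestURL
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.robots.RobotsTxt;

// Hypothetical helper, not part of the patch: clear both the parsed robots
// entry and the cached raw robots.txt for one crawl root URL.
public final class RobotsCachePurge {

    public static void purge(final RobotsTxt robots, final DigestURL rootURL) {
        // remove the parsed robots.txt record for this host (as Crawler_p does per root URL)
        robots.delete(rootURL);

        // rebuild the robots.txt URL for the host:port of the root URL
        final DigestURL robotsURL = RobotsTxt.robotsURL(RobotsTxt.getHostPort(rootURL));

        // robotsURL(...) returns null if the host:port cannot form a valid URL
        if (robotsURL == null) return;

        try {
            // drop the raw robots.txt document from the HTTP cache so it is re-fetched
            Cache.delete(robotsURL.hash());
        } catch (final IOException e) {
            // a stale cache entry is not fatal; ignore and continue
        }
    }
}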