diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 97aa39c85..668599b8a 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -217,6 +217,9 @@ public class Crawler_p {
         if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
         if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
+        // delete old robots entries
+        for (DigestURL ru: rootURLs) sb.robots.delete(ru);
+        
         // set the crawl filter
         String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
         final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
 
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index 45bca6431..e648f68f2 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -185,7 +185,31 @@ public class RobotsTxt {
         return robotsTxt4Host;
     }
-    
+
+    /**
+     * Remove the cached robots.txt entry for the host:port of the given URL,
+     * forcing a fresh robots.txt fetch on the next lookup for that host.
+     * Does nothing if no host:port can be derived from the URL.
+     * @param theURL any URL of the host whose cached robots.txt entry shall be dropped
+     */
+    public void delete(final MultiProtocolURL theURL) {
+        final String urlHostPort = getHostPort(theURL);
+        if (urlHostPort == null) return;
+        final BEncodedHeap robotsTable;
+        try {
+            robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
+        } catch (final IOException e1) {
+            log.severe("tables not available", e1);
+            return;
+        }
+        if (robotsTable == null) return;
+        try {
+            robotsTable.delete(robotsTable.encodedKey(urlHostPort));
+        } catch (final IOException e) {
+            // best effort: a failed delete only leaves a stale cache entry, but do not swallow silently
+            log.warn("cannot delete robots.txt entry for " + urlHostPort, e);
+        }
+    }
+
     public void ensureExist(final MultiProtocolURL theURL, final ClientIdentification.Agent agent, boolean concurrent) {
         if (theURL.isLocal()) return;
         final String urlHostPort = getHostPort(theURL);
 
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 2e6e09650..a4812096c 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -1492,7 +1492,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             Collection c = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName());
             if (c != null) for (Object cn: c) if (cn != null) this.collections.put((String) cn, QueryParams.catchall_pattern);
             this.failReason = (String) doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
-            this.failType = FailType.valueOf((String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()));
+            String fts = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
+            if (fts == null) ConcurrentLog.warn("CollectionConfiguration", "no fail type given for URL " + this.digestURL.toNormalform(true));
+            this.failType = fts == null ? FailType.fail : FailType.valueOf(fts);
             this.httpstatus = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName());
             this.failtime = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
         }