From d47afe6fabc707919620f478334150b673933482 Mon Sep 17 00:00:00 2001 From: luccioman Date: Sat, 13 Jan 2018 10:45:00 +0100 Subject: [PATCH] Use a constant for crawler reject reason prefix with specific processing --- source/net/yacy/crawler/CrawlStacker.java | 9 ++++++--- source/net/yacy/search/Switchboard.java | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 5b66958ba..e0b293c7b 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -66,6 +66,9 @@ public final class CrawlStacker { public static String ERROR_NO_MATCH_MUST_MATCH_FILTER = "url does not match must-match filter "; public static String ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER = "url matches must-not-match filter "; + /** Prefix of the crawl reject reasons reporting a doubly registered URL: reject reasons starting with it are not pushed to the error URL db */ + public static final String CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX = "double in"; + private final static ConcurrentLog log = new ConcurrentLog("STACKCRAWL"); private final RobotsTxt robots; @@ -135,7 +138,7 @@ public final class CrawlStacker { final String rejectReason = stackCrawl(entry); // if the url was rejected we store it into the error URL db - if (rejectReason != null && !rejectReason.startsWith("double in")) { + if (rejectReason != null && !rejectReason.startsWith(CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX)) { final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle())); this.nextQueue.errorURL.push(entry.url(), entry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); } @@ -411,7 +414,7 @@ public final class CrawlStacker { // check if the url is double registered final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists if (dbocc != null) { - return "double in: " + dbocc.name(); + return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": " + dbocc.name(); } String urlhash = 
ASCII.String(url.hash()); LoadTimeURL oldEntry = null; @@ -452,7 +455,7 @@ public final class CrawlStacker { CrawlStacker.log.fine("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + ((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago."); } else { - return "double in: local index, oldDate = " + ISO8601Formatter.FORMATTER.format(new Date(oldDate)); + return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": local index, oldDate = " + ISO8601Formatter.FORMATTER.format(new Date(oldDate)); } return null; diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 8a005a772..be4b0515c 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2971,7 +2971,7 @@ public final class Switchboard extends serverSwitch { newDocs.add(doc); } else { // we consider this as fail urls to have a tracking of the problem - if (rejectReason != null && !rejectReason.startsWith("double in")) { + if (rejectReason != null && !rejectReason.startsWith(CrawlStacker.CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX)) { this.crawlStacker.nextQueue.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); } }