From 6bd8c6f195b9e2f1ed0b073d45ec316dd9e37ecc Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Thu, 10 Apr 2014 09:08:59 +0200
Subject: [PATCH] fix for wrong status codes of error pages

---
 htroot/Crawler_p.java                            |  1 +
 source/net/yacy/crawler/data/CrawlQueues.java    | 18 ++++++++++++------
 .../net/yacy/crawler/retrieval/HTTPLoader.java   | 16 ++++++++--------
 .../net/yacy/repository/LoaderDispatcher.java    | 17 +++++++++--------
 source/net/yacy/search/index/ErrorCache.java     | 10 ++++++++--
 5 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index d3228267f..1253cc502 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -224,6 +224,7 @@ public class Crawler_p {
                 sb.robots.delete(ru);
                 try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
             }
+            try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
 
             // set the crawl filter
             String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 42583cfa1..e9431a016 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -639,7 +639,7 @@ public class CrawlQueues {
             } else {
                 // starting a load from the internet
                 request.setStatus("worker-loading", WorkflowJob.STATUS_RUNNING);
-                String result = null;
+                String error = null;
 
                 // load a resource and push queue entry to switchboard queue
                 // returns null if everything went fine, a fail reason string if a problem occurred
@@ -651,23 +651,29 @@ public class CrawlQueues {
                         if (CrawlQueues.log.isFine()) {
                             CrawlQueues.log.fine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
                         }
-                        result = "no content (possibly caused by cache policy)";
+                        error = "no content (possibly caused by cache policy)";
                     } else {
                         request.setStatus("loaded", WorkflowJob.STATUS_RUNNING);
                         final String storedFailMessage = CrawlQueues.this.sb.toIndexer(response);
                         request.setStatus("enqueued-" + ((storedFailMessage == null) ? "ok" : "fail"), WorkflowJob.STATUS_FINISHED);
-                        result = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
+                        error = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
                     }
                 } catch (final IOException e) {
                     request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                     if (CrawlQueues.log.isFine()) {
                         CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
                     }
-                    result = "load error - " + e.getMessage();
+                    error = "load error - " + e.getMessage();
                 }
 
-                if (result != null) {
-                    CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
+                if (error != null) {
+                    if (error.endsWith("$")) {
+                        // the "$" mark at the end of the error message means, that the error was already pushed to the error-db by the reporting method
+                        // thus we only push this message if we don't have that mark
+                        error = error.substring(0, error.length() - 1).trim();
+                    } else {
+                        CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + error, -1);
+                    }
                     request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
                 } else {
                     request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index 1f362bcbb..f132c5c1d 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -78,7 +78,7 @@ public final class HTTPLoader {
 
         if (retryCount < 0) {
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
-            throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
+            throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
         }
 
         DigestURL url = request.url();
@@ -94,7 +94,7 @@ public final class HTTPLoader {
         final String hostlow = host.toLowerCase();
         if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
-            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
+            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
 
         // resolve yacy and yacyh domains
@@ -141,7 +141,7 @@ public final class HTTPLoader {
 
             if (redirectionUrlString.isEmpty()) {
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
-                throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             }
 
             // normalize URL
@@ -161,7 +161,7 @@ public final class HTTPLoader {
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
                     this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
-                    throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
+                    throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
                 }
 
                 // retry crawling with new url
@@ -170,11 +170,11 @@ public final class HTTPLoader {
             }
             // we don't want to follow redirects
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
-            throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+            throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         } else if (responseBody == null) {
             // no response, reject file
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
-            throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+            throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         } else if (statusCode == 200 || statusCode == 203) {
 
             // the transfer is ok
@@ -185,7 +185,7 @@ public final class HTTPLoader {
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize >= 0 && contentLength > maxFileSize) {
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
-                throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
+                throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
             }
 
             // create a new cache entry
@@ -202,7 +202,7 @@ public final class HTTPLoader {
         } else {
             // if the response has not the right response type then reject file
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
-            throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+            throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         }
     }
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index f31c6e551..062e2b4bc 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -163,12 +163,13 @@ public final class LoaderDispatcher {
             check = this.loaderSteering.remove(request.url());
             if (check != null) check.release(1000);
             return response;
-        } catch (final IOException e) {
+        } catch (final IOException e) {
+            throw new IOException(e);
+        } finally {
             // release the semaphore anyway
             check = this.loaderSteering.remove(request.url());
-            if (check != null) check.release(1000);
-            // Very noisy: ConcurrentLog.logException(e);
-            throw new IOException(e);
+            if (check != null) check.release(1000);
+            // Very noisy: ConcurrentLog.logException(e);
         }
     }
 
@@ -190,7 +191,7 @@ public final class LoaderDispatcher {
         // check if url is in blacklist
         if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
             this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
-            throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
+            throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
 
         // check if we have the page in the cache
@@ -244,13 +245,13 @@ public final class LoaderDispatcher {
             }
         }
 
-        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
+        // check case where we want results from the cache exclusively, and never from the Internet (offline mode)
         if (cacheStrategy == CacheStrategy.CACHEONLY) {
             // we had a chance to get the content from the cache .. its over. We don't have it.
             throw new IOException("cache only strategy");
         }
 
-        // now forget about the cache, nothing there. Try to load the content from the internet
+        // now forget about the cache, nothing there. Try to load the content from the Internet
 
         // check access time: this is a double-check (we checked possibly already in the balancer)
         // to make sure that we don't DoS the target by mistake
@@ -302,7 +303,7 @@ public final class LoaderDispatcher {
             // no caching wanted. Thats ok, do not write any message
             return response;
         }
-        // second check tells us if the protocoll tells us something about caching
+        // second check tells us if the protocol tells us something about caching
         final String storeError = response.shallStoreCacheForCrawler();
         if (storeError == null) {
             try {
diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java
index 78638f014..a6eee9b2f 100644
--- a/source/net/yacy/search/index/ErrorCache.java
+++ b/source/net/yacy/search/index/ErrorCache.java
@@ -114,8 +114,14 @@ public class ErrorCache {
         if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
             // send the error to solr
             try {
-                SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
-                this.fulltext.getDefaultConnector().add(errorDoc);
+                // do not overwrite error reports with error reports
+                SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName());
+                if (olddoc == null ||
+                    olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null ||
+                    ((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) {
+                    SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
+                    this.fulltext.getDefaultConnector().add(errorDoc);
+                }
             } catch (final IOException e) {
                 ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
             }
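
Note on the "$" convention introduced by this patch: HTTPLoader and LoaderDispatcher now push the failure to the error-db themselves, together with the real HTTP status code, and append "$" to the exception message; CrawlQueues then strips the mark instead of pushing a second, status-less (-1) report that would overwrite the correct one. A minimal runnable sketch of the convention follows; recordAndFail, handleLoadError and the Map standing in for the error-db are hypothetical stand-ins, not YaCy API:

    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;

    // Hypothetical stand-in for the ErrorCache/CrawlQueues interplay.
    public class ErrorMarkSketch {

        // maps URL -> recorded status code; stands in for the error-db
        static final Map<String, Integer> errorDb = new HashMap<>();

        // reporting method: pushes the real status code itself, then marks
        // the message with '$' so callers know the push already happened
        static void recordAndFail(String url, int statusCode, String reason) throws IOException {
            errorDb.put(url, statusCode);
            throw new IOException(reason + "$");
        }

        // consumer side, mirroring the CrawlQueues.load() logic above
        static void handleLoadError(String url, String error) {
            if (error.endsWith("$")) {
                // already pushed by the reporting method; just strip the mark
                error = error.substring(0, error.length() - 1).trim();
            } else {
                // nobody recorded this failure yet; push with unknown status (-1)
                errorDb.put(url, -1);
            }
            System.out.println("worker-error: " + error);
        }

        public static void main(String[] args) {
            try {
                recordAndFail("http://example.com/", 403, "REJECTED WRONG STATUS TYPE");
            } catch (IOException e) {
                handleLoadError("http://example.com/", "load error - " + e.getMessage());
            }
            System.out.println(errorDb.get("http://example.com/")); // 403, not -1
        }
    }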
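The ErrorCache hunk adds the complementary guard on the Solr side: a fail document is only written when there is no previous document for the URL hash, when the stored httpstatus_i value is missing, or when it is 200 (a regular page that may legitimately be replaced by an error report). A pure-logic sketch of that guard, where a Map stands in for the Solr index (it cannot model a document whose status field is null, which the real code also treats as writable):

    import java.util.HashMap;
    import java.util.Map;

    // Sketch of the guard added in ErrorCache: an error document may be
    // written only when it does not overwrite an existing error report.
    // The Map stands in for Solr (url hash -> stored httpstatus_i).
    public class ErrorCacheGuardSketch {

        static boolean shouldWriteErrorDoc(Map<String, Integer> index, String urlHash) {
            Integer oldStatus = index.get(urlHash); // like getDocumentById(...)
            // write if: no old document, or a healthy 200 page that may be replaced
            return oldStatus == null || oldStatus == 200;
        }

        public static void main(String[] args) {
            Map<String, Integer> index = new HashMap<>();
            index.put("hashA", 200);  // regular document
            index.put("hashB", 404);  // existing error report

            System.out.println(shouldWriteErrorDoc(index, "hashA")); // true: 200 may be replaced
            System.out.println(shouldWriteErrorDoc(index, "hashB")); // false: keep the first error
            System.out.println(shouldWriteErrorDoc(index, "hashC")); // true: nothing stored yet
        }
    }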
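Independently of the status-code fix, the LoaderDispatcher hunk appears to close a latent leak: the steering semaphore used to be released only on the success path and inside the IOException handler, so any other throwable would have left it acquired; moving the release into a finally block covers every exit path. A small sketch of the pattern, where loadWithSteering and doLoad are hypothetical stand-ins for the real loader:

    import java.io.IOException;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;
    import java.util.concurrent.Semaphore;

    // Sketch of the release-in-finally pattern from the LoaderDispatcher hunk.
    public class SteeringSketch {

        static final ConcurrentMap<String, Semaphore> loaderSteering = new ConcurrentHashMap<>();

        static String loadWithSteering(String url) throws IOException {
            loaderSteering.put(url, new Semaphore(0));
            try {
                return doLoad(url);
            } catch (IOException e) {
                throw new IOException(e);
            } finally {
                // release the semaphore on every exit path: success,
                // IOException, or any unchecked throwable
                Semaphore check = loaderSteering.remove(url);
                if (check != null) check.release(1000);
            }
        }

        static String doLoad(String url) throws IOException {
            throw new IOException("simulated network failure for " + url);
        }

        public static void main(String[] args) {
            try {
                loadWithSteering("http://example.com/");
            } catch (IOException e) {
                // even on failure, the steering entry is gone
                System.out.println("steering entries left: " + loaderSteering.size()); // 0
            }
        }
    }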