From c1440d22415546b998411067b77d111214afc3c2 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 2 Oct 2007 22:40:53 +0000 Subject: [PATCH] fixed problem with redirection: redirect target URLs were not being checked against the already-indexed URLs (double-check); see also: http://forum.yacy-websuche.de/viewtopic.php?f=6&t=348 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4126 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/crawler/http/CrawlWorker.java | 11 ++++++++--- source/de/anomic/plasma/plasmaCrawlEURL.java | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index d50f41495..d02d044cd 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -331,9 +331,14 @@ public final class CrawlWorker extends AbstractCrawlWorker { // generating url hash String urlhash = redirectionUrl.hash(); - // removing url from loader queue - plasmaCrawlLoader.switchboard.noticeURL.removeByURLHash(urlhash); - + // check if the url was already indexed + String dbname = plasmaCrawlLoader.switchboard.urlExists(urlhash); + if (dbname != null) { + this.log.logWarning("CRAWLER Redirection of URL=" + this.url.toString() + " ignored. 
The url appears already in db " + dbname); + addURLtoErrorDB(plasmaCrawlEURL.DENIED_REDIRECTION_TO_DOUBLE_CONTENT); + return null; + } + // retry crawling with new url this.url = redirectionUrl; plasmaHTCache.Entry redirectedEntry = load(crawlingRetryCount-1); diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 34518df4e..e9763e65a 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -55,6 +55,7 @@ public class plasmaCrawlEURL { public static final String DENIED_UNSUPPORTED_CHARSET = "denied_(unsupported_charset)"; public static final String DENIED_REDIRECTION_HEADER_EMPTY = "denied_(redirection_header_empty)"; public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)"; + public static final String DENIED_REDIRECTION_TO_DOUBLE_CONTENT = "denied_(redirection_to_double_content)"; public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_"; public static final String DENIED_CONTENT_DECODING_ERROR = "denied_(content_decoding_error)"; public static final String DENIED_FILESIZE_LIMIT_EXCEEDED = "denied_(filesize_limit_exceeded)";