Fixed a problem with redirection: redirect target URLs were not run through the double-check (the test for whether a URL is already in the index database) before being crawled.

see also: http://forum.yacy-websuche.de/viewtopic.php?f=6&t=348

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4126 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent b183bf6f42
commit c1440d2241

@@ -331,9 +331,14 @@ public final class CrawlWorker extends AbstractCrawlWorker {
// generating url hash
String urlhash = redirectionUrl.hash();
// removing url from loader queue
plasmaCrawlLoader.switchboard.noticeURL.removeByURLHash(urlhash);
// check if the url was already indexed
String dbname = plasmaCrawlLoader.switchboard.urlExists(urlhash);
if (dbname != null) {
this.log.logWarning("CRAWLER Redirection of URL=" + this.url.toString() + " ignored. The url appears already in db " + dbname);
addURLtoErrorDB(plasmaCrawlEURL.DENIED_REDIRECTION_TO_DOUBLE_CONTENT);
return null;
}
// retry crawling with new url
this.url = redirectionUrl;
plasmaHTCache.Entry redirectedEntry = load(crawlingRetryCount-1);

@@ -55,6 +55,7 @@ public class plasmaCrawlEURL {
public static final String DENIED_UNSUPPORTED_CHARSET = "denied_(unsupported_charset)";
public static final String DENIED_REDIRECTION_HEADER_EMPTY = "denied_(redirection_header_empty)";
public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)";
public static final String DENIED_REDIRECTION_TO_DOUBLE_CONTENT = "denied_(redirection_to_double_content)";
public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_";
public static final String DENIED_CONTENT_DECODING_ERROR = "denied_(content_decoding_error)";
public static final String DENIED_FILESIZE_LIMIT_EXCEEDED = "denied_(filesize_limit_exceeded)";

Loading…
Cancel
Save