*) adding more URLs to the error URL DB

- old error strings were replaced with their corresponding constants
   See: http://www.yacy-forum.de/viewtopic.php?t=2638

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2360 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent d56f06401e
commit 9f298083cd

@ -6,6 +6,10 @@
// Frankfurt, Germany, 2004 // Frankfurt, Germany, 2004
// last major change: 09.08.2004 // last major change: 09.08.2004
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or // the Free Software Foundation; either version 2 of the License, or
@ -59,6 +63,64 @@ import de.anomic.tools.bitfield;
public class plasmaCrawlEURL extends indexURL { public class plasmaCrawlEURL extends indexURL {
/* =======================================================================
* Failure reason constants
*
* Reason strings that are returned by the crawl stacker and stored with
* entries of the error URL DB when a URL is rejected or its processing
* fails. Use these constants instead of repeating the literals.
* ======================================================================= */
// invalid urls: the URL itself (or the cache path derived from it) is unusable
public static final String DENIED_URL_NULL = "denied_(url_null)";
public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
public static final String DENIED_PRIVATE_IP_ADDRESS = "denied_(private_ip_address)";
public static final String DENIED_LOOPBACK_IP_ADDRESS = "denied_(loopback_ip_address)";
public static final String DENIED_CACHEFILE_PATH_TOO_LONG = "denied_(cachefile_path_too_long)";
public static final String DENIED_INVALID_CACHEFILE_PATH = "denied_(invalid_cachefile_path)";
// blacklisted/blocked urls: rejected by a blacklist, crawl profile filter or robots.txt
public static final String DENIED_URL_IN_BLACKLIST = "denied_(url_in_blacklist)";
public static final String DENIED_URL_DOES_NOT_MATCH_FILTER = "denied_(does_not_match_filter)";
public static final String DENIED_CGI_URL = "denied_(cgi_url)";
public static final String DENIED_POST_URL = "denied_(post_url)";
public static final String DENIED_NO_MATCH_WITH_DOMAIN_FILTER = "denied_(no_match_with_domain_filter)";
public static final String DENIED_DOMAIN_COUNT_EXCEEDED = "denied_(domain_count_exceeded)";
public static final String DENIED_ROBOTS_TXT = "denied_(robots.txt)";
// wrong content: the response could not be accepted
public static final String DENIED_WRONG_MIMETYPE_OR_EXT = "denied_(wrong_mimetype_or_extension)";
public static final String DENIED_REDIRECTION_HEADER_EMPTY = "denied_(redirection_header_empty)";
public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)";
// PREFIX ONLY: the parenthesis is left open on purpose; the caller appends
// the HTTP status code followed by ")" to build the complete reason
public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_";
public static final String DENIED_CONTENT_DECODING_ERROR = "denied_(content_decoding_error)";
// network errors
public static final String DENIED_UNKNOWN_HOST = "denied_(unknown_host)";
public static final String DENIED_NO_ROUTE_TO_HOST = "denied_(no_route_to_host)";
public static final String DENIED_NETWORK_IS_UNREACHABLE = "denied_(Network_is_unreachable)";
// connection errors
public static final String DENIED_CONNECTION_ERROR = "denied_(socket_connection_error)";
public static final String DENIED_CONNECTION_BIND_EXCEPTION = "denied_(connection_bind_exception)";
public static final String DENIED_CONNECTION_TIMEOUT = "denied_(connection_timeout)";
public static final String DENIED_CONNECTION_REFUSED = "denied_(connection_refused)";
public static final String DENIED_SSL_UNTRUSTED_CERT = "denied_(No_trusted_ssl_certificate_found)";
// double registered errors
// PREFIX ONLY: the parenthesis is left open on purpose; the caller appends
// the name of the place where the URL is already registered followed by ")"
public static final String DOUBLE_REGISTERED = "double_(registered_in_";
// server errors
public static final String DENIED_OUT_OF_DISK_SPACE = "denied_(out_of_disk_space)";
public static final String DENIED_SERVER_SHUTDOWN = "denied_(server_shutdown)";
// Parser errors
public static final String DENIED_PARSER_ERROR = "denied_(parser_error)";
// NOTE(review): the value is spelled "parseabel"; before correcting it,
// confirm whether already stored reasons or other code depend on this text
public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)";
// indexing errors
public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)";
public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)";
/* =======================================================================
* Other object variables
* ======================================================================= */
private LinkedList rejectedStack = new LinkedList(); // strings: url private LinkedList rejectedStack = new LinkedList(); // strings: url
public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime) { public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime) {

@ -231,7 +231,7 @@ public final class plasmaCrawlStacker {
// strange errors // strange errors
if (nexturlString == null) { if (nexturlString == null) {
reason = "denied_(url_null)"; reason = plasmaCrawlEURL.DENIED_URL_NULL;
this.log.logSevere("Wrong URL in stackCrawl: url=null"); this.log.logSevere("Wrong URL in stackCrawl: url=null");
return reason; return reason;
} }
@ -256,7 +256,7 @@ public final class plasmaCrawlStacker {
try { try {
nexturl = new URL(nexturlString); nexturl = new URL(nexturlString);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
reason = "denied_(url_'" + nexturlString + "'_wrong)"; reason = plasmaCrawlEURL.DENIED_MALFORMED_URL;
this.log.logSevere("Wrong URL in stackCrawl: " + nexturlString + this.log.logSevere("Wrong URL in stackCrawl: " + nexturlString +
". Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); ". Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason; return reason;
@ -265,17 +265,17 @@ public final class plasmaCrawlStacker {
// check if ip is local ip address // check if ip is local ip address
InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost()); InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost());
if (hostAddress == null) { if (hostAddress == null) {
reason = "denied_(unknown_host)"; reason = plasmaCrawlEURL.DENIED_UNKNOWN_HOST;
this.log.logFine("Unknown host in URL '" + nexturlString + "'. " + this.log.logFine("Unknown host in URL '" + nexturlString + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason; return reason;
} else if (hostAddress.isSiteLocalAddress()) { } else if (hostAddress.isSiteLocalAddress()) {
reason = "denied_(private_ip_address)"; reason = plasmaCrawlEURL.DENIED_PRIVATE_IP_ADDRESS;
this.log.logFine("Host in URL '" + nexturlString + "' has private IP address. " + this.log.logFine("Host in URL '" + nexturlString + "' has private IP address. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason; return reason;
} else if (hostAddress.isLoopbackAddress()) { } else if (hostAddress.isLoopbackAddress()) {
reason = "denied_(loopback_ip_address)"; reason = plasmaCrawlEURL.DENIED_LOOPBACK_IP_ADDRESS;
this.log.logFine("Host in URL '" + nexturlString + "' has loopback IP address. " + this.log.logFine("Host in URL '" + nexturlString + "' has loopback IP address. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason; return reason;
@ -283,7 +283,7 @@ public final class plasmaCrawlStacker {
// check blacklist // check blacklist
if (plasmaSwitchboard.urlBlacklist.isListed(nexturl)) { if (plasmaSwitchboard.urlBlacklist.isListed(nexturl)) {
reason = "denied_(url_in_blacklist)"; reason = plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST;
this.log.logFine("URL '" + nexturlString + "' is in blacklist. " + this.log.logFine("URL '" + nexturlString + "' is in blacklist. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason; return reason;
@ -291,7 +291,7 @@ public final class plasmaCrawlStacker {
// filter deny // filter deny
if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) { if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) {
reason = "denied_(does_not_match_filter)"; reason = plasmaCrawlEURL.DENIED_URL_DOES_NOT_MATCH_FILTER;
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
@ -302,7 +302,7 @@ public final class plasmaCrawlStacker {
// deny cgi // deny cgi
if (plasmaHTCache.isCGI(nexturlString)) { if (plasmaHTCache.isCGI(nexturlString)) {
reason = "denied_(cgi_url)"; reason = plasmaCrawlEURL.DENIED_CGI_URL;
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
@ -313,7 +313,7 @@ public final class plasmaCrawlStacker {
// deny post properties // deny post properties
if ((plasmaHTCache.isPOST(nexturlString)) && (profile != null) && (!(profile.crawlingQ()))) { if ((plasmaHTCache.isPOST(nexturlString)) && (profile != null) && (!(profile.crawlingQ()))) {
reason = "denied_(post_url)"; reason = plasmaCrawlEURL.DENIED_POST_URL;
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
@ -329,7 +329,7 @@ public final class plasmaCrawlStacker {
// deny urls that do not match with the profile domain list // deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(nexturl.getHost()))) { if (!(profile.grantedDomAppearance(nexturl.getHost()))) {
reason = "denied_(no_match_with_domain_filter)"; reason = plasmaCrawlEURL.DENIED_NO_MATCH_WITH_DOMAIN_FILTER;
this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " + this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason; return reason;
@ -337,7 +337,7 @@ public final class plasmaCrawlStacker {
// deny urls that exceed allowed number of occurrences // deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(nexturl.getHost()))) { if (!(profile.grantedDomCount(nexturl.getHost()))) {
reason = "denied_(domain_count_exceeded)"; reason = plasmaCrawlEURL.DENIED_DOMAIN_COUNT_EXCEEDED;
this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+ this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason; return reason;
@ -352,7 +352,7 @@ public final class plasmaCrawlStacker {
boolean recrawl = (oldEntry != null) && boolean recrawl = (oldEntry != null) &&
(((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) { if ((dbocc != null) && (!(recrawl))) {
reason = "double_(registered_in_" + dbocc + ")"; reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
@ -363,7 +363,7 @@ public final class plasmaCrawlStacker {
// checking robots.txt // checking robots.txt
if (robotsParser.isDisallowed(nexturl)) { if (robotsParser.isDisallowed(nexturl)) {
reason = "denied_(robots.txt)"; reason = plasmaCrawlEURL.DENIED_ROBOTS_TXT;
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/

@ -281,7 +281,7 @@ public final class plasmaCrawlWorker extends Thread {
private static plasmaHTCache.Entry load( private static plasmaHTCache.Entry load(
URL url, URL url,
String name, String name,
String referer, String refererURLString,
String initiator, String initiator,
int depth, int depth,
plasmaCrawlProfile.entry profile, plasmaCrawlProfile.entry profile,
@ -308,29 +308,19 @@ public final class plasmaCrawlWorker extends Thread {
boolean ssl = url.getProtocol().equals("https"); boolean ssl = url.getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443 : 80; if (port < 0) port = (ssl) ? 443 : 80;
refererURLString = (refererURLString == null) ? "" : refererURLString.trim();
// check if url is in blacklist // check if url is in blacklist
String hostlow = host.toLowerCase(); String hostlow = host.toLowerCase();
if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) { if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist."); log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry( addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST, new bitfield(indexURL.urlFlagLength));
url,
referer,
initiator,
yacyCore.seedDB.mySeed.hash,
name,
"denied_(url_in_blacklist)",
new bitfield(indexURL.urlFlagLength)
);
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);
return null; return null;
} }
// TODO: resolve yacy and yacyh domains // TODO: resolve yacy and yacyh domains
//String yAddress = yacyCore.seedDB.resolveYacyAddress(host); //String yAddress = yacyCore.seedDB.resolveYacyAddress(host);
referer = (referer == null) ? "" : referer.trim();
// take a file from the net // take a file from the net
httpc remote = null; httpc remote = null;
plasmaHTCache.Entry htCache = null; plasmaHTCache.Entry htCache = null;
@ -338,7 +328,7 @@ public final class plasmaCrawlWorker extends Thread {
// create a request header // create a request header
httpHeader requestHeader = new httpHeader(); httpHeader requestHeader = new httpHeader();
requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.crawlerUserAgent); requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.crawlerUserAgent);
requestHeader.put(httpHeader.REFERER, referer); requestHeader.put(httpHeader.REFERER, refererURLString);
requestHeader.put(httpHeader.ACCEPT_LANGUAGE, sb.getConfig("crawler.acceptLanguage","en-us,en;q=0.5")); requestHeader.put(httpHeader.ACCEPT_LANGUAGE, sb.getConfig("crawler.acceptLanguage","en-us,en;q=0.5"));
requestHeader.put(httpHeader.ACCEPT_CHARSET, sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7")); requestHeader.put(httpHeader.ACCEPT_CHARSET, sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
if (useContentEncodingGzip) requestHeader.put(httpHeader.ACCEPT_ENCODING, "gzip,deflate"); if (useContentEncodingGzip) requestHeader.put(httpHeader.ACCEPT_ENCODING, "gzip,deflate");
@ -358,15 +348,15 @@ public final class plasmaCrawlWorker extends Thread {
if (res.status.startsWith("200") || res.status.startsWith("203")) { if (res.status.startsWith("200") || res.status.startsWith("203")) {
// the transfer is ok // the transfer is ok
// TODO: aborting download if content is to long ... // create a new cache entry
//long contentLength = res.responseHeader.contentLength();
htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile); htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile);
// abort the download if the cache file path is too long
if (htCache.cacheFile.getAbsolutePath().length() > serverSystem.maxPathLength) { if (htCache.cacheFile.getAbsolutePath().length() > serverSystem.maxPathLength) {
remote.close(); remote.close();
log.logInfo("REJECTED URL " + url.toString() + " because path too long '" + log.logInfo("REJECTED URL " + url.toString() + " because path too long '" + cacheManager.cachePath.getAbsolutePath() + "'");
cacheManager.cachePath.getAbsolutePath() + "'"); addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_CACHEFILE_PATH_TOO_LONG, new bitfield(indexURL.urlFlagLength));
return (htCache = null); return (htCache = null);
} }
@ -377,6 +367,7 @@ public final class plasmaCrawlWorker extends Thread {
log.logInfo("REJECTED URL " + url.toString() + " because of an invalid file path ('" + log.logInfo("REJECTED URL " + url.toString() + " because of an invalid file path ('" +
htCache.cacheFile.getCanonicalPath() + "' does not start with '" + htCache.cacheFile.getCanonicalPath() + "' does not start with '" +
cacheManager.cachePath.getAbsolutePath() + "')."); cacheManager.cachePath.getAbsolutePath() + "').");
addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_INVALID_CACHEFILE_PATH, new bitfield(indexURL.urlFlagLength));
return (htCache = null); return (htCache = null);
} }
@ -408,6 +399,7 @@ public final class plasmaCrawlWorker extends Thread {
// if the response has not the right file type then reject file // if the response has not the right file type then reject file
remote.close(); remote.close();
log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.responseHeader.mime() + " for URL " + url.toString()); log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.responseHeader.mime() + " for URL " + url.toString());
addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT, new bitfield(indexURL.urlFlagLength));
htCache = null; htCache = null;
} }
} catch (SocketException e) { } catch (SocketException e) {
@ -418,6 +410,7 @@ public final class plasmaCrawlWorker extends Thread {
// and most possible corrupted // and most possible corrupted
if (cacheFile.exists()) cacheFile.delete(); if (cacheFile.exists()) cacheFile.delete();
log.logSevere("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString()); log.logSevere("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString());
addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_CONNECTION_ERROR, new bitfield(indexURL.urlFlagLength));
htCache = null; htCache = null;
} }
} else if (res.status.startsWith("30")) { } else if (res.status.startsWith("30")) {
@ -429,6 +422,7 @@ public final class plasmaCrawlWorker extends Thread {
if (redirectionUrlString.length() == 0) { if (redirectionUrlString.length() == 0) {
log.logWarning("CRAWLER Redirection of URL=" + url.toString() + " aborted. Location header is empty."); log.logWarning("CRAWLER Redirection of URL=" + url.toString() + " aborted. Location header is empty.");
addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_REDIRECTION_HEADER_EMPTY, new bitfield(indexURL.urlFlagLength));
return null; return null;
} }
@ -449,6 +443,7 @@ public final class plasmaCrawlWorker extends Thread {
// if we are already doing a shutdown we don't need to retry crawling // if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) { if (Thread.currentThread().isInterrupted()) {
log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown."); log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown.");
addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_SERVER_SHUTDOWN, new bitfield(indexURL.urlFlagLength));
return null; return null;
} }
@ -461,7 +456,7 @@ public final class plasmaCrawlWorker extends Thread {
// retry crawling with new url // retry crawling with new url
plasmaHTCache.Entry redirectedEntry = load(redirectionUrl, plasmaHTCache.Entry redirectedEntry = load(redirectionUrl,
name, name,
referer, refererURLString,
initiator, initiator,
depth, depth,
profile, profile,
@ -492,13 +487,14 @@ public final class plasmaCrawlWorker extends Thread {
} }
} else { } else {
log.logInfo("Redirection counter exceeded for URL " + url.toString() + ". Processing aborted."); log.logInfo("Redirection counter exceeded for URL " + url.toString() + ". Processing aborted.");
addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_REDIRECTION_COUNTER_EXCEEDED, new bitfield(indexURL.urlFlagLength));
} }
}else { }else {
// if the response has not the right response type then reject file // if the response has not the right response type then reject file
log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for URL " + url.toString()); log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for URL " + url.toString());
// not processed any further
// TODO: add the url into the error url DB // not processed any further
addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_WRONG_HTTP_STATUSCODE + res.statusCode + ")", new bitfield(indexURL.urlFlagLength));
} }
if (remote != null) remote.close(); if (remote != null) remote.close();
@ -506,6 +502,7 @@ public final class plasmaCrawlWorker extends Thread {
} catch (Exception e) { } catch (Exception e) {
boolean retryCrawling = false; boolean retryCrawling = false;
String errorMsg = e.getMessage(); String errorMsg = e.getMessage();
String failreason = null;
if ((e instanceof IOException) && if ((e instanceof IOException) &&
(errorMsg != null) && (errorMsg != null) &&
@ -513,47 +510,61 @@ public final class plasmaCrawlWorker extends Thread {
(Thread.currentThread().isInterrupted()) (Thread.currentThread().isInterrupted())
) { ) {
log.logInfo("CRAWLER Interruption detected because of server shutdown."); log.logInfo("CRAWLER Interruption detected because of server shutdown.");
failreason = plasmaCrawlEURL.DENIED_SERVER_SHUTDOWN;
} else if (e instanceof MalformedURLException) { } else if (e instanceof MalformedURLException) {
log.logWarning("CRAWLER Malformed URL '" + url.toString() + "' detected. "); log.logWarning("CRAWLER Malformed URL '" + url.toString() + "' detected. ");
failreason = plasmaCrawlEURL.DENIED_MALFORMED_URL;
} else if (e instanceof NoRouteToHostException) { } else if (e instanceof NoRouteToHostException) {
log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + url.toString() + "'."); log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + url.toString() + "'.");
failreason = plasmaCrawlEURL.DENIED_NO_ROUTE_TO_HOST;
} else if ((e instanceof UnknownHostException) || } else if ((e instanceof UnknownHostException) ||
((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) { ((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) {
log.logWarning("CRAWLER Unknown host in URL '" + url.toString() + "'. " + log.logWarning("CRAWLER Unknown host in URL '" + url.toString() + "'. " +
"Referer URL: " + ((referer == null) ?"Unknown":referer)); "Referer URL: " + ((refererURLString == null) ?"Unknown":refererURLString));
failreason = plasmaCrawlEURL.DENIED_UNKNOWN_HOST;
} else if (e instanceof java.net.BindException) { } else if (e instanceof java.net.BindException) {
log.logWarning("CRAWLER BindException detected while trying to download content from '" + url.toString() + log.logWarning("CRAWLER BindException detected while trying to download content from '" + url.toString() +
"'. Retrying request."); "'. Retrying request.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_BIND_EXCEPTION;
retryCrawling = true; retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("Corrupt GZIP trailer") >= 0)) { } else if ((errorMsg != null) && (errorMsg.indexOf("Corrupt GZIP trailer") >= 0)) {
log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + url.toString() + log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + url.toString() +
"'. Retrying request without using gzip content encoding."); "'. Retrying request without using gzip content encoding.");
failreason = plasmaCrawlEURL.DENIED_CONTENT_DECODING_ERROR;
retryCrawling = true; retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) { } else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) {
log.logWarning("CRAWLER Read timeout while receiving content from '" + url.toString() + log.logWarning("CRAWLER Read timeout while receiving content from '" + url.toString() +
"'. Retrying request."); "'. Retrying request.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
retryCrawling = true; retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) { } else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) {
log.logWarning("CRAWLER Timeout while trying to connect to '" + url.toString() + log.logWarning("CRAWLER Timeout while trying to connect to '" + url.toString() +
"'. Retrying request."); "'. Retrying request.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
retryCrawling = true; retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) { } else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) {
log.logWarning("CRAWLER Connection timeout while receiving content from '" + url.toString() + log.logWarning("CRAWLER Connection timeout while receiving content from '" + url.toString() +
"'. Retrying request."); "'. Retrying request.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
retryCrawling = true; retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) { } else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) {
log.logWarning("CRAWLER Connection refused while trying to connect to '" + url.toString() + "'."); log.logWarning("CRAWLER Connection refused while trying to connect to '" + url.toString() + "'.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_REFUSED;
} else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) { } else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) {
log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + url.toString() + "'. " + log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + url.toString() + "'. " +
"Pausing crawlers. "); "Pausing crawlers. ");
plasmaCrawlLoader.switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL); plasmaCrawlLoader.switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
plasmaCrawlLoader.switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); plasmaCrawlLoader.switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
failreason = plasmaCrawlEURL.DENIED_OUT_OF_DISK_SPACE;
} else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >=0)) { } else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >=0)) {
log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + url.toString() + "'. "); log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + url.toString() + "'. ");
failreason = plasmaCrawlEURL.DENIED_NETWORK_IS_UNREACHABLE;
} else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found")>= 0)) { } else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found")>= 0)) {
log.logSevere("CRAWLER No trusted certificate found for URL '" + url.toString() + "'. "); log.logSevere("CRAWLER No trusted certificate found for URL '" + url.toString() + "'. ");
failreason = plasmaCrawlEURL.DENIED_SSL_UNTRUSTED_CERT;
} else { } else {
log.logSevere("CRAWLER Unexpected Error with URL '" + url.toString() + "': " + e.toString(),e); log.logSevere("CRAWLER Unexpected Error with URL '" + url.toString() + "': " + e.toString(),e);
failreason = plasmaCrawlEURL.DENIED_CONNECTION_ERROR;
} }
if (retryCrawling) { if (retryCrawling) {
@ -573,7 +584,7 @@ public final class plasmaCrawlWorker extends Thread {
// retry crawling // retry crawling
return load(url, return load(url,
name, name,
referer, refererURLString,
initiator, initiator,
depth, depth,
profile, profile,
@ -586,10 +597,43 @@ public final class plasmaCrawlWorker extends Thread {
false false
); );
} }
if (failreason != null) {
addURLtoErrorDB(url, refererURLString, initiator, name, failreason, new bitfield(indexURL.urlFlagLength));
}
return null; return null;
} finally { } finally {
if (remote != null) httpc.returnInstance(remote); if (remote != null) httpc.returnInstance(remote);
} }
} }
/**
 * Creates an entry for a failed URL in the error URL DB, stores it and
 * pushes it onto the error URL stack.
 *
 * @param url            the URL whose processing failed
 * @param referrerString the referrer URL as a string; it is converted into
 *                       a URL hash before the entry is created. If it is
 *                       <code>null</code>, a <code>null</code> hash is used
 * @param initiator      initiator value handed over to the new entry
 * @param name           name value handed over to the new entry
 * @param failreason     the reason string recorded for the failure
 * @param flags          the flags recorded with the entry
 */
private static void addURLtoErrorDB(
        URL url,
        String referrerString,
        String initiator,
        String name,
        String failreason,
        bitfield flags
) {
    // the switchboard gives access to the URL pool
    plasmaSwitchboard switchboard = plasmaCrawlLoader.switchboard;

    // the error URL DB stores the referrer as a hash, not as a string
    String referrerHash = null;
    if (referrerString != null) {
        referrerHash = indexURL.urlHash(referrerString);
    }

    // build the error entry for this URL
    plasmaCrawlEURL.Entry errorEntry = switchboard.urlPool.errorURL.newEntry(
            url,
            referrerHash,
            initiator,
            yacyCore.seedDB.mySeed.hash,
            name,
            failreason,
            flags
    );

    // store the entry, then push it onto the error URL stack
    errorEntry.store();
    switchboard.urlPool.errorURL.stackPushEntry(errorEntry);
}
} }

@ -1362,14 +1362,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
document = parser.parseSource(entry.url(), mimeType, entry.cacheFile()); document = parser.parseSource(entry.url(), mimeType, entry.cacheFile());
} else { } else {
log.logFine("(Parser) '" + entry.normalizedURLString() + "' cannot be parsed, no resource available"); log.logFine("(Parser) '" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength));
return; return;
} }
if (document == null) { if (document == null) {
log.logSevere("(Parser) '" + entry.normalizedURLString() + "' parse failure"); log.logSevere("(Parser) '" + entry.normalizedURLString() + "' parse failure");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_PARSER_ERROR, new bitfield(indexURL.urlFlagLength));
return; return;
} }
} else { } else {
log.logFine("(Parser) '" + entry.normalizedURLString() + "'. Unsupported mimeType '" + ((mimeType == null) ? "null" : mimeType) + "'."); log.logFine("(Parser) '" + entry.normalizedURLString() + "'. Unsupported mimeType '" + ((mimeType == null) ? "null" : mimeType) + "'.");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT, new bitfield(indexURL.urlFlagLength));
return; return;
} }
parsingEndTime = System.currentTimeMillis(); parsingEndTime = System.currentTimeMillis();
@ -1415,7 +1418,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
referrerHash = indexURL.urlHash(referrerURL); referrerHash = indexURL.urlHash(referrerURL);
if (referrerHash == null) referrerHash = indexURL.dummyHash; if (referrerHash == null) referrerHash = indexURL.dummyHash;
String noIndexReason = "unspecified"; String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
if (processCase == 4) { if (processCase == 4) {
// proxy-load // proxy-load
noIndexReason = entry.shallIndexCacheForProxy(); noIndexReason = entry.shallIndexCacheForProxy();
@ -1558,22 +1561,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} else { } else {
log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase); log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerHash, initiatorHash, descr, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexURL.urlFlagLength));
} }
} catch (Exception ee) { } catch (Exception ee) {
log.logSevere("Could not index URL " + entry.url() + ": " + ee.getMessage(), ee); log.logSevere("Could not index URL " + entry.url() + ": " + ee.getMessage(), ee);
if ((processCase == 6) && (initiator != null)) { if ((processCase == 6) && (initiator != null)) {
yacyClient.crawlReceipt(initiator, "crawl", "exception", ee.getMessage(), null, ""); yacyClient.crawlReceipt(initiator, "crawl", "exception", ee.getMessage(), null, "");
} }
addURLtoErrorDB(entry.url(), referrerHash, initiatorHash, descr, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexURL.urlFlagLength));
} }
} else { } else {
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
plasmaCrawlEURL.Entry ee = urlPool.errorURL.newEntry(entry.url(), referrerHash, addURLtoErrorDB(entry.url(), referrerHash, initiatorHash, descr, noIndexReason, new bitfield(indexURL.urlFlagLength));
((entry.proxy()) ? indexURL.dummyHash : entry.initiator()),
yacyCore.seedDB.mySeed.hash,
descr, noIndexReason, new bitfield(indexURL.urlFlagLength));
ee.store();
urlPool.errorURL.stackPushEntry(ee);
if ((processCase == 6) && (initiator != null)) { if ((processCase == 6) && (initiator != null)) {
yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, ""); yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, "");
} }
@ -1687,6 +1687,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return; return;
} }
// convert the referrer hash into the corresponding URL
URL refererURL = null; URL refererURL = null;
String refererHash = urlEntry.referrerHash(); String refererHash = urlEntry.referrerHash();
if ((refererHash != null) && (!refererHash.equals(indexURL.dummyHash))) try { if ((refererHash != null) && (!refererHash.equals(indexURL.dummyHash))) try {
@ -2246,6 +2247,30 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} }
/**
 * Registers a URL that could not be processed in the error-URL database.
 *
 * <p>A new entry is built through {@code urlPool.errorURL.newEntry(...)},
 * persisted with {@code store()} and afterwards pushed onto the error-URL
 * stack. The value of {@code yacyCore.seedDB.mySeed.hash} is passed along
 * as the fourth entry field (presumably identifying the local peer as the
 * executor -- confirm against {@code plasmaCrawlEURL.newEntry}).
 *
 * @param url          the URL that failed
 * @param referrerHash hash of the referring URL, handed through unchanged
 * @param initiator    initiator hash, handed through unchanged
 * @param name         name/description stored with the entry (callers pass
 *                     the anchor name or document description)
 * @param failreason   failure reason string; callers pass the
 *                     {@code plasmaCrawlEURL.DENIED_*} constants
 * @param flags        flag bitfield stored with the entry
 */
private void addURLtoErrorDB(URL url, String referrerHash, String initiator,
        String name, String failreason, bitfield flags) {
    // build the error entry for this URL
    final plasmaCrawlEURL.Entry errorEntry = urlPool.errorURL.newEntry(
            url, referrerHash, initiator, yacyCore.seedDB.mySeed.hash, name, failreason, flags);

    // persist first, then make it visible on the stack
    errorEntry.store();
    urlPool.errorURL.stackPushEntry(errorEntry);
}
public void terminate(long delay) { public void terminate(long delay) {
if (delay <= 0) throw new IllegalArgumentException("The shutdown delay must be greater than 0."); if (delay <= 0) throw new IllegalArgumentException("The shutdown delay must be greater than 0.");
(new delayedShutdown(this,delay)).start(); (new delayedShutdown(this,delay)).start();

@ -334,6 +334,10 @@ public class plasmaSwitchboardQueue {
} }
return referrerURL; return referrerURL;
} }
/**
 * Returns the referrer hash held by this queue entry, exactly as stored
 * in the {@code referrerHash} field (no conversion or defaulting is done).
 *
 * @return the stored referrer hash
 *         (NOTE(review): may possibly be {@code null} -- confirm where the field is set)
 */
public String referrerHash() {
return referrerHash;
}
public String anchorName() { public String anchorName() {
return anchorName; return anchorName;

Loading…
Cancel
Save