From 9f298083cd2154de7a0470e83c5321c81749e2de Mon Sep 17 00:00:00 2001 From: theli Date: Mon, 7 Aug 2006 15:11:14 +0000 Subject: [PATCH] *) adding more urls to the error url - old error strings were replaced with their corresponding constants See: http://www.yacy-forum.de/viewtopic.php?t=2638 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2360 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaCrawlEURL.java | 62 ++++++++++++ .../de/anomic/plasma/plasmaCrawlStacker.java | 26 ++--- .../de/anomic/plasma/plasmaCrawlWorker.java | 98 ++++++++++++++----- .../de/anomic/plasma/plasmaSwitchboard.java | 39 ++++++-- .../anomic/plasma/plasmaSwitchboardQueue.java | 4 + 5 files changed, 182 insertions(+), 47 deletions(-) diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 6b54cf233..e393e651c 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -6,6 +6,10 @@ // Frankfurt, Germany, 2004 // last major change: 09.08.2004 // +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -59,6 +63,64 @@ import de.anomic.tools.bitfield; public class plasmaCrawlEURL extends indexURL { + /* ======================================================================= + * Failure reason constants + * ======================================================================= */ + // invalid urls + public static final String DENIED_URL_NULL = "denied_(url_null)"; + public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)"; + public static final String DENIED_PRIVATE_IP_ADDRESS = "denied_(private_ip_address)"; + public static final String
DENIED_LOOPBACK_IP_ADDRESS = "denied_(loopback_ip_address)"; + public static final String DENIED_CACHEFILE_PATH_TOO_LONG = "denied_(cachefile_path_too_long)"; + public static final String DENIED_INVALID_CACHEFILE_PATH = "denied_(invalid_cachefile_path)"; + + // blacklisted/blocked urls + public static final String DENIED_URL_IN_BLACKLIST = "denied_(url_in_blacklist)"; + public static final String DENIED_URL_DOES_NOT_MATCH_FILTER = "denied_(does_not_match_filter)"; + public static final String DENIED_CGI_URL = "denied_(cgi_url)"; + public static final String DENIED_POST_URL = "denied_(post_url)"; + public static final String DENIED_NO_MATCH_WITH_DOMAIN_FILTER = "denied_(no_match_with_domain_filter)"; + public static final String DENIED_DOMAIN_COUNT_EXCEEDED = "denied_(domain_count_exceeded)"; + public static final String DENIED_ROBOTS_TXT = "denied_(robots.txt)"; + + // wrong content + public static final String DENIED_WRONG_MIMETYPE_OR_EXT = "denied_(wrong_mimetype_or_extension)"; + public static final String DENIED_REDIRECTION_HEADER_EMPTY = "denied_(redirection_header_empty)"; + public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)"; + public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_"; + public static final String DENIED_CONTENT_DECODING_ERROR = "denied_(content_decoding_error)"; + + // network errors + public static final String DENIED_UNKNOWN_HOST = "denied_(unknown_host)"; + public static final String DENIED_NO_ROUTE_TO_HOST = "denied_(no_route_to_host)"; + public static final String DENIED_NETWORK_IS_UNREACHABLE = "denied_(Network_is_unreachable)"; + + // connection errors + public static final String DENIED_CONNECTION_ERROR = "denied_(socket_connection_error)"; + public static final String DENIED_CONNECTION_BIND_EXCEPTION = "denied_(connection_bind_exception)"; + public static final String DENIED_CONNECTION_TIMEOUT = "denied_(connection_timeout)"; + public static 
final String DENIED_CONNECTION_REFUSED = "denied_(connection_refused)"; + public static final String DENIED_SSL_UNTRUSTED_CERT = "denied_(No_trusted_ssl_certificate_found)"; + + // double registered errors + public static final String DOUBLE_REGISTERED = "double_(registered_in_"; + + // server errors + public static final String DENIED_OUT_OF_DISK_SPACE = "denied_(out_of_disk_space)"; + public static final String DENIED_SERVER_SHUTDOWN = "denied_(server_shutdown)"; + + // Parser errors + public static final String DENIED_PARSER_ERROR = "denied_(parser_error)"; + public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)"; + + // indexing errors + public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)"; + public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)"; + + + /* ======================================================================= + * Other object variables + * ======================================================================= */ private LinkedList rejectedStack = new LinkedList(); // strings: url public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime) { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 69727a2f2..3ee99b130 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -231,7 +231,7 @@ public final class plasmaCrawlStacker { // strange errors if (nexturlString == null) { - reason = "denied_(url_null)"; + reason = plasmaCrawlEURL.DENIED_URL_NULL; this.log.logSevere("Wrong URL in stackCrawl: url=null"); return reason; } @@ -256,7 +256,7 @@ public final class plasmaCrawlStacker { try { nexturl = new URL(nexturlString); } catch (MalformedURLException e) { - reason = "denied_(url_'" + nexturlString + "'_wrong)"; + reason = plasmaCrawlEURL.DENIED_MALFORMED_URL; 
this.log.logSevere("Wrong URL in stackCrawl: " + nexturlString + ". Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; @@ -265,17 +265,17 @@ public final class plasmaCrawlStacker { // check if ip is local ip address InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost()); if (hostAddress == null) { - reason = "denied_(unknown_host)"; + reason = plasmaCrawlEURL.DENIED_UNKNOWN_HOST; this.log.logFine("Unknown host in URL '" + nexturlString + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; } else if (hostAddress.isSiteLocalAddress()) { - reason = "denied_(private_ip_address)"; + reason = plasmaCrawlEURL.DENIED_PRIVATE_IP_ADDRESS; this.log.logFine("Host in URL '" + nexturlString + "' has private IP address. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; } else if (hostAddress.isLoopbackAddress()) { - reason = "denied_(loopback_ip_address)"; + reason = plasmaCrawlEURL.DENIED_LOOPBACK_IP_ADDRESS; this.log.logFine("Host in URL '" + nexturlString + "' has loopback IP address. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; @@ -283,7 +283,7 @@ public final class plasmaCrawlStacker { // check blacklist if (plasmaSwitchboard.urlBlacklist.isListed(nexturl)) { - reason = "denied_(url_in_blacklist)"; + reason = plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST; this.log.logFine("URL '" + nexturlString + "' is in blacklist. 
" + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; @@ -291,7 +291,7 @@ public final class plasmaCrawlStacker { // filter deny if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) { - reason = "denied_(does_not_match_filter)"; + reason = plasmaCrawlEURL.DENIED_URL_DOES_NOT_MATCH_FILTER; /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ @@ -302,7 +302,7 @@ public final class plasmaCrawlStacker { // deny cgi if (plasmaHTCache.isCGI(nexturlString)) { - reason = "denied_(cgi_url)"; + reason = plasmaCrawlEURL.DENIED_CGI_URL; /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ @@ -313,7 +313,7 @@ public final class plasmaCrawlStacker { // deny post properties if ((plasmaHTCache.isPOST(nexturlString)) && (profile != null) && (!(profile.crawlingQ()))) { - reason = "denied_(post_url)"; + reason = plasmaCrawlEURL.DENIED_POST_URL; /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ @@ -329,7 +329,7 @@ public final class plasmaCrawlStacker { // deny urls that do not match with the profile domain list if (!(profile.grantedDomAppearance(nexturl.getHost()))) { - reason = "denied_(no_match_with_domain_filter)"; + reason = plasmaCrawlEURL.DENIED_NO_MATCH_WITH_DOMAIN_FILTER; this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. 
" + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; @@ -337,7 +337,7 @@ public final class plasmaCrawlStacker { // deny urls that exceed allowed number of occurrences if (!(profile.grantedDomCount(nexturl.getHost()))) { - reason = "denied_(domain_count_exceeded)"; + reason = plasmaCrawlEURL.DENIED_DOMAIN_COUNT_EXCEEDED; this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+ "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; @@ -352,7 +352,7 @@ public final class plasmaCrawlStacker { boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); if ((dbocc != null) && (!(recrawl))) { - reason = "double_(registered_in_" + dbocc + ")"; + reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")"; /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ @@ -363,7 +363,7 @@ public final class plasmaCrawlStacker { // checking robots.txt if (robotsParser.isDisallowed(nexturl)) { - reason = "denied_(robots.txt)"; + reason = plasmaCrawlEURL.DENIED_ROBOTS_TXT; /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index 9fc3ffedd..86b469cfe 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -281,7 +281,7 @@ public final class plasmaCrawlWorker extends Thread { private static plasmaHTCache.Entry load( URL url, String name, - String referer, + String refererURLString, String initiator, int depth, plasmaCrawlProfile.entry profile, @@ -308,29 +308,19 @@ public final class 
plasmaCrawlWorker extends Thread { boolean ssl = url.getProtocol().equals("https"); if (port < 0) port = (ssl) ? 443 : 80; + refererURLString = (refererURLString == null) ? "" : refererURLString.trim(); + // check if url is in blacklist String hostlow = host.toLowerCase(); if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) { log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist."); - plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry( - url, - referer, - initiator, - yacyCore.seedDB.mySeed.hash, - name, - "denied_(url_in_blacklist)", - new bitfield(indexURL.urlFlagLength) - ); - ee.store(); - sb.urlPool.errorURL.stackPushEntry(ee); + addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST, new bitfield(indexURL.urlFlagLength)); return null; } // TODO: resolve yacy and yacyh domains //String yAddress = yacyCore.seedDB.resolveYacyAddress(host); - referer = (referer == null) ? "" : referer.trim(); - // take a file from the net httpc remote = null; plasmaHTCache.Entry htCache = null; @@ -338,7 +328,7 @@ public final class plasmaCrawlWorker extends Thread { // create a request header httpHeader requestHeader = new httpHeader(); requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.crawlerUserAgent); - requestHeader.put(httpHeader.REFERER, referer); + requestHeader.put(httpHeader.REFERER, refererURLString); requestHeader.put(httpHeader.ACCEPT_LANGUAGE, sb.getConfig("crawler.acceptLanguage","en-us,en;q=0.5")); requestHeader.put(httpHeader.ACCEPT_CHARSET, sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7")); if (useContentEncodingGzip) requestHeader.put(httpHeader.ACCEPT_ENCODING, "gzip,deflate"); @@ -358,15 +348,15 @@ public final class plasmaCrawlWorker extends Thread { if (res.status.startsWith("200") || res.status.startsWith("203")) { // the transfer is ok - - // TODO: aborting download if content is to long ... 
- //long contentLength = res.responseHeader.contentLength(); - + + // create a new cache entry htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile); + + // aborting download if content is to long ... if (htCache.cacheFile.getAbsolutePath().length() > serverSystem.maxPathLength) { remote.close(); - log.logInfo("REJECTED URL " + url.toString() + " because path too long '" + - cacheManager.cachePath.getAbsolutePath() + "'"); + log.logInfo("REJECTED URL " + url.toString() + " because path too long '" + cacheManager.cachePath.getAbsolutePath() + "'"); + addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_CACHEFILE_PATH_TOO_LONG, new bitfield(indexURL.urlFlagLength)); return (htCache = null); } @@ -377,6 +367,7 @@ public final class plasmaCrawlWorker extends Thread { log.logInfo("REJECTED URL " + url.toString() + " because of an invalid file path ('" + htCache.cacheFile.getCanonicalPath() + "' does not start with '" + cacheManager.cachePath.getAbsolutePath() + "')."); + addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_INVALID_CACHEFILE_PATH, new bitfield(indexURL.urlFlagLength)); return (htCache = null); } @@ -408,6 +399,7 @@ public final class plasmaCrawlWorker extends Thread { // if the response has not the right file type then reject file remote.close(); log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.responseHeader.mime() + " for URL " + url.toString()); + addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT, new bitfield(indexURL.urlFlagLength)); htCache = null; } } catch (SocketException e) { @@ -418,6 +410,7 @@ public final class plasmaCrawlWorker extends Thread { // and most possible corrupted if (cacheFile.exists()) cacheFile.delete(); log.logSevere("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString()); + addURLtoErrorDB(url, refererURLString, initiator, name, 
plasmaCrawlEURL.DENIED_CONNECTION_ERROR, new bitfield(indexURL.urlFlagLength)); htCache = null; } } else if (res.status.startsWith("30")) { @@ -429,6 +422,7 @@ public final class plasmaCrawlWorker extends Thread { if (redirectionUrlString.length() == 0) { log.logWarning("CRAWLER Redirection of URL=" + url.toString() + " aborted. Location header is empty."); + addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_REDIRECTION_HEADER_EMPTY, new bitfield(indexURL.urlFlagLength)); return null; } @@ -449,6 +443,7 @@ public final class plasmaCrawlWorker extends Thread { // if we are already doing a shutdown we don't need to retry crawling if (Thread.currentThread().isInterrupted()) { log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown."); + addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_SERVER_SHUTDOWN, new bitfield(indexURL.urlFlagLength)); return null; } @@ -461,7 +456,7 @@ public final class plasmaCrawlWorker extends Thread { // retry crawling with new url plasmaHTCache.Entry redirectedEntry = load(redirectionUrl, name, - referer, + refererURLString, initiator, depth, profile, @@ -492,13 +487,14 @@ public final class plasmaCrawlWorker extends Thread { } } else { log.logInfo("Redirection counter exceeded for URL " + url.toString() + ". 
Processing aborted."); + addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_REDIRECTION_COUNTER_EXCEEDED, new bitfield(indexURL.urlFlagLength)); } }else { // if the response has not the right response type then reject file log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for URL " + url.toString()); - // not processed any further - // TODO: add the url into the error url DB + // not processed any further + addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_WRONG_HTTP_STATUSCODE + res.statusCode + ")", new bitfield(indexURL.urlFlagLength)); } if (remote != null) remote.close(); @@ -506,6 +502,7 @@ public final class plasmaCrawlWorker extends Thread { } catch (Exception e) { boolean retryCrawling = false; String errorMsg = e.getMessage(); + String failreason = null; if ((e instanceof IOException) && (errorMsg != null) && @@ -513,47 +510,61 @@ public final class plasmaCrawlWorker extends Thread { (Thread.currentThread().isInterrupted()) ) { log.logInfo("CRAWLER Interruption detected because of server shutdown."); + failreason = plasmaCrawlEURL.DENIED_SERVER_SHUTDOWN; } else if (e instanceof MalformedURLException) { log.logWarning("CRAWLER Malformed URL '" + url.toString() + "' detected. "); + failreason = plasmaCrawlEURL.DENIED_MALFORMED_URL; } else if (e instanceof NoRouteToHostException) { log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + url.toString() + "'."); + failreason = plasmaCrawlEURL.DENIED_NO_ROUTE_TO_HOST; } else if ((e instanceof UnknownHostException) || ((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) { log.logWarning("CRAWLER Unknown host in URL '" + url.toString() + "'. 
" + - "Referer URL: " + ((referer == null) ?"Unknown":referer)); + "Referer URL: " + ((refererURLString == null) ?"Unknown":refererURLString)); + failreason = plasmaCrawlEURL.DENIED_UNKNOWN_HOST; } else if (e instanceof java.net.BindException) { log.logWarning("CRAWLER BindException detected while trying to download content from '" + url.toString() + "'. Retrying request."); + failreason = plasmaCrawlEURL.DENIED_CONNECTION_BIND_EXCEPTION; retryCrawling = true; } else if ((errorMsg != null) && (errorMsg.indexOf("Corrupt GZIP trailer") >= 0)) { log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + url.toString() + "'. Retrying request without using gzip content encoding."); + failreason = plasmaCrawlEURL.DENIED_CONTENT_DECODING_ERROR; retryCrawling = true; } else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) { log.logWarning("CRAWLER Read timeout while receiving content from '" + url.toString() + "'. Retrying request."); + failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT; retryCrawling = true; } else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) { log.logWarning("CRAWLER Timeout while trying to connect to '" + url.toString() + "'. Retrying request."); + failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT; retryCrawling = true; } else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) { log.logWarning("CRAWLER Connection timeout while receiving content from '" + url.toString() + "'. 
Retrying request."); + failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT; retryCrawling = true; } else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) { log.logWarning("CRAWLER Connection refused while trying to connect to '" + url.toString() + "'."); + failreason = plasmaCrawlEURL.DENIED_CONNECTION_REFUSED; } else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) { log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + url.toString() + "'. " + "Pausing crawlers. "); plasmaCrawlLoader.switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL); plasmaCrawlLoader.switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); + failreason = plasmaCrawlEURL.DENIED_OUT_OF_DISK_SPACE; } else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >=0)) { log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + url.toString() + "'. "); + failreason = plasmaCrawlEURL.DENIED_NETWORK_IS_UNREACHABLE; } else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found")>= 0)) { - log.logSevere("CRAWLER No trusted certificate found for URL '" + url.toString() + "'. "); + log.logSevere("CRAWLER No trusted certificate found for URL '" + url.toString() + "'. 
"); + failreason = plasmaCrawlEURL.DENIED_SSL_UNTRUSTED_CERT; } else { log.logSevere("CRAWLER Unexpected Error with URL '" + url.toString() + "': " + e.toString(),e); + failreason = plasmaCrawlEURL.DENIED_CONNECTION_ERROR; } if (retryCrawling) { @@ -573,7 +584,7 @@ public final class plasmaCrawlWorker extends Thread { // retry crawling return load(url, name, - referer, + refererURLString, initiator, depth, profile, @@ -586,10 +597,43 @@ public final class plasmaCrawlWorker extends Thread { false ); } + if (failreason != null) { + addURLtoErrorDB(url, refererURLString, initiator, name, failreason, new bitfield(indexURL.urlFlagLength)); + } return null; } finally { if (remote != null) httpc.returnInstance(remote); } } + + private static void addURLtoErrorDB( + URL url, + String referrerString, + String initiator, + String name, + String failreason, + bitfield flags + ) { + // getting a reference to the plasmaSwitchboard + plasmaSwitchboard sb = plasmaCrawlLoader.switchboard; + + // convert the referrer URL into a hash value + String referrerHash = (referrerString==null)?null:indexURL.urlHash(referrerString); + + // create a new errorURL DB entry + plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry( + url, + referrerHash, + initiator, + yacyCore.seedDB.mySeed.hash, + name, + failreason, + flags + ); + // store the entry + ee.store(); + // push it onto the stack + sb.urlPool.errorURL.stackPushEntry(ee); + } } \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index c58fa3144..cf31617d1 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1362,14 +1362,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser document = parser.parseSource(entry.url(), mimeType, entry.cacheFile()); } else { log.logFine("(Parser) '" + entry.normalizedURLString() + "' cannot be parsed, no resource 
available"); + addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength)); return; } if (document == null) { log.logSevere("(Parser) '" + entry.normalizedURLString() + "' parse failure"); + addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_PARSER_ERROR, new bitfield(indexURL.urlFlagLength)); return; } } else { log.logFine("(Parser) '" + entry.normalizedURLString() + "'. Unsupported mimeType '" + ((mimeType == null) ? "null" : mimeType) + "'."); + addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT, new bitfield(indexURL.urlFlagLength)); return; } parsingEndTime = System.currentTimeMillis(); @@ -1415,7 +1418,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser referrerHash = indexURL.urlHash(referrerURL); if (referrerHash == null) referrerHash = indexURL.dummyHash; - String noIndexReason = "unspecified"; + String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR; if (processCase == 4) { // proxy-load noIndexReason = entry.shallIndexCacheForProxy(); @@ -1558,22 +1561,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } else { log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase); + addURLtoErrorDB(entry.url(), referrerHash, initiatorHash, descr, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexURL.urlFlagLength)); } } catch (Exception ee) { log.logSevere("Could not index URL " + entry.url() + ": " + ee.getMessage(), ee); if ((processCase == 6) && (initiator != null)) { yacyClient.crawlReceipt(initiator, "crawl", "exception", ee.getMessage(), null, ""); } + addURLtoErrorDB(entry.url(), referrerHash, initiatorHash, descr, 
plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexURL.urlFlagLength)); } } else { log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); - plasmaCrawlEURL.Entry ee = urlPool.errorURL.newEntry(entry.url(), referrerHash, - ((entry.proxy()) ? indexURL.dummyHash : entry.initiator()), - yacyCore.seedDB.mySeed.hash, - descr, noIndexReason, new bitfield(indexURL.urlFlagLength)); - ee.store(); - urlPool.errorURL.stackPushEntry(ee); + addURLtoErrorDB(entry.url(), referrerHash, initiatorHash, descr, noIndexReason, new bitfield(indexURL.urlFlagLength)); if ((processCase == 6) && (initiator != null)) { yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, ""); } @@ -1687,6 +1687,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return; } + // convert the referrer hash into the corresponding URL URL refererURL = null; String refererHash = urlEntry.referrerHash(); if ((refererHash != null) && (!refererHash.equals(indexURL.dummyHash))) try { @@ -2246,6 +2247,30 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } + private void addURLtoErrorDB( + URL url, + String referrerHash, + String initiator, + String name, + String failreason, + bitfield flags + ) { + // create a new errorURL DB entry + plasmaCrawlEURL.Entry ee = this.urlPool.errorURL.newEntry( + url, + referrerHash, + initiator, + yacyCore.seedDB.mySeed.hash, + name, + failreason, + flags + ); + // store the entry + ee.store(); + // push it onto the stack + this.urlPool.errorURL.stackPushEntry(ee); + } + public void terminate(long delay) { if (delay <= 0) throw new IllegalArgumentException("The shutdown delay must be greater than 0."); (new delayedShutdown(this,delay)).start(); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index 14b43f216..410154057 100644 --- 
a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -334,6 +334,10 @@ public class plasmaSwitchboardQueue { } return referrerURL; } + + public String referrerHash() { + return referrerHash; + } public String anchorName() { return anchorName;