diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 569a5b71a..62b0f1db0 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -35,6 +35,7 @@
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverFileUtils;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
+import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacyURL;
 import de.anomic.ymage.ymageImageParser;
@@ -79,7 +80,12 @@ public class ViewImage {
         // getting the image as stream
         Image scaled = iconcache.get(urlString);
         if (scaled == null) {
-            final Object[] resource = plasmaSnippetCache.getResource(url, true, timeout, false, true);
+            Object[] resource = null;
+            try {
+                resource = plasmaSnippetCache.getResource(url, true, timeout, false, true);
+            } catch (IOException e) {
+                serverLog.logWarning("ViewImage", "cannot load: " + e.getMessage());
+            }
             byte[] imgb = null;
             if (resource == null) {
                 if (urlString.endsWith(".ico")) {
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 987197f33..0e60cf1d4 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -465,7 +465,7 @@ public class CrawlQueues {
             final boolean keepInMemory,
             final boolean forText,
             final boolean global
-    ) {
+    ) throws IOException {
 
         final CrawlEntry centry = new CrawlEntry(
                 sb.webIndex.seedDB.mySeed().hash,
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index c0a5f0eed..8fea71b6e 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -380,7 +380,7 @@ public final class CrawlStacker extends Thread {
         // check if the protocol is supported
         final String urlProtocol = entry.url().getProtocol();
         if (!sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
-            reason = ErrorURL.DENIED_UNSUPPORTED_PROTOCOL;
+            reason = "unsupported protocol";
             this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
                                "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
             return reason;
@@ -396,7 +396,7 @@ public final class CrawlStacker extends Thread {
 
         // check blacklist
         if (plasmaSwitchboard.urlBlacklist.isListed(indexReferenceBlacklist.BLACKLIST_CRAWLER, entry.url())) {
-            reason = ErrorURL.DENIED_URL_IN_BLACKLIST;
+            reason = "url in blacklist";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
                             "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
             return reason;
@@ -411,8 +411,7 @@ public final class CrawlStacker extends Thread {
 
         // filter deny
         if ((entry.depth() > 0) && (!(entry.url().toString().matches(profile.generalFilter())))) {
-            reason = ErrorURL.DENIED_URL_DOES_NOT_MATCH_FILTER;
-
+            reason = "url does not match general filter";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match crawling filter '" + profile.generalFilter() + "'. " +
                             "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
             return reason;
@@ -420,7 +419,7 @@ public final class CrawlStacker extends Thread {
 
         // deny cgi
         if (entry.url().isCGI()) {
-            reason = ErrorURL.DENIED_CGI_URL;
+            reason = "cgi url not allowed";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
                             "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
" + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); @@ -429,7 +428,7 @@ public final class CrawlStacker extends Thread { // deny post properties if (entry.url().isPOST() && !(profile.crawlingQ())) { - reason = ErrorURL.DENIED_POST_URL; + reason = "post url not allowed"; if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); @@ -445,7 +444,7 @@ public final class CrawlStacker extends Thread { // deny urls that do not match with the profile domain list if (!(profile.grantedDomAppearance(entry.url().getHost()))) { - reason = ErrorURL.DENIED_NO_MATCH_WITH_DOMAIN_FILTER; + reason = "url does not match domain filter"; if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; @@ -453,7 +452,7 @@ public final class CrawlStacker extends Thread { // deny urls that exceed allowed number of occurrences if (!(profile.grantedDomCount(entry.url().getHost()))) { - reason = ErrorURL.DENIED_DOMAIN_COUNT_EXCEEDED; + reason = "domain counter exceeded"; if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+ "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; @@ -465,12 +464,12 @@ public final class CrawlStacker extends Thread { final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime()); // do double-check if ((dbocc != null) && (!recrawl)) { - reason = ErrorURL.DOUBLE_REGISTERED + dbocc + ")"; + reason = "double " + dbocc + ")"; if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; } if ((oldEntry != null) && (!recrawl)) { - reason = ErrorURL.DOUBLE_REGISTERED + "LURL)"; + reason = "double " + "LURL)"; if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; } diff --git a/source/de/anomic/crawler/ErrorURL.java b/source/de/anomic/crawler/ErrorURL.java deleted file mode 100644 index 50d00b71b..000000000 --- a/source/de/anomic/crawler/ErrorURL.java +++ /dev/null @@ -1,94 +0,0 @@ -// plasmaCrawlEURL.java -// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 09.08.2004 on http://www.anomic.de -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. 
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-package de.anomic.crawler;
-
-public class ErrorURL {
-
-    /* =======================================================================
-     * Failure reason constants
-     * ======================================================================= */
-
-    // invalid urls
-    public static final String DENIED_URL_NULL = "denied_(url_null)";
-    public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
-    public static final String DENIED_UNSUPPORTED_PROTOCOL = "denied_(unsupported_protocol)";
-    public static final String DENIED_LOOPBACK_IP_ADDRESS = "denied_(loopback_ip_address)";
-    public static final String DENIED_CACHEFILE_PATH_TOO_LONG = "denied_(cachefile_path_too_long)";
-    public static final String DENIED_INVALID_CACHEFILE_PATH = "denied_(invalid_cachefile_path)";
-
-    // blacklisted/blocked urls
-    public static final String DENIED_URL_IN_BLACKLIST = "denied_(url_in_blacklist)";
-    public static final String DENIED_URL_DOES_NOT_MATCH_FILTER = "denied_(does_not_match_filter)";
-    public static final String DENIED_CGI_URL = "denied_(cgi_url)";
-    public static final String DENIED_POST_URL = "denied_(post_url)";
-    public static final String DENIED_NO_MATCH_WITH_DOMAIN_FILTER = "denied_(no_match_with_domain_filter)";
-    public static final String DENIED_DOMAIN_COUNT_EXCEEDED = "denied_(domain_count_exceeded)";
-    public static final String DENIED_ROBOTS_TXT = "denied_(robots.txt)";
-
-    // wrong content
-    public static final String DENIED_WRONG_MIMETYPE_OR_EXT = "denied_(wrong_mimetype_or_extension)";
-    public static final String DENIED_UNSUPPORTED_CHARSET = "denied_(unsupported_charset)";
-    public static final String DENIED_REDIRECTION_HEADER_EMPTY = "denied_(redirection_header_empty)";
-    public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)";
-    public static final String DENIED_REDIRECTION_TO_DOUBLE_CONTENT = "denied_(redirection_to_double_content)";
-    public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_";
-    public static final String DENIED_CONTENT_DECODING_ERROR = "denied_(content_decoding_error)";
-    public static final String DENIED_FILESIZE_LIMIT_EXCEEDED = "denied_(filesize_limit_exceeded)";
-    public static final String DENIED_FILESIZE_UNKNOWN = "denied_(filesize_unknown)";
-
-    // network errors
-    public static final String DENIED_UNKNOWN_HOST = "denied_(unknown_host)";
-    public static final String DENIED_NO_ROUTE_TO_HOST = "denied_(no_route_to_host)";
-    public static final String DENIED_NETWORK_IS_UNREACHABLE = "denied_(Network_is_unreachable)";
-
-    // connection errors
-    public static final String DENIED_CONNECTION_ERROR = "denied_(connection_error)";
-    public static final String DENIED_CONNECTION_BIND_EXCEPTION = "denied_(connection_bind_exception)";
-    public static final String DENIED_CONNECTION_TIMEOUT = "denied_(connection_timeout)";
-    public static final String DENIED_CONNECTION_REFUSED = "denied_(connection_refused)";
-    public static final String DENIED_SSL_UNTRUSTED_CERT = "denied_(No_trusted_ssl_certificate_found)";
-
-    // double registered errors
-    public static final String DOUBLE_REGISTERED = "double_(registered_in_";
-
-    // server errors
-    public static final String DENIED_OUT_OF_DISK_SPACE = "denied_(out_of_disk_space)";
-    public static final String DENIED_SERVER_SHUTDOWN = "denied_(server_shutdown)";
= "denied_(server_shutdown)"; - public static final String DENIED_SERVER_LOGIN_FAILED = "denied_(server_login_failed)"; - public static final String DENIED_SERVER_TRASFER_MODE_PROBLEM = "denied_(server_transfermode_problem)"; - public static final String DENIED_SERVER_DOWNLOAD_ERROR = "denied_(server_download_error)"; - - // Parser errors - public static final String DENIED_PARSER_ERROR = "denied_(parser_error)"; - public static final String DENIED_DOCUMENT_ENCRYPTED = "denied_(document_encrypted)"; - public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)"; - - // indexing errors - public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)"; - public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)"; - -} diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java index 35a000d02..48c09073a 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/FTPLoader.java @@ -104,7 +104,7 @@ public class FTPLoader { if (openConnection(ftpClient, entryUrl)) { // ftp stuff - try { + //try { // testing if the specified file is a directory if (file.length() > 0) { ftpClient.exec("cd \"" + path + "\"", false); @@ -133,9 +133,12 @@ public class FTPLoader { (new PrintStream(berr)).print(e.getMessage()); } } + /* } finally { closeConnection(ftpClient); } + */ + closeConnection(ftpClient); } // pass the downloaded resource to the cache manager @@ -143,7 +146,7 @@ public class FTPLoader { // some error logging final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : ""; log.logWarning("Unable to download URL " + entry.url().toString() + detail); - sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_SERVER_DOWNLOAD_ERROR); + sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "server download" + detail); } return htCache; @@ -239,14 +242,13 @@ public class FTPLoader { htCache.setCacheArray(b); } else { log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + entry.url().toString()); - sb.crawlQueues.errorURL.newEntry(entry, this.sb.webIndex.seedDB.mySeed().hash, new Date(), 1, - ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED); + sb.crawlQueues.errorURL.newEntry(entry, this.sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "file size limit exceeded"); throw new Exception("file size exceeds limit"); } } else { // if the response has not the right file type then reject file log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + entry.url().toString()); - sb.crawlQueues.errorURL.newEntry(entry, this.sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT); + sb.crawlQueues.errorURL.newEntry(entry, this.sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension"); throw new Exception("response has not the right file type -> rejected"); } return htCache; diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 6d91ce654..e8483a1cc 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -26,10 +26,6 @@ package de.anomic.crawler; import java.io.IOException; -import java.net.MalformedURLException; -import java.net.NoRouteToHostException; -import java.net.SocketException; -import java.net.UnknownHostException; import java.util.Date; import 
@@ -37,14 +33,12 @@
 import de.anomic.http.JakartaCommonsHttpClient;
 import de.anomic.http.JakartaCommonsHttpResponse;
 import de.anomic.http.httpRequestHeader;
 import de.anomic.http.httpResponseHeader;
-import de.anomic.http.httpdLimitExceededException;
 import de.anomic.http.httpdProxyCacheEntry;
 import de.anomic.index.indexDocumentMetadata;
 import de.anomic.index.indexReferenceBlacklist;
 import de.anomic.plasma.plasmaHTCache;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaSwitchboardConstants;
 import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacyURL;
@@ -105,16 +99,15 @@ public final class HTTPLoader {
         return metadata;
     }
 
-    public indexDocumentMetadata load(final CrawlEntry entry, final String parserMode) {
+    public indexDocumentMetadata load(final CrawlEntry entry, final String parserMode) throws IOException {
         return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT);
     }
 
-    private indexDocumentMetadata load(final CrawlEntry entry, final String parserMode, final int retryCount) {
+    private indexDocumentMetadata load(final CrawlEntry entry, final String parserMode, final int retryCount) throws IOException {
 
         if (retryCount < 0) {
-            this.log.logInfo("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted.");
-            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_COUNTER_EXCEEDED).store();
-            return null;
+            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
+            throw new IOException("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted.");
         }
 
         final Date requestDate = new Date(); // remember the time...
@@ -127,15 +120,14 @@
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (plasmaSwitchboard.urlBlacklist.isListed(indexReferenceBlacklist.BLACKLIST_CRAWLER, hostlow, path)) {
-            this.log.logInfo("CRAWLER Rejecting URL '" + entry.url().toString() + "'. URL is in blacklist.");
-            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_URL_IN_BLACKLIST).store();
-            return null;
+            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "url in blacklist").store();
+            throw new IOException("CRAWLER Rejecting URL '" + entry.url().toString() + "'. URL is in blacklist.");
         }
 
         // take a file from the net
         indexDocumentMetadata htCache = null;
         final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE);
-        try {
+        //try {
             // create a request header
             final httpRequestHeader requestHeader = new httpRequestHeader();
             requestHeader.put(httpRequestHeader.USER_AGENT, crawlerUserAgent);
@@ -150,7 +142,7 @@
 
         final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(socketTimeout, requestHeader);
         JakartaCommonsHttpResponse res = null;
-        try {
+        //try {
             // send request
             res = client.GET(entry.url().toString());
 
@@ -161,15 +153,14 @@
                 htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine());
 
                 // request has been placed and result has been returned. work off response
-                try {
+                //try {
                     if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {
 
                         // get the content length and check if the length is allowed
                         long contentLength = res.getResponseHeader().getContentLength();
                         if (maxFileSize >= 0 && contentLength > maxFileSize) {
-                            this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
-                            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
-                            return null;
+                            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "file size limit exceeded");
+                            throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
                         }
 
                         // we write the new cache entry to file system directly
@@ -179,19 +170,19 @@
 
                         // check length again in case it was not possible to get the length before loading
                         if (maxFileSize >= 0 && contentLength > maxFileSize) {
-                            this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
-                            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
-                            return null;
+                            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "file size limit exceeded");
+                            throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
                         }
 
                         htCache.setCacheArray(responseBody);
                     } else {
                         // if the response has not the right file type then reject file
                         this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
-                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
+                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
                         htCache = null;
                     }
                     return htCache;
+                    /*
                 } catch (final SocketException e) {
                     // this may happen if the client suddenly closes its connection
                     // maybe the user has stopped loading
@@ -201,7 +192,7 @@
                     this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
                     sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
                     htCache = null;
-                }
+                }*/
             } else if (res.getStatusLine().startsWith("30")) {
                 if (res.getResponseHeader().containsKey(httpRequestHeader.LOCATION)) {
                     // getting redirection URL
@@ -209,9 +200,8 @@
                     redirectionUrlString = redirectionUrlString.trim();
 
                     if (redirectionUrlString.length() == 0) {
-                        this.log.logWarning("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
-                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_HEADER_EMPTY);
-                        return null;
+                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "redirection header empty");
+                        throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
Location header is empty."); } // normalizing URL @@ -223,9 +213,8 @@ public final class HTTPLoader { // if we are already doing a shutdown we don't need to retry crawling if (Thread.currentThread().isInterrupted()) { - this.log.logSevere("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown."); - sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_SERVER_SHUTDOWN); - return null; + sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "server shutdown"); + throw new IOException("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown."); } // generating url hash @@ -234,9 +223,8 @@ public final class HTTPLoader { // check if the url was already indexed final String dbname = sb.urlExists(urlhash); if (dbname != null) { - this.log.logWarning("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname); - sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_TO_DOUBLE_CONTENT); - return null; + sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "redirection to double content"); + throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname); } // retry crawling with new url @@ -248,16 +236,17 @@ public final class HTTPLoader { this.log.logInfo("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString()); // not processed any further - sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_HTTP_STATUSCODE + res.getStatusCode() + ")"); + sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")"); } - + /* } finally { if(res != null) { // release connection res.closeStream(); } - } + }*/ return htCache; + /* } catch (final Exception e) { final String errorMsg = e.getMessage(); String failreason = null; @@ -340,7 +329,7 @@ public final class HTTPLoader { sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, failreason); } return null; - } + }*/ } } diff --git a/source/de/anomic/crawler/ProtocolLoader.java b/source/de/anomic/crawler/ProtocolLoader.java index 5be6efdfa..ff142ab1d 100644 --- a/source/de/anomic/crawler/ProtocolLoader.java +++ b/source/de/anomic/crawler/ProtocolLoader.java @@ -26,6 +26,7 @@ package de.anomic.crawler; +import java.io.IOException; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; @@ -68,14 +69,14 @@ public final class ProtocolLoader { return (HashSet) this.supportedProtocols.clone(); } - public indexDocumentMetadata load(final CrawlEntry entry, final String parserMode) { - // getting the protocol of the next URL + public indexDocumentMetadata load(final CrawlEntry entry, final String parserMode) throws IOException { + // getting the protocol of the next URL final String protocol = entry.url().getProtocol(); final String host = entry.url().getHost(); // check if this loads a page from localhost, which must be prevented to protect the server // against attacks to the administration interface when localhost access is granted - if (serverCore.isLocalhost(host) && sb.getConfigBool("adminAccountForLocalhost", false)) return null; + if (serverCore.isLocalhost(host) && 
sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + entry.url()); // check access time if (!entry.url().isLocal()) { @@ -102,8 +103,7 @@ public final class ProtocolLoader { if ((protocol.equals("http") || (protocol.equals("https")))) return httpLoader.load(entry, parserMode); if (protocol.equals("ftp")) return ftpLoader.load(entry); - this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + entry.url()); - return null; + throw new IOException("Unsupported protocol '" + protocol + "' in url " + entry.url()); } public String process(final CrawlEntry entry, final String parserMode) { @@ -112,13 +112,14 @@ public final class ProtocolLoader { indexDocumentMetadata h; try { h = load(entry, parserMode); + assert h != null; entry.setStatus("loaded"); - if (h == null) return "load failed"; final boolean stored = sb.htEntryStoreProcess(h); entry.setStatus("stored-" + ((stored) ? "ok" : "fail")); return (stored) ? null : "not stored"; - } catch (final Exception e) { - log.logWarning("problem loading " + entry.url().toString(), e); + } catch (IOException e) { + entry.setStatus("error"); + log.logWarning("problem loading " + entry.url().toString()); return "load error - " + e.getMessage(); } } diff --git a/source/de/anomic/plasma/parser/ParserException.java b/source/de/anomic/plasma/parser/ParserException.java index c2ed6e4ef..e2cd81dd0 100644 --- a/source/de/anomic/plasma/parser/ParserException.java +++ b/source/de/anomic/plasma/parser/ParserException.java @@ -24,7 +24,6 @@ package de.anomic.plasma.parser; -import de.anomic.crawler.ErrorURL; import de.anomic.yacy.yacyURL; public class ParserException extends Exception @@ -39,7 +38,7 @@ public class ParserException extends Exception } public ParserException(final String message, final yacyURL url) { - this(message,url,ErrorURL.DENIED_PARSER_ERROR); + this(message,url, "parser error for url " + url.toString()); } public ParserException(final String message, final yacyURL url, final String errorCode) { @@ -49,7 +48,7 @@ public class ParserException extends Exception } public ParserException(final String message, final yacyURL url, final Throwable cause) { - this(message,url,cause,ErrorURL.DENIED_PARSER_ERROR); + this(message,url,cause, "parser error for url " + url.toString()); } public ParserException(final String message, final yacyURL url, final Throwable cause, final String errorCode) { diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 7996231eb..04c3ffeac 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -39,7 +39,6 @@ import org.pdfbox.pdmodel.encryption.AccessPermission; import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.pdfbox.util.PDFTextStripper; -import de.anomic.crawler.ErrorURL; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; @@ -107,7 +106,7 @@ public class pdfParser extends AbstractParser implements Parser { theDocument.openProtection(new StandardDecryptionMaterial("")); final AccessPermission perm = theDocument.getCurrentAccessPermission(); if (perm == null || !perm.canExtractContent()) - throw new ParserException("Document is encrypted",location,ErrorURL.DENIED_DOCUMENT_ENCRYPTED); + throw new ParserException("Document is encrypted",location, "document is exncrypted"); } // extracting some metadata diff --git 
index 101aa1267..b4883828e 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -52,7 +52,6 @@ import java.util.Map.Entry;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import de.anomic.crawler.ErrorURL;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.htmlFilter.htmlFilterInputStream;
@@ -546,7 +545,7 @@ public final class plasmaParser {
         if (sourceArray == null || sourceArray.length == 0) {
             final String errorMsg = "No resource content available (1).";
             this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
-            throw new ParserException(errorMsg,location,ErrorURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
+            throw new ParserException(errorMsg,location, "document has no content");
         }
 
         // creating an InputStream
@@ -580,7 +579,7 @@ public final class plasmaParser {
         if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
             final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
             this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
-            throw new ParserException(errorMsg,location,ErrorURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
+            throw new ParserException(errorMsg,location, "document has no content");
         }
 
         // create a new InputStream
@@ -634,7 +633,7 @@ public final class plasmaParser {
             if (!plasmaParser.supportedContent(location,mimeType)) {
                 final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
                 this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
-                throw new ParserException(errorMsg,location,ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
+                throw new ParserException(errorMsg,location, "wrong mime type or wrong extension");
             }
 
             if (this.theLogger.isFine())
@@ -656,7 +655,7 @@ public final class plasmaParser {
             } else {
                 final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
                 this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
-                throw new ParserException(errorMsg,location,ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
+                throw new ParserException(errorMsg,location, "wrong mime type or wrong extension");
             }
 
             // check result
@@ -668,9 +667,9 @@ public final class plasmaParser {
 
             return doc;
         } catch (final UnsupportedEncodingException e) {
-            final String errorMsg = "Unsupported charset encoding: " + e.getMessage();
+            final String errorMsg = "unsupported charset encoding: " + e.getMessage();
             this.theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
" + errorMsg, e); - throw new ParserException(errorMsg,location,ErrorURL.DENIED_UNSUPPORTED_CHARSET); + throw new ParserException(errorMsg,location, errorMsg); } catch (final Exception e) { // Interrupted- and Parser-Exceptions should pass through if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index e937c4295..656674888 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -26,7 +26,6 @@ package de.anomic.plasma; -import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index 5fbe55b64..4cad58c5b 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -22,6 +22,7 @@ package de.anomic.plasma; +import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.util.HashMap; @@ -31,6 +32,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.plasma.parser.ParserException; import de.anomic.server.serverDate; +import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyURL; public final class plasmaSearchImages { @@ -41,7 +43,13 @@ public final class plasmaSearchImages { final long start = System.currentTimeMillis(); this.images = new HashMap(); if (maxTime > 10) { - final Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false, indexing); + Object[] resource = null; + try { + resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false, indexing); + } catch (IOException e) { + serverLog.logWarning("ViewImage", "cannot load: " + e.getMessage()); + } + if (resource == null) return; final InputStream res = (InputStream) resource[0]; final Long resLength = (Long) resource[1]; if (res != null) { @@ -51,6 +59,7 @@ public final class plasmaSearchImages { document = plasmaSnippetCache.parseDocument(url, resLength.longValue(), res); } catch (final ParserException e) { // parsing failed + serverLog.logWarning("ViewImage", "cannot parse: " + e.getMessage()); } finally { try { res.close(); } catch (final Exception e) {/* ignore this */} } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index f7f37e3d1..2144a6e22 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -25,6 +25,7 @@ package de.anomic.plasma; import java.io.ByteArrayInputStream; +import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Enumeration; @@ -906,8 +907,9 @@ public class plasmaSnippetCache { * [0]the content as {@link InputStream} * [1]the content-length as {@link Integer} * + * @throws IOException */ - public static Object[] getResource(final yacyURL url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) { + public static Object[] getResource(final yacyURL url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { // load the url as resource from the web long contentLength = -1; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java 
index f53c6ccf5..c984097aa 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -112,7 +112,6 @@ import de.anomic.crawler.CrawlEntry;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.CrawlStacker;
-import de.anomic.crawler.ErrorURL;
 import de.anomic.crawler.HTTPLoader;
 import de.anomic.crawler.ImporterManager;
 import de.anomic.crawler.IndexingStack;
@@ -1229,7 +1228,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch