Different handling of error cases that occur when loading files via http or ftp:

methods now throw an exception instead of returning an error string

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5328 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent 538359a0ff
commit 674ad2d55b
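In short, callers of the loader methods move from a null/error-string check to try/catch. A minimal sketch of the new calling pattern, modeled on the ProtocolLoader.process() hunk further down (the identifiers loader, entry, sb and log are taken from the surrounding diff, not introduced here; this snippet is illustration only and not part of the commit):

    indexDocumentMetadata h;
    try {
        h = loader.load(entry, parserMode);                  // may now throw IOException
        entry.setStatus("loaded");
        final boolean stored = sb.htEntryStoreProcess(h);
        entry.setStatus("stored-" + (stored ? "ok" : "fail"));
    } catch (final IOException e) {
        entry.setStatus("error");                            // previously: load() returned null or an error string
        log.logWarning("problem loading " + entry.url().toString());
    }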

@ -35,6 +35,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
import de.anomic.ymage.ymageImageParser;
@ -79,7 +80,12 @@ public class ViewImage {
// getting the image as stream
Image scaled = iconcache.get(urlString);
if (scaled == null) {
final Object[] resource = plasmaSnippetCache.getResource(url, true, timeout, false, true);
Object[] resource = null;
try {
resource = plasmaSnippetCache.getResource(url, true, timeout, false, true);
} catch (IOException e) {
serverLog.logWarning("ViewImage", "cannot load: " + e.getMessage());
}
byte[] imgb = null;
if (resource == null) {
if (urlString.endsWith(".ico")) {

@ -465,7 +465,7 @@ public class CrawlQueues {
final boolean keepInMemory,
final boolean forText,
final boolean global
) {
) throws IOException {
final CrawlEntry centry = new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,

@ -380,7 +380,7 @@ public final class CrawlStacker extends Thread {
// check if the protocol is supported
final String urlProtocol = entry.url().getProtocol();
if (!sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
reason = ErrorURL.DENIED_UNSUPPORTED_PROTOCOL;
reason = "unsupported protocol";
this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
@ -396,7 +396,7 @@ public final class CrawlStacker extends Thread {
// check blacklist
if (plasmaSwitchboard.urlBlacklist.isListed(indexReferenceBlacklist.BLACKLIST_CRAWLER, entry.url())) {
reason = ErrorURL.DENIED_URL_IN_BLACKLIST;
reason = "url in blacklist";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
@ -411,8 +411,7 @@ public final class CrawlStacker extends Thread {
// filter deny
if ((entry.depth() > 0) && (!(entry.url().toString().matches(profile.generalFilter())))) {
reason = ErrorURL.DENIED_URL_DOES_NOT_MATCH_FILTER;
reason = "url does not match general filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match crawling filter '" + profile.generalFilter() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
@ -420,7 +419,7 @@ public final class CrawlStacker extends Thread {
// deny cgi
if (entry.url().isCGI()) {
reason = ErrorURL.DENIED_CGI_URL;
reason = "cgi url not allowed";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
@ -429,7 +428,7 @@ public final class CrawlStacker extends Thread {
// deny post properties
if (entry.url().isPOST() && !(profile.crawlingQ())) {
reason = ErrorURL.DENIED_POST_URL;
reason = "post url not allowed";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
@ -445,7 +444,7 @@ public final class CrawlStacker extends Thread {
// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
reason = ErrorURL.DENIED_NO_MATCH_WITH_DOMAIN_FILTER;
reason = "url does not match domain filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
@ -453,7 +452,7 @@ public final class CrawlStacker extends Thread {
// deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(entry.url().getHost()))) {
reason = ErrorURL.DENIED_DOMAIN_COUNT_EXCEEDED;
reason = "domain counter exceeded";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
@ -465,12 +464,12 @@ public final class CrawlStacker extends Thread {
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {
reason = ErrorURL.DOUBLE_REGISTERED + dbocc + ")";
reason = "double " + dbocc + ")";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
if ((oldEntry != null) && (!recrawl)) {
reason = ErrorURL.DOUBLE_REGISTERED + "LURL)";
reason = "double " + "LURL)";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}

@ -1,94 +0,0 @@
// plasmaCrawlEURL.java
// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.08.2004 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
public class ErrorURL {
/* =======================================================================
* Failure reason constants
* ======================================================================= */
// invalid urls
public static final String DENIED_URL_NULL = "denied_(url_null)";
public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
public static final String DENIED_UNSUPPORTED_PROTOCOL = "denied_(unsupported_protocol)";
public static final String DENIED_LOOPBACK_IP_ADDRESS = "denied_(loopback_ip_address)";
public static final String DENIED_CACHEFILE_PATH_TOO_LONG = "denied_(cachefile_path_too_long)";
public static final String DENIED_INVALID_CACHEFILE_PATH = "denied_(invalid_cachefile_path)";
// blacklisted/blocked urls
public static final String DENIED_URL_IN_BLACKLIST = "denied_(url_in_blacklist)";
public static final String DENIED_URL_DOES_NOT_MATCH_FILTER = "denied_(does_not_match_filter)";
public static final String DENIED_CGI_URL = "denied_(cgi_url)";
public static final String DENIED_POST_URL = "denied_(post_url)";
public static final String DENIED_NO_MATCH_WITH_DOMAIN_FILTER = "denied_(no_match_with_domain_filter)";
public static final String DENIED_DOMAIN_COUNT_EXCEEDED = "denied_(domain_count_exceeded)";
public static final String DENIED_ROBOTS_TXT = "denied_(robots.txt)";
// wrong content
public static final String DENIED_WRONG_MIMETYPE_OR_EXT = "denied_(wrong_mimetype_or_extension)";
public static final String DENIED_UNSUPPORTED_CHARSET = "denied_(unsupported_charset)";
public static final String DENIED_REDIRECTION_HEADER_EMPTY = "denied_(redirection_header_empty)";
public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)";
public static final String DENIED_REDIRECTION_TO_DOUBLE_CONTENT = "denied_(redirection_to_double_content)";
public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_";
public static final String DENIED_CONTENT_DECODING_ERROR = "denied_(content_decoding_error)";
public static final String DENIED_FILESIZE_LIMIT_EXCEEDED = "denied_(filesize_limit_exceeded)";
public static final String DENIED_FILESIZE_UNKNOWN = "denied_(filesize_unknown)";
// network errors
public static final String DENIED_UNKNOWN_HOST = "denied_(unknown_host)";
public static final String DENIED_NO_ROUTE_TO_HOST = "denied_(no_route_to_host)";
public static final String DENIED_NETWORK_IS_UNREACHABLE = "denied_(Network_is_unreachable)";
// connection errors
public static final String DENIED_CONNECTION_ERROR = "denied_(connection_error)";
public static final String DENIED_CONNECTION_BIND_EXCEPTION = "denied_(connection_bind_exception)";
public static final String DENIED_CONNECTION_TIMEOUT = "denied_(connection_timeout)";
public static final String DENIED_CONNECTION_REFUSED = "denied_(connection_refused)";
public static final String DENIED_SSL_UNTRUSTED_CERT = "denied_(No_trusted_ssl_certificate_found)";
// double registered errors
public static final String DOUBLE_REGISTERED = "double_(registered_in_";
// server errors
public static final String DENIED_OUT_OF_DISK_SPACE = "denied_(out_of_disk_space)";
public static final String DENIED_SERVER_SHUTDOWN = "denied_(server_shutdown)";
public static final String DENIED_SERVER_LOGIN_FAILED = "denied_(server_login_failed)";
public static final String DENIED_SERVER_TRASFER_MODE_PROBLEM = "denied_(server_transfermode_problem)";
public static final String DENIED_SERVER_DOWNLOAD_ERROR = "denied_(server_download_error)";
// Parser errors
public static final String DENIED_PARSER_ERROR = "denied_(parser_error)";
public static final String DENIED_DOCUMENT_ENCRYPTED = "denied_(document_encrypted)";
public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)";
// indexing errors
public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)";
public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)";
}

@ -104,7 +104,7 @@ public class FTPLoader {
if (openConnection(ftpClient, entryUrl)) {
// ftp stuff
try {
//try {
// testing if the specified file is a directory
if (file.length() > 0) {
ftpClient.exec("cd \"" + path + "\"", false);
@ -133,9 +133,12 @@ public class FTPLoader {
(new PrintStream(berr)).print(e.getMessage());
}
}
/*
} finally {
closeConnection(ftpClient);
}
*/
closeConnection(ftpClient);
}
// pass the downloaded resource to the cache manager
@ -143,7 +146,7 @@ public class FTPLoader {
// some error logging
final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : "";
log.logWarning("Unable to download URL " + entry.url().toString() + detail);
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_SERVER_DOWNLOAD_ERROR);
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "server download" + detail);
}
return htCache;
@ -239,14 +242,13 @@ public class FTPLoader {
htCache.setCacheArray(b);
} else {
log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.webIndex.seedDB.mySeed().hash, new Date(), 1,
ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
sb.crawlQueues.errorURL.newEntry(entry, this.sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new Exception("file size exceeds limit");
}
} else {
// if the response has not the right file type then reject file
log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
sb.crawlQueues.errorURL.newEntry(entry, this.sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
throw new Exception("response has not the right file type -> rejected");
}
return htCache;

@ -26,10 +26,6 @@
package de.anomic.crawler;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.NoRouteToHostException;
import java.net.SocketException;
import java.net.UnknownHostException;
import java.util.Date;
import de.anomic.http.HttpClient;
@ -37,14 +33,12 @@ import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpdLimitExceededException;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.index.indexDocumentMetadata;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
@ -105,16 +99,15 @@ public final class HTTPLoader {
return metadata;
}
public indexDocumentMetadata load(final CrawlEntry entry, final String parserMode) {
public indexDocumentMetadata load(final CrawlEntry entry, final String parserMode) throws IOException {
return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT);
}
private indexDocumentMetadata load(final CrawlEntry entry, final String parserMode, final int retryCount) {
private indexDocumentMetadata load(final CrawlEntry entry, final String parserMode, final int retryCount) throws IOException {
if (retryCount < 0) {
this.log.logInfo("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted.");
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_COUNTER_EXCEEDED).store();
return null;
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
throw new IOException("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted.");
}
final Date requestDate = new Date(); // remember the time...
@ -127,15 +120,14 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (plasmaSwitchboard.urlBlacklist.isListed(indexReferenceBlacklist.BLACKLIST_CRAWLER, hostlow, path)) {
this.log.logInfo("CRAWLER Rejecting URL '" + entry.url().toString() + "'. URL is in blacklist.");
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_URL_IN_BLACKLIST).store();
return null;
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "url in blacklist").store();
throw new IOException("CRAWLER Rejecting URL '" + entry.url().toString() + "'. URL is in blacklist.");
}
// take a file from the net
indexDocumentMetadata htCache = null;
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE);
try {
//try {
// create a request header
final httpRequestHeader requestHeader = new httpRequestHeader();
requestHeader.put(httpRequestHeader.USER_AGENT, crawlerUserAgent);
@ -150,7 +142,7 @@ public final class HTTPLoader {
final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(socketTimeout, requestHeader);
JakartaCommonsHttpResponse res = null;
try {
//try {
// send request
res = client.GET(entry.url().toString());
@ -161,15 +153,14 @@ public final class HTTPLoader {
htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine());
// request has been placed and result has been returned. work off response
try {
//try {
if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {
// get the content length and check if the length is allowed
long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize >= 0 && contentLength > maxFileSize) {
this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
return null;
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
}
// we write the new cache entry to file system directly
@ -179,19 +170,19 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
return null;
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
}
htCache.setCacheArray(responseBody);
} else {
// if the response has not the right file type then reject file
this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
htCache = null;
}
return htCache;
/*
} catch (final SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
@ -201,7 +192,7 @@ public final class HTTPLoader {
this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
htCache = null;
}
}*/
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(httpRequestHeader.LOCATION)) {
// getting redirection URL
@ -209,9 +200,8 @@ public final class HTTPLoader {
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
this.log.logWarning("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_HEADER_EMPTY);
return null;
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "redirection header empty").store();
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
}
// normalizing URL
@ -223,9 +213,8 @@ public final class HTTPLoader {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.log.logSevere("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_SERVER_SHUTDOWN);
return null;
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "server shutdown");
throw new IOException("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
}
// generating url hash
@ -234,9 +223,8 @@ public final class HTTPLoader {
// check if the url was already indexed
final String dbname = sb.urlExists(urlhash);
if (dbname != null) {
this.log.logWarning("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_TO_DOUBLE_CONTENT);
return null;
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "redirection to double content");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
}
// retry crawling with new url
@ -248,16 +236,17 @@ public final class HTTPLoader {
this.log.logInfo("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
// not processed any further
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_HTTP_STATUSCODE + res.getStatusCode() + ")");
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
}
/*
} finally {
if(res != null) {
// release connection
res.closeStream();
}
}
}*/
return htCache;
/*
} catch (final Exception e) {
final String errorMsg = e.getMessage();
String failreason = null;
@ -340,7 +329,7 @@ public final class HTTPLoader {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, failreason);
}
return null;
}
}*/
}
}

@ -26,6 +26,7 @@
package de.anomic.crawler;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
@ -68,14 +69,14 @@ public final class ProtocolLoader {
return (HashSet<String>) this.supportedProtocols.clone();
}
public indexDocumentMetadata load(final CrawlEntry entry, final String parserMode) {
// getting the protocol of the next URL
public indexDocumentMetadata load(final CrawlEntry entry, final String parserMode) throws IOException {
// getting the protocol of the next URL
final String protocol = entry.url().getProtocol();
final String host = entry.url().getHost();
// check if this loads a page from localhost, which must be prevented to protect the server
// against attacks to the administration interface when localhost access is granted
if (serverCore.isLocalhost(host) && sb.getConfigBool("adminAccountForLocalhost", false)) return null;
if (serverCore.isLocalhost(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + entry.url());
// check access time
if (!entry.url().isLocal()) {
@ -102,8 +103,7 @@ public final class ProtocolLoader {
if ((protocol.equals("http") || (protocol.equals("https")))) return httpLoader.load(entry, parserMode);
if (protocol.equals("ftp")) return ftpLoader.load(entry);
this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + entry.url());
return null;
throw new IOException("Unsupported protocol '" + protocol + "' in url " + entry.url());
}
public String process(final CrawlEntry entry, final String parserMode) {
@ -112,13 +112,14 @@ public final class ProtocolLoader {
indexDocumentMetadata h;
try {
h = load(entry, parserMode);
assert h != null;
entry.setStatus("loaded");
if (h == null) return "load failed";
final boolean stored = sb.htEntryStoreProcess(h);
entry.setStatus("stored-" + ((stored) ? "ok" : "fail"));
return (stored) ? null : "not stored";
} catch (final Exception e) {
log.logWarning("problem loading " + entry.url().toString(), e);
} catch (IOException e) {
entry.setStatus("error");
log.logWarning("problem loading " + entry.url().toString());
return "load error - " + e.getMessage();
}
}

@ -24,7 +24,6 @@
package de.anomic.plasma.parser;
import de.anomic.crawler.ErrorURL;
import de.anomic.yacy.yacyURL;
public class ParserException extends Exception
@ -39,7 +38,7 @@ public class ParserException extends Exception
}
public ParserException(final String message, final yacyURL url) {
this(message,url,ErrorURL.DENIED_PARSER_ERROR);
this(message,url, "parser error for url " + url.toString());
}
public ParserException(final String message, final yacyURL url, final String errorCode) {
@ -49,7 +48,7 @@ public class ParserException extends Exception
}
public ParserException(final String message, final yacyURL url, final Throwable cause) {
this(message,url,cause,ErrorURL.DENIED_PARSER_ERROR);
this(message,url,cause, "parser error for url " + url.toString());
}
public ParserException(final String message, final yacyURL url, final Throwable cause, final String errorCode) {

@ -39,7 +39,6 @@ import org.pdfbox.pdmodel.encryption.AccessPermission;
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFTextStripper;
import de.anomic.crawler.ErrorURL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
@ -107,7 +106,7 @@ public class pdfParser extends AbstractParser implements Parser {
theDocument.openProtection(new StandardDecryptionMaterial(""));
final AccessPermission perm = theDocument.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent())
throw new ParserException("Document is encrypted",location,ErrorURL.DENIED_DOCUMENT_ENCRYPTED);
throw new ParserException("Document is encrypted",location, "document is encrypted");
}
// extracting some metadata

@ -52,7 +52,6 @@ import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.crawler.ErrorURL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.htmlFilter.htmlFilterInputStream;
@ -546,7 +545,7 @@ public final class plasmaParser {
if (sourceArray == null || sourceArray.length == 0) {
final String errorMsg = "No resource content available (1).";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,ErrorURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
throw new ParserException(errorMsg,location, "document has no content");
}
// creating an InputStream
@ -580,7 +579,7 @@ public final class plasmaParser {
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,ErrorURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
throw new ParserException(errorMsg,location, "document has no content");
}
// create a new InputStream
@ -634,7 +633,7 @@ public final class plasmaParser {
if (!plasmaParser.supportedContent(location,mimeType)) {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
throw new ParserException(errorMsg,location, "wrong mime type or wrong extension");
}
if (this.theLogger.isFine())
@ -656,7 +655,7 @@ public final class plasmaParser {
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
throw new ParserException(errorMsg,location, "wrong mime type or wrong extension");
}
// check result
@ -668,9 +667,9 @@ public final class plasmaParser {
return doc;
} catch (final UnsupportedEncodingException e) {
final String errorMsg = "Unsupported charset encoding: " + e.getMessage();
final String errorMsg = "unsupported charset encoding: " + e.getMessage();
this.theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
throw new ParserException(errorMsg,location,ErrorURL.DENIED_UNSUPPORTED_CHARSET);
throw new ParserException(errorMsg,location, errorMsg);
} catch (final Exception e) {
// Interrupted- and Parser-Exceptions should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -26,7 +26,6 @@
package de.anomic.plasma;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;

@ -22,6 +22,7 @@
package de.anomic.plasma;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.HashMap;
@ -31,6 +32,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverDate;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
public final class plasmaSearchImages {
@ -41,7 +43,13 @@ public final class plasmaSearchImages {
final long start = System.currentTimeMillis();
this.images = new HashMap<String, htmlFilterImageEntry>();
if (maxTime > 10) {
final Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false, indexing);
Object[] resource = null;
try {
resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false, indexing);
} catch (IOException e) {
serverLog.logWarning("plasmaSearchImages", "cannot load: " + e.getMessage());
}
if (resource == null) return;
final InputStream res = (InputStream) resource[0];
final Long resLength = (Long) resource[1];
if (res != null) {
@ -51,6 +59,7 @@ public final class plasmaSearchImages {
document = plasmaSnippetCache.parseDocument(url, resLength.longValue(), res);
} catch (final ParserException e) {
// parsing failed
serverLog.logWarning("plasmaSearchImages", "cannot parse: " + e.getMessage());
} finally {
try { res.close(); } catch (final Exception e) {/* ignore this */}
}

@ -25,6 +25,7 @@
package de.anomic.plasma;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
@ -906,8 +907,9 @@ public class plasmaSnippetCache {
* <tr><td>[0]</td><td>the content as {@link InputStream}</td></tr>
* <tr><td>[1]</td><td>the content-length as {@link Integer}</td></tr>
* </table>
* @throws IOException
*/
public static Object[] getResource(final yacyURL url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) {
public static Object[] getResource(final yacyURL url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
// load the url as resource from the web
long contentLength = -1;
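
A hedged usage sketch of the changed getResource() contract, mirroring the ViewImage and plasmaSearchImages hunks above (url and timeout are placeholders; not part of the commit): the caller now catches the IOException and, on success, unpacks the Object[] into the content stream and its length.

    Object[] resource = null;
    try {
        resource = plasmaSnippetCache.getResource(url, true, timeout, false, false);
    } catch (final IOException e) {
        // load failed; the method no longer returns an error string
    }
    if (resource != null) {
        final InputStream res = (InputStream) resource[0];   // [0] content as InputStream
        final Long resLength = (Long) resource[1];           // [1] content-length
        try {
            // ... parse or otherwise consume the stream here
        } finally {
            try { res.close(); } catch (final Exception e) {/* ignore this */}
        }
    }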

@ -112,7 +112,6 @@ import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.ErrorURL;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.ImporterManager;
import de.anomic.crawler.IndexingStack;
@ -1229,7 +1228,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
}
// check if the document should be indexed
String noIndexReason = ErrorURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
String noIndexReason = "unspecified indexing error";
if (queueEntry.processCase() == plasmaSwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) {
// proxy-load
noIndexReason = queueEntry.shallIndexCacheForProxy();
@ -1685,7 +1684,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// STORE WORD INDEX
if ((!queueEntry.profile().indexText()) && (!queueEntry.profile().indexMedia())) {
if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, ErrorURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case " + processCase);
return;
}
@ -1768,7 +1767,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
InputStream resourceContent = null;
try {
// get the resource content
final Object[] resource = plasmaSnippetCache.getResource(comp.url(), fetchOnline, 10000, true, false);
Object[] resource = null;
try {
resource = plasmaSnippetCache.getResource(comp.url(), fetchOnline, 10000, true, false);
} catch (IOException e) {
serverLog.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
}
if (resource == null) {
// delete just the url entry
webIndex.removeURL(urlhash);

@ -38,6 +38,7 @@ import javax.imageio.ImageIO;
import de.anomic.index.indexDocumentMetadata;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
public class ymageOSM {
@ -79,7 +80,13 @@ public class ymageOSM {
InputStream tileStream = plasmaHTCache.getResourceContentStream(tileURL);
if (tileStream == null) {
// download resource using the crawler and keep resource in memory if possible
final indexDocumentMetadata entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(tileURL, 20000, true, false, false);
indexDocumentMetadata entry = null;
try {
entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(tileURL, 20000, true, false, false);
} catch (IOException e) {
serverLog.logWarning("ymageOSM", "cannot load: " + e.getMessage());
return null;
}
if ((entry == null) || (entry.cacheArray() == null)) return null;
tileStream = new ByteArrayInputStream(entry.cacheArray());
}
