From 1d83d7e4d73b530868a5aeb1af6971d067da1fb0 Mon Sep 17 00:00:00 2001 From: theli Date: Thu, 4 Aug 2005 11:05:04 +0000 Subject: [PATCH] *) httpdFileHandler.java: no stacktrace will be printed into log file for "Connection timed out" Errors now See: http://www.yacy-forum.de/viewtopic.php?p=6381 *) plasmaCrawlWorker.java: If a "Read timed out" error occurs while crawling a site, the failed crawl will be retried. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@493 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/http/httpdFileHandler.java | 18 ++++-- .../de/anomic/plasma/plasmaCrawlWorker.java | 63 ++++++++++++------- 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index ceb349c1d..a6c6d238d 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -505,19 +505,24 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http StringBuffer errorMessage = new StringBuffer(); Exception errorExc = null; - if (e instanceof InterruptedException) { + String errorMsg = e.getMessage(); + if ( + (e instanceof InterruptedException) || + ((errorMsg != null) && (errorMsg.startsWith("Socket closed")) && (Thread.currentThread().isInterrupted())) + ) { errorMessage.append("Interruption detected while processing query."); httpStatusCode = 503; } else { - String errorMsg = e.getMessage(); if ((errorMsg != null) && ( errorMsg.startsWith("Broken pipe") || errorMsg.startsWith("Connection reset") || - errorMsg.startsWith("Software caused connection abort") + errorMsg.startsWith("Software caused connection abort") )) { // client closed the connection, so we just end silently errorMessage.append("Client unexpectedly closed connection while processing query."); + } else if ((errorMsg != null) && (errorMsg.startsWith("Connection timed out"))) { + errorMessage.append("Connection timed out."); } else { errorMessage.append("Unexpected error while processing query."); httpStatusCode = 500; @@ -525,9 +530,10 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http } } - errorMessage.append("\nQuery: ").append(path) - .append("\nClient: ").append(conProp.getProperty(httpd.CONNECTION_PROP_CLIENTIP,"unknown")) - .append("\nReason: ").append(e.toString()); + errorMessage.append("\nSession: ").append(Thread.currentThread().getName()) + .append("\nQuery: ").append(path) + .append("\nClient: ").append(conProp.getProperty(httpd.CONNECTION_PROP_CLIENTIP,"unknown")) + .append("\nReason: ").append(e.toString()); if (!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { // sending back an error message to the client diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index 8f3adfff9..110d60465 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -60,7 +60,8 @@ import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverMiniLogFormatter; public final class plasmaCrawlWorker extends Thread { - + + private static final int DEFAULT_CRAWLING_RETRY_COUNT = 5; private static final String threadBaseName = "CrawlerWorker"; private final CrawlerPool myPool; @@ -260,7 +261,7 @@ public final class plasmaCrawlWorker extends Thread { remoteProxyUse, cacheManager, log, - 0, + DEFAULT_CRAWLING_RETRY_COUNT, true ); } @@ -278,10 +279,12 @@ public final class plasmaCrawlWorker extends Thread { boolean remoteProxyUse, plasmaHTCache cacheManager, serverLog log, - int redirectionCount, + int crawlingRetryCount, boolean useContentEncodingGzip ) throws IOException { if (url == null) return; + if (crawlingRetryCount < 0) return; + Date requestDate = new Date(); // remember the time... String host = url.getHost(); String path = url.getPath(); @@ -358,7 +361,7 @@ public final class plasmaCrawlWorker extends Thread { log.logError("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString()); } } else if (res.status.startsWith("30")) { - if (redirectionCount < 5) { + if (crawlingRetryCount < 0) { if (res.responseHeader.containsKey(httpHeader.LOCATION)) { // generating the new url URL redirectionUrl = new URL(url, (String) res.responseHeader.get(httpHeader.LOCATION)); @@ -382,7 +385,7 @@ public final class plasmaCrawlWorker extends Thread { remoteProxyUse, cacheManager, log, - ++redirectionCount, + --crawlingRetryCount, useContentEncodingGzip ); } @@ -396,24 +399,38 @@ public final class plasmaCrawlWorker extends Thread { } if (remote != null) remote.close(); } catch (Exception e) { - if ((e.getMessage() != null) && (e.getMessage().indexOf("Corrupt GZIP trailer") >= 0)) { - log.logWarning("Problems detected while receiving gzip encoded content from '" + url.toString() + - "'. Retrying request without using gzip content encoding."); - load(url, - name, - referer, - initiator, - depth, - profile, - socketTimeout, - remoteProxyHost, - remoteProxyPort, - remoteProxyUse, - cacheManager, - log, - 0, - false - ); + boolean retryCrawling = false; + String errorMsg = e.getMessage(); + if (errorMsg != null) { + if (errorMsg.indexOf("Corrupt GZIP trailer") >= 0) { + log.logWarning("Problems detected while receiving gzip encoded content from '" + url.toString() + + "'. Retrying request without using gzip content encoding."); + retryCrawling = true; + } else if (errorMsg.indexOf("Socket time-out: Read timed out") >= 0) { + log.logWarning("Read timeout while receiving content from '" + url.toString() + + "'. Retrying request."); + retryCrawling = true; + } + + if (retryCrawling) { + load(url, + name, + referer, + initiator, + depth, + profile, + socketTimeout, + remoteProxyHost, + remoteProxyPort, + remoteProxyUse, + cacheManager, + log, + 0, + false + ); + } else { + log.logError("CRAWLER LOADER ERROR2 with URL=" + url.toString() + ": " + e.toString(),e); + } } else { // this may happen if the targeted host does not exist or anything with the // remote server was wrong.