*) httpdFileHandler.java:

No stack trace will be printed into the log file for "Connection timed out" errors anymore.
   See: http://www.yacy-forum.de/viewtopic.php?p=6381

*) plasmaCrawlWorker.java:
   If a "Read timed out" error occurs while crawling a site, the failed crawl will now be
   retried.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@493 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent bae369a7da
commit 1d83d7e4d7

@ -505,19 +505,24 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
StringBuffer errorMessage = new StringBuffer(); StringBuffer errorMessage = new StringBuffer();
Exception errorExc = null; Exception errorExc = null;
if (e instanceof InterruptedException) { String errorMsg = e.getMessage();
if (
(e instanceof InterruptedException) ||
((errorMsg != null) && (errorMsg.startsWith("Socket closed")) && (Thread.currentThread().isInterrupted()))
) {
errorMessage.append("Interruption detected while processing query."); errorMessage.append("Interruption detected while processing query.");
httpStatusCode = 503; httpStatusCode = 503;
} else { } else {
String errorMsg = e.getMessage();
if ((errorMsg != null) && if ((errorMsg != null) &&
( (
errorMsg.startsWith("Broken pipe") || errorMsg.startsWith("Broken pipe") ||
errorMsg.startsWith("Connection reset") || errorMsg.startsWith("Connection reset") ||
errorMsg.startsWith("Software caused connection abort") errorMsg.startsWith("Software caused connection abort")
)) { )) {
// client closed the connection, so we just end silently // client closed the connection, so we just end silently
errorMessage.append("Client unexpectedly closed connection while processing query."); errorMessage.append("Client unexpectedly closed connection while processing query.");
} else if ((errorMsg != null) && (errorMsg.startsWith("Connection timed out"))) {
errorMessage.append("Connection timed out.");
} else { } else {
errorMessage.append("Unexpected error while processing query."); errorMessage.append("Unexpected error while processing query.");
httpStatusCode = 500; httpStatusCode = 500;
@ -525,9 +530,10 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
} }
} }
errorMessage.append("\nQuery: ").append(path) errorMessage.append("\nSession: ").append(Thread.currentThread().getName())
.append("\nClient: ").append(conProp.getProperty(httpd.CONNECTION_PROP_CLIENTIP,"unknown")) .append("\nQuery: ").append(path)
.append("\nReason: ").append(e.toString()); .append("\nClient: ").append(conProp.getProperty(httpd.CONNECTION_PROP_CLIENTIP,"unknown"))
.append("\nReason: ").append(e.toString());
if (!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { if (!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) {
// sending back an error message to the client // sending back an error message to the client

@ -60,7 +60,8 @@ import de.anomic.server.logging.serverLog;
import de.anomic.server.logging.serverMiniLogFormatter; import de.anomic.server.logging.serverMiniLogFormatter;
public final class plasmaCrawlWorker extends Thread { public final class plasmaCrawlWorker extends Thread {
private static final int DEFAULT_CRAWLING_RETRY_COUNT = 5;
private static final String threadBaseName = "CrawlerWorker"; private static final String threadBaseName = "CrawlerWorker";
private final CrawlerPool myPool; private final CrawlerPool myPool;
@ -260,7 +261,7 @@ public final class plasmaCrawlWorker extends Thread {
remoteProxyUse, remoteProxyUse,
cacheManager, cacheManager,
log, log,
0, DEFAULT_CRAWLING_RETRY_COUNT,
true true
); );
} }
@ -278,10 +279,12 @@ public final class plasmaCrawlWorker extends Thread {
boolean remoteProxyUse, boolean remoteProxyUse,
plasmaHTCache cacheManager, plasmaHTCache cacheManager,
serverLog log, serverLog log,
int redirectionCount, int crawlingRetryCount,
boolean useContentEncodingGzip boolean useContentEncodingGzip
) throws IOException { ) throws IOException {
if (url == null) return; if (url == null) return;
if (crawlingRetryCount < 0) return;
Date requestDate = new Date(); // remember the time... Date requestDate = new Date(); // remember the time...
String host = url.getHost(); String host = url.getHost();
String path = url.getPath(); String path = url.getPath();
@ -358,7 +361,7 @@ public final class plasmaCrawlWorker extends Thread {
log.logError("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString()); log.logError("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString());
} }
} else if (res.status.startsWith("30")) { } else if (res.status.startsWith("30")) {
if (redirectionCount < 5) { if (crawlingRetryCount < 0) {
if (res.responseHeader.containsKey(httpHeader.LOCATION)) { if (res.responseHeader.containsKey(httpHeader.LOCATION)) {
// generating the new url // generating the new url
URL redirectionUrl = new URL(url, (String) res.responseHeader.get(httpHeader.LOCATION)); URL redirectionUrl = new URL(url, (String) res.responseHeader.get(httpHeader.LOCATION));
@ -382,7 +385,7 @@ public final class plasmaCrawlWorker extends Thread {
remoteProxyUse, remoteProxyUse,
cacheManager, cacheManager,
log, log,
++redirectionCount, --crawlingRetryCount,
useContentEncodingGzip useContentEncodingGzip
); );
} }
@ -396,24 +399,38 @@ public final class plasmaCrawlWorker extends Thread {
} }
if (remote != null) remote.close(); if (remote != null) remote.close();
} catch (Exception e) { } catch (Exception e) {
if ((e.getMessage() != null) && (e.getMessage().indexOf("Corrupt GZIP trailer") >= 0)) { boolean retryCrawling = false;
log.logWarning("Problems detected while receiving gzip encoded content from '" + url.toString() + String errorMsg = e.getMessage();
"'. Retrying request without using gzip content encoding."); if (errorMsg != null) {
load(url, if (errorMsg.indexOf("Corrupt GZIP trailer") >= 0) {
name, log.logWarning("Problems detected while receiving gzip encoded content from '" + url.toString() +
referer, "'. Retrying request without using gzip content encoding.");
initiator, retryCrawling = true;
depth, } else if (errorMsg.indexOf("Socket time-out: Read timed out") >= 0) {
profile, log.logWarning("Read timeout while receiving content from '" + url.toString() +
socketTimeout, "'. Retrying request.");
remoteProxyHost, retryCrawling = true;
remoteProxyPort, }
remoteProxyUse,
cacheManager, if (retryCrawling) {
log, load(url,
0, name,
false referer,
); initiator,
depth,
profile,
socketTimeout,
remoteProxyHost,
remoteProxyPort,
remoteProxyUse,
cacheManager,
log,
0,
false
);
} else {
log.logError("CRAWLER LOADER ERROR2 with URL=" + url.toString() + ": " + e.toString(),e);
}
} else { } else {
// this may happen if the targeted host does not exist or anything with the // this may happen if the targeted host does not exist or anything with the
// remote server was wrong. // remote server was wrong.

Loading…
Cancel
Save