Added more exception handling during crawling.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5357 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 10f5ec1040
commit 1918a0173e

@@ -28,6 +28,7 @@
package de.anomic.crawler; package de.anomic.crawler;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.Date; import java.util.Date;
@@ -74,7 +75,7 @@ public class FTPLoader {
* @param entry * @param entry
* @return * @return
*/ */
public indexDocumentMetadata load(final CrawlEntry entry) { public indexDocumentMetadata load(final CrawlEntry entry) throws IOException {
final yacyURL entryUrl = entry.url(); final yacyURL entryUrl = entry.url();
final String fullPath = getPath(entryUrl); final String fullPath = getPath(entryUrl);
@@ -133,11 +134,6 @@ public class FTPLoader {
(new PrintStream(berr)).print(e.getMessage()); (new PrintStream(berr)).print(e.getMessage());
} }
} }
/*
} finally {
closeConnection(ftpClient);
}
*/
closeConnection(ftpClient); closeConnection(ftpClient);
} }
@@ -145,8 +141,8 @@ public class FTPLoader {
if (berr.size() > 0 || htCache == null) { if (berr.size() > 0 || htCache == null) {
// some error logging // some error logging
final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : ""; final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : "";
log.logWarning("Unable to download URL " + entry.url().toString() + detail);
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "server download" + detail); sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "server download" + detail);
throw new IOException("FTPLoader: Unable to download URL " + entry.url().toString() + detail);
} }
return htCache; return htCache;

@@ -177,9 +177,8 @@ public final class HTTPLoader {
htCache.setCacheArray(responseBody); htCache.setCacheArray(responseBody);
} else { } else {
// if the response has not the right file type then reject file // if the response has not the right file type then reject file
this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension"); sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
htCache = null; throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
} }
return htCache; return htCache;
/* /*
@@ -233,10 +232,8 @@ public final class HTTPLoader {
} }
} else { } else {
// if the response has not the right response type then reject file // if the response has not the right response type then reject file
this.log.logInfo("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
// not processed any further
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")"); sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
} }
/* /*
} finally { } finally {

Loading…
Cancel
Save