From 22649408ad9ead0ab020e01e7baee5d5ab3d02bc Mon Sep 17 00:00:00 2001 From: theli Date: Tue, 10 Oct 2006 10:14:03 +0000 Subject: [PATCH] *) Better error handling for charset encoding problem during content parsing See: http://www.yacy-forum.de/viewtopic.php?t=2952 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2737 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaCrawlEURL.java | 1 + source/de/anomic/plasma/plasmaParser.java | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 8d9918354..f53dd0f55 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -86,6 +86,7 @@ public class plasmaCrawlEURL extends indexURL { // wrong content public static final String DENIED_WRONG_MIMETYPE_OR_EXT = "denied_(wrong_mimetype_or_extension)"; + public static final String DENIED_UNSUPPORTED_CHARSET = "denied_(unsupported_charset)"; public static final String DENIED_REDIRECTION_HEADER_EMPTY = "denied_(redirection_header_empty)"; public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)"; public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_"; diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 81404dc06..c11984b29 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -52,6 +52,7 @@ import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; import java.util.Arrays; @@ -600,6 +601,10 @@ public final class plasmaParser { } return doc; + } catch (UnsupportedEncodingException e) { + String errorMsg = "Unsupported charset encoding: " + e.getMessage(); 
this.theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_UNSUPPORTED_CHARSET); } catch (Exception e) { // Interrupted- and Parser-Exceptions should pass through if (e instanceof InterruptedException) throw (InterruptedException) e;