diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 1241c7188..3ed19ebf1 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -54,7 +54,8 @@ import com.ibm.icu.text.CharsetDetector; public class htmlParser extends AbstractParser implements Parser { private static final Pattern patternUnderline = Pattern.compile("_"); - private static final int maxLinks = 10000; + private final int maxLinks = 10000; + private Charset detectedcharset; public htmlParser() { super("Streaming HTML Parser"); @@ -97,7 +98,8 @@ public class htmlParser extends AbstractParser implements Parser { try { // first get a document from the parsed html final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks); - final Document document = transformScraper(location, mimeType, documentCharset, scraper); + // parseToScraper also detects/corrects/sets charset from html content tag + final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper); return new Document[]{document}; } catch (final IOException e) { @@ -153,7 +155,7 @@ public class htmlParser extends AbstractParser implements Parser { return ppd; } - public static ContentScraper parseToScraper( + public ContentScraper parseToScraper( final DigestURL location, final String documentCharset, InputStream sourceStream, @@ -191,23 +193,21 @@ public class htmlParser extends AbstractParser implements Parser { // wtf? still nothing, just take system-standard if (charset == null) { - charset = Charset.defaultCharset().name(); - } - - Charset c; - try { - c = Charset.forName(charset); - } catch (final IllegalCharsetNameException e) { - c = Charset.defaultCharset(); - } catch (final UnsupportedCharsetException e) { - c = Charset.defaultCharset(); + detectedcharset = Charset.defaultCharset(); + } else { + try { + detectedcharset = Charset.forName(charset); + } catch (final IllegalCharsetNameException e) { + detectedcharset = Charset.defaultCharset(); + } catch (final UnsupportedCharsetException e) { + detectedcharset = Charset.defaultCharset(); + } } - // parsing the content final ContentScraper scraper = new ContentScraper(location, maxLinks); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); try { - FileUtils.copy(sourceStream, writer, c); + FileUtils.copy(sourceStream, writer, detectedcharset); } catch (final IOException e) { throw new Parser.Failure("IO error:" + e.getMessage(), location); } finally {