diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 1241c7188..3ed19ebf1 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -54,7 +54,8 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
- private static final int maxLinks = 10000;
+ private final int maxLinks = 10000;
+ private Charset detectedcharset;
public htmlParser() {
super("Streaming HTML Parser");
@@ -97,7 +98,8 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
- final Document document = transformScraper(location, mimeType, documentCharset, scraper);
+ // parseToScraper also detects/corrects the charset (from the html content tag) and stores it in the detectedcharset field
+ final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper);
return new Document[]{document};
} catch (final IOException e) {
@@ -153,7 +155,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
- public static ContentScraper parseToScraper(
+ public ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
InputStream sourceStream,
@@ -191,23 +193,21 @@ public class htmlParser extends AbstractParser implements Parser {
// wtf? still nothing, just take system-standard
if (charset == null) {
- charset = Charset.defaultCharset().name();
- }
-
- Charset c;
- try {
- c = Charset.forName(charset);
- } catch (final IllegalCharsetNameException e) {
- c = Charset.defaultCharset();
- } catch (final UnsupportedCharsetException e) {
- c = Charset.defaultCharset();
+ detectedcharset = Charset.defaultCharset();
+ } else {
+ try {
+ detectedcharset = Charset.forName(charset);
+ } catch (final IllegalCharsetNameException e) {
+ detectedcharset = Charset.defaultCharset();
+ } catch (final UnsupportedCharsetException e) {
+ detectedcharset = Charset.defaultCharset();
+ }
}
-
// parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
- FileUtils.copy(sourceStream, writer, c);
+ FileUtils.copy(sourceStream, writer, detectedcharset);
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {