Make use of the detected charset in htmlParser if none is given.

pull/1/head
reger 11 years ago
parent 71649bf22d
commit 49e76a1c55

@ -54,7 +54,8 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser { public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_"); private static final Pattern patternUnderline = Pattern.compile("_");
private static final int maxLinks = 10000; private final int maxLinks = 10000;
private Charset detectedcharset;
public htmlParser() { public htmlParser() {
super("Streaming HTML Parser"); super("Streaming HTML Parser");
@ -97,7 +98,8 @@ public class htmlParser extends AbstractParser implements Parser {
try { try {
// first get a document from the parsed html // first get a document from the parsed html
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks); final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
final Document document = transformScraper(location, mimeType, documentCharset, scraper); // parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper);
return new Document[]{document}; return new Document[]{document};
} catch (final IOException e) { } catch (final IOException e) {
@ -153,7 +155,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd; return ppd;
} }
public static ContentScraper parseToScraper( public ContentScraper parseToScraper(
final DigestURL location, final DigestURL location,
final String documentCharset, final String documentCharset,
InputStream sourceStream, InputStream sourceStream,
@ -191,23 +193,21 @@ public class htmlParser extends AbstractParser implements Parser {
// wtf? still nothing, just take system-standard // wtf? still nothing, just take system-standard
if (charset == null) { if (charset == null) {
charset = Charset.defaultCharset().name(); detectedcharset = Charset.defaultCharset();
} } else {
Charset c;
try { try {
c = Charset.forName(charset); detectedcharset = Charset.forName(charset);
} catch (final IllegalCharsetNameException e) { } catch (final IllegalCharsetNameException e) {
c = Charset.defaultCharset(); detectedcharset = Charset.defaultCharset();
} catch (final UnsupportedCharsetException e) { } catch (final UnsupportedCharsetException e) {
c = Charset.defaultCharset(); detectedcharset = Charset.defaultCharset();
}
} }
// parsing the content // parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks); final ContentScraper scraper = new ContentScraper(location, maxLinks);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try { try {
FileUtils.copy(sourceStream, writer, c); FileUtils.copy(sourceStream, writer, detectedcharset);
} catch (final IOException e) { } catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location); throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally { } finally {

Loading…
Cancel
Save