add charset auto-detection for htmlParser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7186 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 15 years ago
parent ddcd5ae78c
commit e670e1ef8e

@ -41,5 +41,6 @@
<classpathentry kind="lib" path="lib/commons-fileupload-1.2.2.jar"/> <classpathentry kind="lib" path="lib/commons-fileupload-1.2.2.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.16.jar"/> <classpathentry kind="lib" path="lib/log4j-1.2.16.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/> <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="lib" path="lib/icu4j-core.jar"/>
<classpathentry kind="output" path="gen"/> <classpathentry kind="output" path="gen"/>
</classpath> </classpath>

@ -192,6 +192,7 @@
<pathelement location="${lib}/httpclient-4.0.1.jar" /> <pathelement location="${lib}/httpclient-4.0.1.jar" />
<pathelement location="${lib}/httpcore-4.0.1.jar" /> <pathelement location="${lib}/httpcore-4.0.1.jar" />
<pathelement location="${lib}/httpmime-4.0.1.jar" /> <pathelement location="${lib}/httpmime-4.0.1.jar" />
<pathelement location="${lib}/icu4j-core.jar" />
<pathelement location="${lib}/J7Zip-modified.jar" /> <pathelement location="${lib}/J7Zip-modified.jar" />
<pathelement location="${lib}/jakarta-oro-2.0.8.jar" /> <pathelement location="${lib}/jakarta-oro-2.0.8.jar" />
<pathelement location="${lib}/jcifs-1.3.14.jar" /> <pathelement location="${lib}/jcifs-1.3.14.jar" />

Binary file not shown.

@ -0,0 +1,51 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"></meta>
<title>ICU License - ICU 1.8.1 and later</title>
</head>
<body BGCOLOR="#ffffff">
<h2>ICU License - ICU 1.8.1 and later</h2>
<p>COPYRIGHT AND PERMISSION NOTICE</p>
<p>
Copyright (c) 1995-2010 International Business Machines Corporation and others
</p>
<p>
All rights reserved.
</p>
<p>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies
of the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
</p>
<p>
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
USE OR PERFORMANCE OF THIS SOFTWARE.
</p>
<p>
Except as contained in this notice, the name of a copyright holder shall not be
used in advertising or otherwise to promote the sale, use or other dealings in
this Software without prior written authorization of the copyright holder.
</p>
<hr>
<p><small>
All trademarks and registered trademarks mentioned herein are the property of their respective owners.
</small></p>
</body>
</html>

@ -20,6 +20,7 @@
package net.yacy.document.parser; package net.yacy.document.parser;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -29,6 +30,8 @@ import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException; import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.ibm.icu.text.CharsetDetector;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
@ -78,42 +81,56 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper( public static ContentScraper parseToScraper(
final MultiProtocolURI location, final MultiProtocolURI location,
final String documentCharset, final String documentCharset,
final InputStream sourceStream) throws Parser.Failure { InputStream sourceStream) throws Parser.Failure, IOException {
// make a scraper // make a scraper
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
String charset = null; String charset = null;
// ah, we are lucky, we got a character-encoding via HTTP-header
if (documentCharset != null) { if (documentCharset != null) {
charset = patchCharsetEncoding(documentCharset); charset = patchCharsetEncoding(documentCharset);
} }
// nothing found: try to find a meta-tag
if (charset == null) { if (charset == null) {
try { try {
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset(); charset = htmlFilter.detectCharset();
} catch (IOException e1) { } catch (IOException e1) {
throw new Parser.Failure("Charset error:" + e1.getMessage(), location); throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
} }
} }
// the author didn't tell us the encoding: guess it from the content with ICU's CharsetDetector
if (charset == null) {
CharsetDetector det = new CharsetDetector();
det.enableInputFilter(true);
InputStream detStream = new BufferedInputStream(sourceStream);
det.setText(detStream);
charset = det.detect().getName();
sourceStream = detStream;
}
// still nothing: fall back to the system default charset
if (charset == null) { if (charset == null) {
charset = patchCharsetEncoding(charset); charset = Charset.defaultCharset().name();
} }
Charset c; Charset c;
try { try {
c = Charset.forName(charset); c = Charset.forName(charset);
} catch (IllegalCharsetNameException e) { } catch (IllegalCharsetNameException e) {
c = Charset.defaultCharset(); c = Charset.defaultCharset();
} catch (UnsupportedCharsetException e) { } catch (UnsupportedCharsetException e) {
c = Charset.defaultCharset(); c = Charset.defaultCharset();
} }
// parsing the content // parsing the content
final ContentScraper scraper = new ContentScraper(location); final ContentScraper scraper = new ContentScraper(location);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
try { try {
FileUtils.copy(htmlFilter, writer, c); FileUtils.copy(sourceStream, writer, c);
writer.close(); writer.close();
} catch (IOException e) { } catch (IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location); throw new Parser.Failure("IO error:" + e.getMessage(), location);
@ -134,7 +151,11 @@ public class htmlParser extends AbstractParser implements Parser {
final String documentCharset, final String documentCharset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream)); try {
return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
} catch (IOException e) {
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
} }
private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) { private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
@ -173,8 +194,8 @@ public class htmlParser extends AbstractParser implements Parser {
*/ */
public static String patchCharsetEncoding(String encoding) { public static String patchCharsetEncoding(String encoding) {
// return the system default encoding // do nothing with null
if ((encoding == null) || (encoding.length() < 3)) return Charset.defaultCharset().name(); if ((encoding == null) || (encoding.length() < 3)) return null;
// trim encoding string // trim encoding string
encoding = encoding.trim(); encoding = encoding.trim();

Loading…
Cancel
Save