add charset auto-detection for htmlParser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7186 6c8d7289-2bf4-0310-a012-ef5d649a1542
15 years ago · e670e1ef8e
parent ddcd5ae78c
commit e670e1ef8e
5 changed files with 84 additions and 10 deletions
--- a/.classpath
+++ b/.classpath
@ -41,5 +41,6 @@
 	<classpathentry kind="lib" path="lib/commons-fileupload-1.2.2.jar"/>
 	<classpathentry kind="lib" path="lib/log4j-1.2.16.jar"/>
 	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
+	<classpathentry kind="lib" path="lib/icu4j-core.jar"/>
 	<classpathentry kind="output" path="gen"/>
 </classpath>
--- a/build.xml
+++ b/build.xml
@ -192,6 +192,7 @@
        <pathelement location="${lib}/httpclient-4.0.1.jar" />
        <pathelement location="${lib}/httpcore-4.0.1.jar" />
        <pathelement location="${lib}/httpmime-4.0.1.jar" />
+        <pathelement location="${lib}/icu4j-core.jar" />
        <pathelement location="${lib}/J7Zip-modified.jar" />
        <pathelement location="${lib}/jakarta-oro-2.0.8.jar" />
    	<pathelement location="${lib}/jcifs-1.3.14.jar" />
--- a/lib/icu4j-core.jar
+++ b/lib/icu4j-core.jar
--- a/lib/icu4j.license
+++ b/lib/icu4j.license
@ -0,0 +1,51 @@
+<html>
+
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"></meta>
+<title>ICU License - ICU 1.8.1 and later</title>
+</head>
+
+<body BGCOLOR="#ffffff">
+<h2>ICU License - ICU 1.8.1 and later</h2>
+
+<p>COPYRIGHT AND PERMISSION NOTICE</p>
+
+<p>
+Copyright (c) 1995-2010 International Business Machines Corporation and others
+</p>
+<p>
+All rights reserved.
+</p>
+<p>
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, and/or sell
+copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies
+of the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+</p>
+<p>
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
+THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
+OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
+USE OR PERFORMANCE OF THIS SOFTWARE.
+</p>
+<p>
+Except as contained in this notice, the name of a copyright holder shall not be
+used in advertising or otherwise to promote the sale, use or other dealings in
+this Software without prior written authorization of the copyright holder.
+</p>
+
+<hr>
+<p><small>
+All trademarks and registered trademarks mentioned herein are the property of their respective owners.
+</small></p>
+</body>
+</html>
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@ -20,6 +20,7 @@

 package net.yacy.document.parser;

+import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
@ -29,6 +30,8 @@ import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.regex.Pattern;

+import com.ibm.icu.text.CharsetDetector;
+
 import de.anomic.crawler.retrieval.HTTPLoader;

 import net.yacy.cora.document.MultiProtocolURI;
@ -78,42 +81,56 @@ public class htmlParser extends AbstractParser implements Parser {
    public static ContentScraper parseToScraper(
            final MultiProtocolURI location, 
            final String documentCharset, 
-            final InputStream sourceStream) throws Parser.Failure {
+            InputStream sourceStream) throws Parser.Failure, IOException {
        
        // make a scraper
-        final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
        String charset = null;

+        // ah, we are lucky, we got a character-encoding via HTTP-header
        if (documentCharset != null) {
            charset = patchCharsetEncoding(documentCharset);
        }
        
+        // nothing found: try to find a meta-tag
        if (charset == null) {
            try {
+                final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
+                sourceStream = htmlFilter;
                charset = htmlFilter.detectCharset();
            } catch (IOException e1) {
                throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
            }
        }
+
+        // neither a declared charset nor a meta-tag result: let ICU's CharsetDetector guess from the content
+        if (charset == null) {
+        	CharsetDetector det = new CharsetDetector();
+        	det.enableInputFilter(true);
+        	InputStream detStream = new BufferedInputStream(sourceStream);
+        	det.setText(detStream);
+        	charset = det.detect().getName();
+        	sourceStream = detStream;
+        }
        
+        // still nothing: fall back to the platform default charset
        if (charset == null) {
-            charset = patchCharsetEncoding(charset);
+            charset = Charset.defaultCharset().name();
        }
        
        Charset c;
        try {
-            c = Charset.forName(charset);
+        	c = Charset.forName(charset);
        } catch (IllegalCharsetNameException e) {
-            c = Charset.defaultCharset();
+        	c = Charset.defaultCharset();
        } catch (UnsupportedCharsetException e) {
-            c = Charset.defaultCharset();
+        	c = Charset.defaultCharset();
        }
        
        // parsing the content
        final ContentScraper scraper = new ContentScraper(location);        
        final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
        try {
-            FileUtils.copy(htmlFilter, writer, c);
+            FileUtils.copy(sourceStream, writer, c);
            writer.close();
        } catch (IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);
@ -134,7 +151,11 @@ public class htmlParser extends AbstractParser implements Parser {
            final String documentCharset, 
            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
        
-        return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
+        try {
+			return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
+		} catch (IOException e) {
+			throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
+		}
    }

    private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
@ -173,8 +194,8 @@ public class htmlParser extends AbstractParser implements Parser {
     */
    public static String patchCharsetEncoding(String encoding) {
        
-        // return the system default encoding
-        if ((encoding == null) || (encoding.length() < 3)) return Charset.defaultCharset().name();
+        // null or shorter than 3 characters: nothing to patch, return null
+        if ((encoding == null) || (encoding.length() < 3)) return null;
        
        // trim encoding string
        encoding = encoding.trim();