Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

11 years ago · aac70fea2b
parent ba0e3fb0dc 49e76a1c55
commit aac70fea2b
2 changed files with 56 additions and 15 deletions
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -54,7 +54,8 @@ import com.ibm.icu.text.CharsetDetector;
 public class htmlParser extends AbstractParser implements Parser {

    private static final Pattern patternUnderline = Pattern.compile("_");
-    private static final int maxLinks = 10000;
+    private final int maxLinks = 10000;
+    private Charset detectedcharset;

    public htmlParser() {
        super("Streaming HTML Parser");
@@ -97,7 +98,8 @@ public class htmlParser extends AbstractParser implements Parser {
        try {
            // first get a document from the parsed html
            final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
-            final Document document = transformScraper(location, mimeType, documentCharset, scraper);
+            // parseToScraper also detects/corrects/sets charset from html content tag
+            final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper);

            return new Document[]{document};
        } catch (final IOException e) {
@@ -153,7 +155,7 @@ public class htmlParser extends AbstractParser implements Parser {
        return ppd;
    }

-    public static ContentScraper parseToScraper(
+    public ContentScraper parseToScraper(
            final DigestURL location,
            final String documentCharset,
            InputStream sourceStream,
@@ -191,23 +193,21 @@ public class htmlParser extends AbstractParser implements Parser {

        // wtf? still nothing, just take system-standard
        if (charset == null) {
-            charset = Charset.defaultCharset().name();
-        }
-
-        Charset c;
-        try {
-            c = Charset.forName(charset);
-        } catch (final IllegalCharsetNameException e) {
-            c = Charset.defaultCharset();
-        } catch (final UnsupportedCharsetException e) {
-            c = Charset.defaultCharset();
+            detectedcharset = Charset.defaultCharset();
+        } else {
+            try {
+                detectedcharset = Charset.forName(charset);
+            } catch (final IllegalCharsetNameException e) {
+                detectedcharset = Charset.defaultCharset();
+            } catch (final UnsupportedCharsetException e) {
+                detectedcharset = Charset.defaultCharset();
+            }
        }
-
        // parsing the content
        final ContentScraper scraper = new ContentScraper(location, maxLinks);
        final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
        try {
-            FileUtils.copy(sourceStream, writer, c);
+            FileUtils.copy(sourceStream, writer, detectedcharset);
        } catch (final IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);
        } finally {
--- a/test/net/yacy/document/parser/htmlParserTest.java
+++ b/test/net/yacy/document/parser/htmlParserTest.java
@@ -1,7 +1,15 @@
 package net.yacy.document.parser;

+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.net.MalformedURLException;
 import java.nio.charset.Charset;
+import static junit.framework.Assert.assertTrue;
 import junit.framework.TestCase;
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
 import org.junit.Test;

 public class htmlParserTest extends TestCase {
@@ -39,4 +47,37 @@ public class htmlParserTest extends TestCase {
 		
 	}

+    /**
+     * Parses each listed test file with a fresh htmlParser (no charset hint is passed)
+     * and checks that the first returned document reports a non-null charset.
+     */
+    @Test
+    public void testParse() throws MalformedURLException, Parser.Failure, InterruptedException, FileNotFoundException, java.io.IOException {
+        System.out.println("htmlParser.parse");
+
+        String[] testFiles = {
+            "umlaute_html_iso.html",
+            "umlaute_html_utf8.html",
+            "umlaute_html_namedentities.html"};
+
+        final String mimetype = "text/html";
+        //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
+
+        for (String testfile : testFiles) {
+            final String filename = "test/parsertest/" + testfile;
+            final AnchorURL url = new AnchorURL("http://localhost/" + filename);
+            System.out.println("parse file: " + filename);
+            // close the stream even when parsing fails so the test does not leak file handles
+            final FileInputStream in = new FileInputStream(new File(filename));
+            final Document[] docs;
+            try {
+                docs = new htmlParser().parse(url, mimetype, null, in);
+            } finally {
+                in.close();
+            }
+            final String txt = docs[0].getCharset();
+            assertTrue("get Charset", txt != null);
+            System.out.println("detected charset = " + txt);
+        }
+    }
 }