diff --git a/.classpath b/.classpath index 5c4063b3c..8cc2580c5 100644 --- a/.classpath +++ b/.classpath @@ -41,5 +41,6 @@ + diff --git a/build.xml b/build.xml index d81a1d039..eb083f4f8 100644 --- a/build.xml +++ b/build.xml @@ -192,6 +192,7 @@ + diff --git a/lib/icu4j-core.jar b/lib/icu4j-core.jar new file mode 100644 index 000000000..b62189c04 Binary files /dev/null and b/lib/icu4j-core.jar differ diff --git a/lib/icu4j.license b/lib/icu4j.license new file mode 100644 index 000000000..4879f2b44 --- /dev/null +++ b/lib/icu4j.license @@ -0,0 +1,51 @@ + + + + +ICU License - ICU 1.8.1 and later + + + +

ICU License - ICU 1.8.1 and later

+ +

COPYRIGHT AND PERMISSION NOTICE

+ +

+Copyright (c) 1995-2010 International Business Machines Corporation and others +

+

+All rights reserved. +

+

+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, and/or sell +copies of the Software, and to permit persons +to whom the Software is furnished to do so, provided that the above +copyright notice(s) and this permission notice appear in all copies +of the Software and that both the above copyright notice(s) and this +permission notice appear in supporting documentation. +

+

+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL +THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, +OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER +RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE +USE OR PERFORMANCE OF THIS SOFTWARE. +

+

+Except as contained in this notice, the name of a copyright holder shall not be +used in advertising or otherwise to promote the sale, use or other dealings in +this Software without prior written authorization of the copyright holder. +

+ +
+

+All trademarks and registered trademarks mentioned herein are the property of their respective owners. +

+ + diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index a7d4c891e..d0946769c 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -20,6 +20,7 @@ package net.yacy.document.parser; +import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; @@ -29,6 +30,8 @@ import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.util.regex.Pattern; +import com.ibm.icu.text.CharsetDetector; + import de.anomic.crawler.retrieval.HTTPLoader; import net.yacy.cora.document.MultiProtocolURI; @@ -78,42 +81,56 @@ public class htmlParser extends AbstractParser implements Parser { public static ContentScraper parseToScraper( final MultiProtocolURI location, final String documentCharset, - final InputStream sourceStream) throws Parser.Failure { + InputStream sourceStream) throws Parser.Failure, IOException { // make a scraper - final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false); String charset = null; + // ah, we are lucky, we got a character-encoding via HTTP-header if (documentCharset != null) { charset = patchCharsetEncoding(documentCharset); } + // nothing found: try to find a meta-tag if (charset == null) { try { + final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false); + sourceStream = htmlFilter; charset = htmlFilter.detectCharset(); } catch (IOException e1) { throw new Parser.Failure("Charset error:" + e1.getMessage(), location); } } + + // the author didn't tell us the encoding, try the mozilla-heuristic + if (charset == null) { + CharsetDetector det = new CharsetDetector(); + det.enableInputFilter(true); + InputStream detStream = new BufferedInputStream(sourceStream); + det.setText(detStream); + charset = det.detect().getName(); + sourceStream = detStream; + } + // wtf? still nothing, just take system-standard if (charset == null) { - charset = patchCharsetEncoding(charset); + charset = Charset.defaultCharset().name(); } Charset c; try { - c = Charset.forName(charset); + c = Charset.forName(charset); } catch (IllegalCharsetNameException e) { - c = Charset.defaultCharset(); + c = Charset.defaultCharset(); } catch (UnsupportedCharsetException e) { - c = Charset.defaultCharset(); + c = Charset.defaultCharset(); } // parsing the content final ContentScraper scraper = new ContentScraper(location); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false); try { - FileUtils.copy(htmlFilter, writer, c); + FileUtils.copy(sourceStream, writer, c); writer.close(); } catch (IOException e) { throw new Parser.Failure("IO error:" + e.getMessage(), location); @@ -134,7 +151,11 @@ public class htmlParser extends AbstractParser implements Parser { final String documentCharset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { - return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream)); + try { + return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream)); + } catch (IOException e) { + throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location); + } } private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) { @@ -173,8 +194,8 @@ public class htmlParser extends AbstractParser implements Parser { */ public static String patchCharsetEncoding(String encoding) { - // return the system default encoding - if ((encoding == null) || (encoding.length() < 3)) return Charset.defaultCharset().name(); + // do nothing with null + if ((encoding == null) || (encoding.length() < 3)) return null; // trim encoding string encoding = encoding.trim();