Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

pull/1/head
Michael Peter Christen 11 years ago
commit aac70fea2b

@ -54,7 +54,8 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
private static final int maxLinks = 10000;
private final int maxLinks = 10000;
private Charset detectedcharset;
public htmlParser() {
super("Streaming HTML Parser");
@ -97,7 +98,8 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
final Document document = transformScraper(location, mimeType, documentCharset, scraper);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper);
return new Document[]{document};
} catch (final IOException e) {
@ -153,7 +155,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(
public ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
InputStream sourceStream,
@ -191,23 +193,21 @@ public class htmlParser extends AbstractParser implements Parser {
// wtf? still nothing, just take system-standard
if (charset == null) {
charset = Charset.defaultCharset().name();
}
Charset c;
try {
c = Charset.forName(charset);
} catch (final IllegalCharsetNameException e) {
c = Charset.defaultCharset();
} catch (final UnsupportedCharsetException e) {
c = Charset.defaultCharset();
detectedcharset = Charset.defaultCharset();
} else {
try {
detectedcharset = Charset.forName(charset);
} catch (final IllegalCharsetNameException e) {
detectedcharset = Charset.defaultCharset();
} catch (final UnsupportedCharsetException e) {
detectedcharset = Charset.defaultCharset();
}
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
FileUtils.copy(sourceStream, writer, c);
FileUtils.copy(sourceStream, writer, detectedcharset);
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {

@ -1,7 +1,15 @@
package net.yacy.document.parser;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import static junit.framework.Assert.assertTrue;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import org.junit.Test;
public class htmlParserTest extends TestCase {
@ -39,4 +47,37 @@ public class htmlParserTest extends TestCase {
}
/**
 * Test of parse method, of class htmlParser.
 * - test getCharset
 *
 * Parses each sample file under test/parsertest/ with the third argument
 * of parse() set to null (presumably the charset hint - confirm against the
 * parse() signature) and checks that the resulting document reports a
 * non-null charset.
 */
@Test
public void testParse() throws MalformedURLException, Parser.Failure, InterruptedException, FileNotFoundException {
    System.out.println("htmlParser.parse");

    String[] testFiles = {
        "umlaute_html_iso.html",
        "umlaute_html_utf8.html",
        "umlaute_html_namedentities.html"};

    final String mimetype = "text/html";
    //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";

    for (String testfile : testFiles) {
        final String filename = "test/parsertest/" + testfile;
        final File file = new File(filename);
        final AnchorURL url = new AnchorURL("http://localhost/" + filename);
        System.out.println("parse file: " + filename);

        htmlParser p = new htmlParser();
        final FileInputStream sourceStream = new FileInputStream(file);
        final Document[] docs;
        try {
            docs = p.parse(url, mimetype, null, sourceStream);
        } finally {
            // Always release the file handle, even when parsing fails;
            // a failure to close must not mask the actual test outcome.
            try {
                sourceStream.close();
            } catch (final java.io.IOException ignored) {
                // nothing useful can be done here
            }
        }

        // Fail with a readable message instead of an exception on docs[0].
        assertTrue("parser returned no document for " + filename, docs != null && docs.length > 0);

        Document doc = docs[0];
        String txt = doc.getCharset();
        assertTrue("get Charset", txt != null);
        System.out.println("detected charset = " + txt);
    }
}
}

Loading…
Cancel
Save