reuse code from htmlParser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7184 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 15 years ago
parent daeea96aea
commit 8fe1102452

@@ -75,13 +75,12 @@ public class htmlParser extends AbstractParser implements Parser {
         SUPPORTED_MIME_TYPES.add("text/csv");
     }
 
-    public Document[] parse(
+    public static ContentScraper parseToScraper(
             final MultiProtocolURI location,
-            final String mimeType,
             final String documentCharset,
-            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+            final InputStream sourceStream) throws Parser.Failure {
 
-        // make a scraper and transformer
+        // make a scraper
         final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
         String charset = null;
         try {
@@ -95,10 +94,6 @@ public class htmlParser extends AbstractParser implements Parser {
             charset = patchCharsetEncoding(charset);
         }
 
-        if (documentCharset == null || !documentCharset.equalsIgnoreCase(charset)) {
-            log.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true));
-        }
-
         Charset c;
         try {
             c = Charset.forName(charset);
@@ -122,10 +117,18 @@ public class htmlParser extends AbstractParser implements Parser {
         //hfos.close();
         if (writer.binarySuspect()) {
             final String errorMsg = "Binary data found in resource";
             log.logSevere("Unable to parse '" + location + "'. " + errorMsg);
-            throw new Parser.Failure(errorMsg,location);
+            throw new Parser.Failure(errorMsg, location);
         }
-        return transformScraper(location, mimeType, documentCharset, scraper);
+        return scraper;
     }
 
+    public Document[] parse(
+            final MultiProtocolURI location,
+            final String mimeType,
+            final String documentCharset,
+            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+        return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
+    }
+
     private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {

@@ -29,7 +29,6 @@ package net.yacy.repository;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.Arrays;
 import java.util.Date;
@@ -48,8 +47,8 @@ import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
+import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.html.ContentScraper;
-import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
@@ -327,12 +326,11 @@ public final class LoaderDispatcher {
         byte[] page = (r == null) ? null : r.getContent();
         if (page == null) throw new IOException("no response from url " + location.toString());
 
-        // scrape content
-        final ContentScraper scraper = new ContentScraper(location);
-        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-        writer.write(new String(page, "UTF-8"));
-
-        return scraper;
+        try {
+            return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
+        } catch(Parser.Failure e) {
+            throw new IOException(e.getMessage());
+        }
     }
 
     /**
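
The net effect of the commit: the scraping half of htmlParser.parse() is now available as the static htmlParser.parseToScraper(location, charset, stream), and LoaderDispatcher calls it instead of wiring up ContentScraper and TransformerWriter by hand. Below is a minimal sketch of how the two halves compose after this change; the example URL, the inline HTML, the import paths, and the ContentScraper.getTitle() accessor are assumptions for illustration, not taken from the diff.

// Sketch only, not part of this commit: how parseToScraper() and parse() compose.
// Assumptions: the example URL and HTML, the import locations, and getTitle().
import java.io.ByteArrayInputStream;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;

public class ParseToScraperSketch {
    public static void main(final String[] args) throws Exception {
        final MultiProtocolURI location = new MultiProtocolURI("http://example.org/");
        final byte[] page = "<html><head><title>hello</title></head><body></body></html>".getBytes("UTF-8");

        // Callers that only need the scraped metadata stop at the ContentScraper,
        // as LoaderDispatcher now does.
        final ContentScraper scraper = htmlParser.parseToScraper(
                location, "UTF-8", new ByteArrayInputStream(page));
        System.out.println("title: " + scraper.getTitle());

        // parse() is now a thin wrapper: parseToScraper() followed by
        // transformScraper() to turn the scraper into Document objects.
        final Document[] docs = new htmlParser().parse(
                location, "text/html", "UTF-8", new ByteArrayInputStream(page));
        System.out.println("documents: " + docs.length);
    }
}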
