Added HTML parser support for maximum content bytes parsing limit

pull/127/head
luccioman 7 years ago
parent 4aafebc014
commit 5216c681a9
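
Note on usage: the diff below adds a parseWithLimits() entry point to htmlParser and routes the unlimited parse() call through it. The following is a minimal caller-side sketch of that new API, not code from the commit. It assumes a DigestURL and an open InputStream are already at hand, that VocabularyScraper has a no-argument constructor, and that isPartiallyParsed() is the Document getter matching the setPartiallyParsed() call made in transformScraper below; "text/html", "UTF-8", timezone offset 0 and the 10000 link cap are example values only.

    import java.io.InputStream;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.document.VocabularyScraper;
    import net.yacy.document.parser.htmlParser;

    public class LimitedHtmlParsing {

        /** Parse an HTML stream, processing at most maxBytes of content and at most 10000 links. */
        public static Document parseCapped(final DigestURL location, final InputStream html, final long maxBytes)
                throws Parser.Failure {
            final htmlParser parser = new htmlParser();
            if (!parser.isParseWithLimitsSupported()) {
                // the HTML parser always supports limits after this commit; other parsers may not
                throw new UnsupportedOperationException("parser cannot parse with limits");
            }
            final Document[] docs = parser.parseWithLimits(location, "text/html", "UTF-8",
                    new VocabularyScraper(), 0, html, 10000, maxBytes);
            final Document doc = docs[0];
            // isPartiallyParsed() is the assumed getter for the flag set via setPartiallyParsed()
            if (doc.isPartiallyParsed()) {
                System.out.println(location + " exceeded " + maxBytes + " bytes: partial result only");
            }
            return doc;
        }
    }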

ContentScraper.java

@@ -67,7 +67,9 @@ import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
+/**
+ * A content scraper supporting HTML tags.
+ */
public class ContentScraper extends AbstractScraper implements Scraper {
    private final static int MAX_TAGSIZE = 1024 * 1024;
@@ -220,7 +222,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     * evaluation scores: count appearance of specific attributes
     */
    private final Evaluation evaluationScores;
+    /** Set to true when a limit on content size scraped has been exceeded */
+    private boolean contentSizeLimitExceeded;
    /**
     * scrape a document
     * @param root the document root url
@@ -271,6 +276,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.canonical = null;
        this.publisher = null;
        this.breadcrumbs = 0;
+        this.contentSizeLimitExceeded = false;
    }
    @Override
@@ -608,8 +614,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        return iconRels;
    }
+    /**
+     * Process a tag processed as a singleton (no end tag, or not processing the eventual end tag)
+     * @param tag the tag to parse. Must not be null.
+     */
    @Override
-    public void scrapeTag0(Tag tag) {
+    public void scrapeTag0(final Tag tag) {
        checkOpts(tag);
        if (tag.name.equalsIgnoreCase("img")) {
            final String src = tag.opts.getProperty("src", EMPTY_STRING);
@@ -768,8 +778,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.fireScrapeTag0(tag.name, tag.opts);
    }
+    /**
+     * Process a paired tag (has a start and an end tag)
+     * @param tag the tag to process. Must not be null.
+     */
    @Override
-    public void scrapeTag1(Tag tag) {
+    public void scrapeTag1(final Tag tag) {
        checkOpts(tag);
        // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
        if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
@@ -1079,7 +1093,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    public Map<DigestURL, IconEntry> getIcons() {
        return this.icons;
    }
+    /**
+     * @return true when a limit on content size scraped has been exceeded
+     */
+    public boolean isContentSizeLimitExceeded() {
+        return this.contentSizeLimitExceeded;
+    }
+    /**
+     * @param contentSizeLimitExceeded set to true when a limit on content size scraped has been exceeded
+     */
+    public void setContentSizeLimitExceeded(final boolean contentSizeLimitExceeded) {
+        this.contentSizeLimitExceeded = contentSizeLimitExceeded;
+    }
    /*
    DC in html example:
    <meta name="DC.title" lang="en" content="Expressing Dublin Core in HTML/XHTML meta and link elements" />
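
The htmlParser changes below wrap the source stream in a net.yacy.cora.util.StrictLimitInputStream whenever an effective byte budget is given. That class is not part of this diff; the sketch below only illustrates the behaviour the parser relies on (count the bytes read and abort with an exception once the budget is exceeded), under a different, hypothetical name, and it uses a plain IOException where the real class throws StreamLimitException.

    import java.io.FilterInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    /**
     * Illustrative sketch only; the real net.yacy.cora.util.StrictLimitInputStream is not part of this diff.
     * Assumed behaviour: reading past maxTotalBytes aborts the stream with an exception instead of truncating silently.
     */
    public class ByteLimitedInputStream extends FilterInputStream {

        private final long maxTotalBytes;
        private long readCount = 0L;

        public ByteLimitedInputStream(final InputStream in, final long maxTotalBytes) {
            super(in);
            this.maxTotalBytes = maxTotalBytes;
        }

        @Override
        public int read() throws IOException {
            final int b = super.read();
            if (b >= 0 && ++this.readCount > this.maxTotalBytes) {
                // the real class throws StreamLimitException; a plain IOException stands in for it here
                throw new IOException("Stream limit of " + this.maxTotalBytes + " bytes exceeded");
            }
            return b;
        }

        @Override
        public int read(final byte[] buf, final int off, final int len) throws IOException {
            final int n = super.read(buf, off, len);
            if (n > 0) {
                this.readCount += n;
                if (this.readCount > this.maxTotalBytes) {
                    throw new IOException("Stream limit of " + this.maxTotalBytes + " bytes exceeded");
                }
            }
            return n;
        }
    }

Because parseToScraper only wraps the stream when maxBytes >= 0 and maxBytes < Long.MAX_VALUE, the default Long.MAX_VALUE budget keeps the previous unlimited behaviour.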

htmlParser.java

@@ -41,6 +41,8 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.StreamLimitException;
+import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@@ -56,7 +58,8 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
-    private static final int maxLinks = 10000;
+    /** The default maximum number of links to add to a parsed document */
+    private static final int DEFAULT_MAX_LINKS = 10000;
    public htmlParser() {
        super("Streaming HTML Parser");
@@ -100,10 +103,22 @@ public class htmlParser extends AbstractParser implements Parser {
            final int timezoneOffset,
            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
+    }
+    @Override
+    public boolean isParseWithLimitsSupported() {
+        return true;
+    }
+    @Override
+    public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
+            final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
+            throws Failure {
        try {
            // first get a document from the parsed html
            Charset[] detectedcharsetcontainer = new Charset[]{null};
-            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
+            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, maxBytes);
            // parseToScraper also detects/corrects/sets charset from html content tag
            final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
            Document documentSnapshot = null;
@@ -112,10 +127,10 @@ public class htmlParser extends AbstractParser implements Parser {
                // and create a sub-document for snapshot page (which will be merged by loader)
                // TODO: as a crawl request removes anchor part from original url getRef() is never successful - considere other handling as removeRef() in crawler
                if (location.getRef() != null && location.getRef().startsWith("!")) {
-                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset);
+                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
                } else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
                    if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
-                        documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset);
+                        documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
                    }
                }
            } catch (Exception ex1) { // ignore any exception for any issue with snapshot
@@ -127,6 +142,8 @@ public class htmlParser extends AbstractParser implements Parser {
            throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
        }
    }
    /**
     * the transformScraper method transforms a scraper object into a document object
@@ -173,6 +190,7 @@ public class htmlParser extends AbstractParser implements Parser {
                scraper.getDate());
        ppd.setScraperObject(scraper);
        ppd.setIcons(scraper.getIcons());
+        ppd.setPartiallyParsed(scraper.isContentSizeLimitExceeded());
        return ppd;
    }
@@ -187,21 +205,40 @@ public class htmlParser extends AbstractParser implements Parser {
        }
        ContentScraper scraper; // for this static methode no need to init local this.scraperObject
        try {
-            scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
+            scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, Long.MAX_VALUE);
        } catch (Failure e) {
            throw new IOException(e.getMessage());
        }
        return scraper;
    }
+    /**
+     * Parse the resource at location and return the resulting ContentScraper
+     * @param location the URL of the resource to parse
+     * @param documentCharset the document charset name if known
+     * @param vocabularyScraper a vocabulary scraper
+     * @param detectedcharsetcontainer a mutable array of Charsets : filled with the charset detected when parsing
+     * @param timezoneOffset the local time zone offset
+     * @param sourceStream an open stream on the resource to parse
+     * @param maxLinks the maximum number of links to store in the scraper
+     * @param maxBytes the maximum number of content bytes to process
+     * @return a scraper containing parsed information
+     * @throws Parser.Failure when an error occurred while parsing
+     * @throws IOException when a read/write error occurred while trying to detect the charset
+     */
    public static ContentScraper parseToScraper(
            final DigestURL location,
            final String documentCharset,
            final VocabularyScraper vocabularyScraper,
-            Charset[] detectedcharsetcontainer,
+            final Charset[] detectedcharsetcontainer,
            final int timezoneOffset,
            InputStream sourceStream,
-            final int maxLinks) throws Parser.Failure, IOException {
+            final int maxLinks,
+            final long maxBytes) throws Parser.Failure, IOException {
+        if(maxBytes >= 0 && maxBytes < Long.MAX_VALUE) {
+            sourceStream = new StrictLimitInputStream(sourceStream, maxBytes);
+        }
        // make a scraper
        String charset = null;
@@ -254,8 +291,17 @@ public class htmlParser extends AbstractParser implements Parser {
        final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
        try {
            FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
+        } catch(StreamLimitException e) {
+            /* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
+            scraper.setContentSizeLimitExceeded(true);
        } catch (final IOException e) {
-            throw new Parser.Failure("IO error:" + e.getMessage(), location);
+            /* A StreamLimitException may be itself wrapped in an IOException by a InputStreamReader */
+            if(e.getCause() instanceof StreamLimitException) {
+                /* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
+                scraper.setContentSizeLimitExceeded(true);
+            } else {
+                throw new Parser.Failure("IO error:" + e.getMessage(), location);
+            }
        } finally {
            writer.flush();
            //sourceStream.close(); keep open for multipe parsing (close done by caller)
@@ -361,10 +407,12 @@ public class htmlParser extends AbstractParser implements Parser {
     * @param documentCharset
     * @param vocscraper
     * @param timezoneOffset
+     * @param maxLinks the maximum number of links to store in the document
+     * @param maxBytes the maximum number of content bytes to process
     * @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
     */
    private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
-            final VocabularyScraper vocscraper, final int timezoneOffset) {
+            final VocabularyScraper vocscraper, final int timezoneOffset, final int maxLinks, final long maxBytes) {
        Document documentSnapshot = null;
        try {
            // construct url for case (1) with anchor
@@ -383,7 +431,7 @@ public class htmlParser extends AbstractParser implements Parser {
        InputStream snapshotStream = null;
        try {
            snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
-            ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks);
+            ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks, maxBytes);
            documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
        } finally {
            if(snapshotStream != null) {
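
For callers that work at the scraper level rather than with Document objects, the new static parseToScraper() signature can be used directly and the partial-parse state read back from ContentScraper. The following is a hedged sketch, not part of the commit; it assumes VocabularyScraper has a no-argument constructor, and the 1 MB budget, "UTF-8" charset hint, timezone offset 0 and 10000 link cap are example values only.

    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.charset.Charset;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.document.Parser;
    import net.yacy.document.VocabularyScraper;
    import net.yacy.document.parser.htmlParser;
    import net.yacy.document.parser.html.ContentScraper;

    public class LimitedScraping {

        /** Scrape at most one megabyte of HTML and report whether the document was cut off. */
        public static ContentScraper scrapeFirstMegabyte(final DigestURL location, final InputStream html)
                throws Parser.Failure, IOException {
            final Charset[] detectedCharset = new Charset[]{null}; // filled by parseToScraper
            final ContentScraper scraper = htmlParser.parseToScraper(
                    location, "UTF-8", new VocabularyScraper(), detectedCharset,
                    0, html, 10000, 1024L * 1024L);
            if (scraper.isContentSizeLimitExceeded()) {
                System.out.println(location + " was only partially scraped: the 1 MB limit was reached");
            }
            return scraper;
        }
    }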
