diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index e52b2aa1a..7f36b52a5 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -67,7 +67,9 @@ import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
-
+/**
+ * A content scraper supporting HTML tags.
+ */
public class ContentScraper extends AbstractScraper implements Scraper {
private final static int MAX_TAGSIZE = 1024 * 1024;
@@ -220,7 +222,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* evaluation scores: count appearance of specific attributes
*/
private final Evaluation evaluationScores;
-
+
+ /** Set to true when a limit on the size of the scraped content has been exceeded */
+ private boolean contentSizeLimitExceeded;
+
/**
* scrape a document
* @param root the document root url
@@ -271,6 +276,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.canonical = null;
this.publisher = null;
this.breadcrumbs = 0;
+ this.contentSizeLimitExceeded = false;
}
@Override
@@ -608,8 +614,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return iconRels;
}
+ /**
+ * Process a tag handled as a singleton (it has no end tag, or its eventual end tag is not processed)
+ * @param tag the tag to parse. Must not be null.
+ */
@Override
- public void scrapeTag0(Tag tag) {
+ public void scrapeTag0(final Tag tag) {
checkOpts(tag);
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
@@ -768,8 +778,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.fireScrapeTag0(tag.name, tag.opts);
}
+ /**
+ * Process a paired tag (one that has both a start and an end tag)
+ * @param tag the tag to process. Must not be null.
+ */
@Override
- public void scrapeTag1(Tag tag) {
+ public void scrapeTag1(final Tag tag) {
checkOpts(tag);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
@@ -1079,7 +1093,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public Map getIcons() {
return this.icons;
}
-
+
+ /**
+ * @return true when a limit on the size of the scraped content has been exceeded
+ */
+ public boolean isContentSizeLimitExceeded() {
+ return this.contentSizeLimitExceeded;
+ }
+
+ /**
+ * @param contentSizeLimitExceeded set to true when a limit on the size of the scraped content has been exceeded
+ */
+ public void setContentSizeLimitExceeded(final boolean contentSizeLimitExceeded) {
+ this.contentSizeLimitExceeded = contentSizeLimitExceeded;
+ }
+
/*
DC in html example:
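
// --- Reviewer note (not part of this patch) --------------------------------
// The htmlParser changes below rely on net.yacy.cora.util.StrictLimitInputStream and
// StreamLimitException, which are not included in this diff. The following is a minimal,
// self-contained sketch of the behavior the patch assumes, using hypothetical class names
// and an assumed IOException-based hierarchy; it is illustrative only, not the actual YaCy code.

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;

/** Hypothetical: signals that the configured byte limit was reached while reading. */
class LimitReachedException extends IOException {
    LimitReachedException(final String message) {
        super(message);
    }
}

/** Hypothetical: a stream that refuses to deliver more than maxBytes bytes. */
class LimitedInputStream extends FilterInputStream {
    private final long maxBytes;
    private long readCount = 0;

    LimitedInputStream(final InputStream in, final long maxBytes) {
        super(in);
        this.maxBytes = maxBytes;
    }

    @Override
    public int read() throws IOException {
        if (this.readCount >= this.maxBytes) {
            throw new LimitReachedException("limit of " + this.maxBytes + " bytes reached");
        }
        final int b = super.read();
        if (b >= 0) {
            this.readCount++;
        }
        return b;
    }

    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        if (this.readCount >= this.maxBytes) {
            throw new LimitReachedException("limit of " + this.maxBytes + " bytes reached");
        }
        // never request more than the remaining allowance from the wrapped stream
        final int allowed = (int) Math.min(len, this.maxBytes - this.readCount);
        final int n = super.read(b, off, allowed);
        if (n > 0) {
            this.readCount += n;
        }
        return n;
    }
}
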
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 77798ed5c..ab4a7fa25 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -41,6 +41,8 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.StreamLimitException;
+import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@@ -56,7 +58,8 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
- private static final int maxLinks = 10000;
+ /** The default maximum number of links to add to a parsed document */
+ private static final int DEFAULT_MAX_LINKS = 10000;
public htmlParser() {
super("Streaming HTML Parser");
@@ -100,10 +103,22 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+ return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
+ }
+
+ @Override
+ public boolean isParseWithLimitsSupported() {
+ return true;
+ }
+
+ @Override
+ public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
+ final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
+ throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
- ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
+ ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@@ -112,10 +127,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes the anchor part from the original url, getRef() is never successful - consider other handling such as removeRef() in the crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
- documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset);
+ documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
} else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
- documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset);
+ documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@@ -127,6 +142,8 @@ public class htmlParser extends AbstractParser implements Parser {
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
}
+
+
/**
* the transformScraper method transforms a scraper object into a document object
@@ -173,6 +190,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getDate());
ppd.setScraperObject(scraper);
ppd.setIcons(scraper.getIcons());
+ ppd.setPartiallyParsed(scraper.isContentSizeLimitExceeded());
return ppd;
}
@@ -187,21 +205,40 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static method no need to init local this.scraperObject
try {
- scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
+ scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
return scraper;
}
+ /**
+ * Parse the resource at location and return the resulting ContentScraper
+ * @param location the URL of the resource to parse
+ * @param documentCharset the document charset name if known
+ * @param vocabularyScraper a vocabulary scraper
+ * @param detectedcharsetcontainer a mutable array of Charsets, filled with the charset detected while parsing
+ * @param timezoneOffset the local time zone offset
+ * @param sourceStream an open stream on the resource to parse
+ * @param maxLinks the maximum number of links to store in the scraper
+ * @param maxBytes the maximum number of content bytes to process
+ * @return a scraper containing parsed information
+ * @throws Parser.Failure when an error occurs while parsing
+ * @throws IOException when a read/write error occurs while trying to detect the charset
+ */
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final VocabularyScraper vocabularyScraper,
- Charset[] detectedcharsetcontainer,
+ final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
InputStream sourceStream,
- final int maxLinks) throws Parser.Failure, IOException {
+ final int maxLinks,
+ final long maxBytes) throws Parser.Failure, IOException {
+
+ if(maxBytes >= 0 && maxBytes < Long.MAX_VALUE) {
+ sourceStream = new StrictLimitInputStream(sourceStream, maxBytes);
+ }
// make a scraper
String charset = null;
@@ -254,8 +291,17 @@ public class htmlParser extends AbstractParser implements Parser {
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
+ } catch(StreamLimitException e) {
+ /* The maxBytes limit has been reached: do not fail here, as we want to use the partially obtained results. */
+ scraper.setContentSizeLimitExceeded(true);
} catch (final IOException e) {
- throw new Parser.Failure("IO error:" + e.getMessage(), location);
+ /* A StreamLimitException may itself be wrapped in an IOException by an InputStreamReader */
+ if(e.getCause() instanceof StreamLimitException) {
+ /* The maxBytes limit has been reached: do not fail here, as we want to use the partially obtained results. */
+ scraper.setContentSizeLimitExceeded(true);
+ } else {
+ throw new Parser.Failure("IO error:" + e.getMessage(), location);
+ }
} finally {
writer.flush();
//sourceStream.close(); keep open for multiple parsing (close done by caller)
@@ -361,10 +407,12 @@ public class htmlParser extends AbstractParser implements Parser {
* @param documentCharset
* @param vocscraper
* @param timezoneOffset
+ * @param maxLinks the maximum number of links to store in the document
+ * @param maxBytes the maximum number of content bytes to process
* @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
*/
private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
- final VocabularyScraper vocscraper, final int timezoneOffset) {
+ final VocabularyScraper vocscraper, final int timezoneOffset, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
// construct url for case (1) with anchor
@@ -383,7 +431,7 @@ public class htmlParser extends AbstractParser implements Parser {
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
- ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks);
+ ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {
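
// --- Reviewer note (not part of this patch) --------------------------------
// A rough sketch of how the new parseWithLimits entry point might be driven by a caller.
// Hypothetical code: it assumes Document exposes an isPartiallyParsed() getter matching the
// setPartiallyParsed(...) call above, and that VocabularyScraper has a no-argument constructor.

import java.io.InputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;

final class ParseWithLimitsSketch {

    /** Parse at most maxBytes of an HTML stream and report whether the result was truncated. */
    static Document parseCapped(final DigestURL location, final InputStream stream,
            final int maxLinks, final long maxBytes) throws Parser.Failure {
        final htmlParser parser = new htmlParser();
        final Document[] docs = parser.parseWithLimits(location, "text/html", null,
                new VocabularyScraper(), 0, stream, maxLinks, maxBytes);
        if (docs == null || docs.length == 0) {
            return null;
        }
        final Document doc = docs[0];
        // isPartiallyParsed() (assumed getter) reports whether the maxBytes/maxLinks limit
        // cut the parse short; callers can then re-fetch with higher limits or flag the result
        if (doc.isPartiallyParsed()) {
            System.out.println("only a prefix of " + location.toNormalform(true) + " was parsed");
        }
        return doc;
    }
}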