Added HTML parser support for maximum content bytes parsing limit

pull/127/head
luccioman 7 years ago
parent 4aafebc014
commit 5216c681a9

@@ -67,7 +67,9 @@ import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
/**
* A content scraper supporting HTML tags.
*/
public class ContentScraper extends AbstractScraper implements Scraper {
private final static int MAX_TAGSIZE = 1024 * 1024;
@@ -220,7 +222,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* evaluation scores: count appearance of specific attributes
*/
private final Evaluation evaluationScores;
/** Set to true when a limit on the size of the scraped content has been exceeded */
private boolean contentSizeLimitExceeded;
/**
* scrape a document
* @param root the document root url
@@ -271,6 +276,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.canonical = null;
this.publisher = null;
this.breadcrumbs = 0;
this.contentSizeLimitExceeded = false;
}
@Override
@@ -608,8 +614,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return iconRels;
}
/**
* Process a tag handled as a singleton (it has no end tag, or its eventual end tag is not processed)
* @param tag the tag to parse. Must not be null.
*/
@Override
public void scrapeTag0(Tag tag) {
public void scrapeTag0(final Tag tag) {
checkOpts(tag);
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
@@ -768,8 +778,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.fireScrapeTag0(tag.name, tag.opts);
}
/**
* Process a paired tag (has a start and an end tag)
* @param tag the tag to process. Must not be null.
*/
@Override
public void scrapeTag1(Tag tag) {
public void scrapeTag1(final Tag tag) {
checkOpts(tag);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
@@ -1079,7 +1093,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public Map<DigestURL, IconEntry> getIcons() {
return this.icons;
}
/**
* @return true when a limit on the size of the scraped content has been exceeded
*/
public boolean isContentSizeLimitExceeded() {
return this.contentSizeLimitExceeded;
}
/**
* @param contentSizeLimitExceeded set to true when a limit on the size of the scraped content has been exceeded
*/
public void setContentSizeLimitExceeded(final boolean contentSizeLimitExceeded) {
this.contentSizeLimitExceeded = contentSizeLimitExceeded;
}
/*
DC in html example:
<meta name="DC.title" lang="en" content="Expressing Dublin Core in HTML/XHTML meta and link elements" />
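The new contentSizeLimitExceeded flag is a plain property: nothing in this ContentScraper diff sets it on its own, the code that feeds bytes into the scraper does (see the htmlParser changes below), and it is reset together with the rest of the scraper state. A minimal sketch of how a caller can read it, assuming ContentScraper lives in net.yacy.document.parser.html (hypothetical helper, not part of this commit):

    import net.yacy.document.parser.html.ContentScraper;

    final class ScrapeResultCheck {

        /**
         * Hypothetical helper: report whether a finished scrape covered the whole
         * resource. Returns false when the byte budget was exhausted and the
         * scraper therefore only holds partial results.
         */
        static boolean isComplete(final ContentScraper scraper) {
            return !scraper.isContentSizeLimitExceeded();
        }
    }
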

@@ -41,6 +41,8 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.StreamLimitException;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@@ -56,7 +58,8 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
private static final int maxLinks = 10000;
/** The default maximum number of links to add to a parsed document */
private static final int DEFAULT_MAX_LINKS = 10000;
public htmlParser() {
super("Streaming HTML Parser");
@@ -100,10 +103,22 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
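Taken together, the two new methods give callers a capped variant of the regular parse entry point: isParseWithLimitsSupported() advertises the capability and parseWithLimits() takes the extra maxLinks and maxBytes arguments. A usage sketch under a few assumptions (the import paths for htmlParser and VocabularyScraper, the no-argument VocabularyScraper constructor, and the concrete limit, charset and timezone values are illustrative, not taken from this commit):

    import java.io.InputStream;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.document.VocabularyScraper;
    import net.yacy.document.parser.htmlParser;

    final class CappedHtmlParse {

        /** Parse at most maxBytes of the given HTML stream, keeping at most maxLinks links. */
        static Document[] parseCapped(final DigestURL location, final InputStream content,
                final int maxLinks, final long maxBytes) throws Parser.Failure {
            final htmlParser parser = new htmlParser();
            if (!parser.isParseWithLimitsSupported()) {
                throw new IllegalStateException("parser cannot enforce content limits");
            }
            // "UTF-8" and timezone offset 0 are placeholder values for this sketch
            return parser.parseWithLimits(location, "text/html", "UTF-8",
                    new VocabularyScraper(), 0, content, maxLinks, maxBytes);
        }
    }
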
@@ -112,10 +127,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes the anchor part from the original url, getRef() is never successful - consider other handling such as removeRef() in the crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
} else { // the head tag fragment is only allowed on a url without an anchor hashfragment, but there are discussions that the existence of a hashfragment anchor takes preference (i.e. allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@@ -127,6 +142,8 @@ public class htmlParser extends AbstractParser implements Parser {
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
}
/**
* the transformScraper method transforms a scraper object into a document object
@@ -173,6 +190,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getDate());
ppd.setScraperObject(scraper);
ppd.setIcons(scraper.getIcons());
ppd.setPartiallyParsed(scraper.isContentSizeLimitExceeded());
return ppd;
}
@@ -187,21 +205,40 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static method no need to init local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
return scraper;
}
/**
* Parse the resource at location and return the resulting ContentScraper
* @param location the URL of the resource to parse
* @param documentCharset the document charset name if known
* @param vocabularyScraper a vocabulary scraper
* @param detectedcharsetcontainer a mutable array of Charsets: filled with the charset detected when parsing
* @param timezoneOffset the local time zone offset
* @param sourceStream an open stream on the resource to parse
* @param maxLinks the maximum number of links to store in the scraper
* @param maxBytes the maximum number of content bytes to process
* @return a scraper containing parsed information
* @throws Parser.Failure when an error occurred while parsing
* @throws IOException when a read/write error occurred while trying to detect the charset
*/
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final VocabularyScraper vocabularyScraper,
Charset[] detectedcharsetcontainer,
final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {
final int maxLinks,
final long maxBytes) throws Parser.Failure, IOException {
if(maxBytes >= 0 && maxBytes < Long.MAX_VALUE) {
sourceStream = new StrictLimitInputStream(sourceStream, maxBytes);
}
// make a scraper
String charset = null;
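parseToScraper now guards the source stream: when a finite maxBytes is given, the stream is wrapped in a StrictLimitInputStream (a few lines above), so the copy loop further down aborts with a StreamLimitException once the byte budget is spent. The wrapper class itself is outside this diff; as a rough sketch of the technique, a byte-counting FilterInputStream could look like this (hypothetical names, and it assumes the limit exception is an IOException subclass, which the actual net.yacy.cora.util classes may not match exactly):

    import java.io.FilterInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    /** Sketch of a byte-limiting stream, not the actual StrictLimitInputStream. */
    class LimitedInputStream extends FilterInputStream {

        /** Hypothetical exception signalling that the byte budget is exhausted. */
        static class LimitReachedException extends IOException {
            LimitReachedException(final long limit) {
                super("stream limit of " + limit + " bytes reached");
            }
        }

        private final long maxBytes;
        private long readCount = 0L;

        LimitedInputStream(final InputStream in, final long maxBytes) {
            super(in);
            this.maxBytes = maxBytes;
        }

        @Override
        public int read() throws IOException {
            if (this.readCount >= this.maxBytes) {
                throw new LimitReachedException(this.maxBytes);
            }
            final int b = super.read();
            if (b >= 0) {
                this.readCount++;
            }
            return b;
        }

        @Override
        public int read(final byte[] b, final int off, final int len) throws IOException {
            if (len > 0 && this.readCount >= this.maxBytes) {
                throw new LimitReachedException(this.maxBytes);
            }
            // never hand out more than the remaining budget in a single call
            final int allowed = (int) Math.min(len, this.maxBytes - this.readCount);
            final int n = super.read(b, off, allowed);
            if (n > 0) {
                this.readCount += n;
            }
            return n;
        }
    }
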
@@ -254,8 +291,17 @@ public class htmlParser extends AbstractParser implements Parser {
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
} catch(StreamLimitException e) {
/* maxBytes limit has been reached: do not fail here as we want to use the partially obtained results. */
scraper.setContentSizeLimitExceeded(true);
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
/* A StreamLimitException may itself be wrapped in an IOException by an InputStreamReader */
if(e.getCause() instanceof StreamLimitException) {
/* maxBytes limit has been reached: do not fail here as we want to use the partially obtained results. */
scraper.setContentSizeLimitExceeded(true);
} else {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
}
} finally {
writer.flush();
//sourceStream.close(); keep open for multiple parsing (close done by caller)
@@ -361,10 +407,12 @@ public class htmlParser extends AbstractParser implements Parser {
* @param documentCharset
* @param vocscraper
* @param timezoneOffset
* @param maxLinks the maximum number of links to store in the document
* @param maxBytes the maximum number of content bytes to process
* @return the document resulting from the parsed snapshot, or null if it does not exist or on any other issue with the snapshot
*/
private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
final VocabularyScraper vocscraper, final int timezoneOffset) {
final VocabularyScraper vocscraper, final int timezoneOffset, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
// construct url for case (1) with anchor
@@ -383,7 +431,7 @@ public class htmlParser extends AbstractParser implements Parser {
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {
