Finer control on max links to parse in the html parser.

Branch: pull/93/merge
Author: luccioman, 8 years ago
Parent: 4743a104b5
Commit: 169ffdd1c7

@@ -28,13 +28,28 @@ public class SizeLimitedMap<K, V> extends LinkedHashMap<K, V> implements Map<K,
 
     private static final long serialVersionUID = 6088727126150060068L;
 
-    final int sizeLimit;
+    private final int sizeLimit;
+
+    /** Set to true when at least one eldest entry has been removed because the map size exceeded the size limit. */
+    private boolean limitExceeded;
 
     public SizeLimitedMap(int sizeLimit) {
         this.sizeLimit = sizeLimit;
+        this.limitExceeded = false;
     }
 
     @Override protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
-        return size() > this.sizeLimit;
+        boolean res = size() > this.sizeLimit;
+        if(res) {
+            this.limitExceeded = true;
+        }
+        return res;
+    }
+
+    /**
+     * @return true when the size limit has been exceeded at least one time
+     */
+    public boolean isLimitExceeded() {
+        return this.limitExceeded;
     }
 }

@@ -72,6 +72,13 @@ public class SizeLimitedSet<E> extends AbstractSet<E> implements Set<E>, Cloneab
         map.clear();
     }
 
+    /**
+     * @return true when the size limit has been exceeded at least one time
+     */
+    public boolean isLimitExceeded() {
+        return this.map.isLimitExceeded();
+    }
+
     @Override
     @SuppressWarnings("unchecked")
     public Object clone() {
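Illustration (not part of the commit): the flag added above hooks into LinkedHashMap.removeEldestEntry(), which the JDK calls after every put; returning true evicts the eldest entry. The following minimal, self-contained sketch reproduces that mechanism. BoundedMapSketch is a hypothetical stand-in for illustration only, not the actual net.yacy.cora.storage class.

import java.util.LinkedHashMap;
import java.util.Map;

/** Minimal sketch of the eviction-flag idea; a hypothetical class, not YaCy's SizeLimitedMap itself. */
public class BoundedMapSketch<K, V> extends LinkedHashMap<K, V> {

    private static final long serialVersionUID = 1L;

    private final int sizeLimit;

    /** Becomes true once an eldest entry has been evicted because the limit was exceeded. */
    private boolean limitExceeded = false;

    public BoundedMapSketch(final int sizeLimit) {
        this.sizeLimit = sizeLimit;
    }

    /** Called by LinkedHashMap after each put; returning true evicts the eldest entry. */
    @Override
    protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
        final boolean evict = size() > this.sizeLimit;
        if (evict) {
            this.limitExceeded = true;
        }
        return evict;
    }

    public boolean isLimitExceeded() {
        return this.limitExceeded;
    }

    public static void main(final String[] args) {
        final BoundedMapSketch<Integer, String> map = new BoundedMapSketch<>(2);
        map.put(1, "a");
        map.put(2, "b");
        System.out.println(map.isLimitExceeded()); // false: still within the limit
        map.put(3, "c");                           // eldest entry (key 1) is evicted
        System.out.println(map.isLimitExceeded()); // true
        System.out.println(map.size());            // 2
    }
}

The commit applies this pattern inside SizeLimitedMap and exposes it through SizeLimitedSet.isLimitExceeded(), which simply delegates to the backing map.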

@@ -187,12 +187,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     // class variables: collectors for links
     private final List<AnchorURL> anchors;
-    private final LinkedHashMap<DigestURL, String> rss, css;
-    private final LinkedHashMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
+    private final SizeLimitedMap<DigestURL, String> rss, css;
+    private final SizeLimitedMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
     private final List<ImageEntry> images;
-    private final Set<AnchorURL> script, frames, iframes;
-    private final Map<String, String> metas;
-    private final Map<String, DigestURL> hreflang, navigation;
+    private final SizeLimitedSet<AnchorURL> script, frames, iframes;
+    private final SizeLimitedMap<String, String> metas;
+    private final SizeLimitedMap<String, DigestURL> hreflang, navigation;
     private LinkedHashSet<String> titles;
     private final List<String> articles;
     private final List<Date> startDates, endDates;
@@ -204,7 +204,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private final EventListenerList htmlFilterEventListeners;
     private double lon, lat;
     private AnchorURL canonical, publisher;
-    private final int maxLinks;
+
+    /** The maximum number of URLs to process and store in the anchors property. */
+    private final int maxAnchors;
+
     private final VocabularyScraper vocabularyScraper;
     private final int timezoneOffset;
     private int breadcrumbs;
@@ -226,21 +229,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     /** Set to true when a limit on content size scraped has been exceeded */
     private boolean contentSizeLimitExceeded;
 
+    /** Set to true when the maxAnchors limit has been exceeded */
+    private boolean maxAnchorsExceeded;
+
     /**
-     * scrape a document
+     * Create an ContentScraper instance
      * @param root the document root url
-     * @param maxLinks the maximum number of links to scrape
+     * @param maxAnchors the maximum number of URLs to process and store in the anchors property.
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
      * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
      * @param timezoneOffset local time zone offset
      */
     @SuppressWarnings("unchecked")
-    public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
         // the root value here will not be used to load the resource.
         // it is only the reference for relative links
         super(linkTags0, linkTags1);
         assert root != null;
         this.root = root;
-        this.maxLinks = maxLinks;
         this.vocabularyScraper = vocabularyScraper;
         this.timezoneOffset = timezoneOffset;
         this.evaluationScores = new Evaluation();
@@ -277,6 +283,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         this.publisher = null;
         this.breadcrumbs = 0;
         this.contentSizeLimitExceeded = false;
+        this.maxAnchorsExceeded = false;
+        this.maxAnchors = maxAnchors;
+    }
+
+    /**
+     * Create an ContentScraper instance
+     * @param root the document root url
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
+     * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
+     * @param timezoneOffset local time zone offset
+     */
+    public ContentScraper(final DigestURL root, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+        this(root, Integer.MAX_VALUE, maxLinks, vocabularyScraper, timezoneOffset);
     }
 
     @Override
@@ -366,7 +385,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             }
         }
 
-        findAbsoluteURLs(b, this.anchors, anchorListeners);
+        if(!this.maxAnchorsExceeded) {
+            int maxLinksToDetect = this.maxAnchors - this.anchors.size();
+            if(maxLinksToDetect < Integer.MAX_VALUE) {
+                /* Add one to the anchors limit to detect when the limit is exceeded */
+                maxLinksToDetect++;
+            }
+            findAbsoluteURLs(b, this.anchors, anchorListeners, maxLinksToDetect);
+            if(this.anchors.size() > this.maxAnchors) {
+                this.maxAnchorsExceeded = true;
+                this.anchors.remove(this.anchors.size() -1);
+            }
+        }
 
         // append string to content
         if (!b.isEmpty()) {
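Illustration (not part of the commit): the block above requests one link more than the remaining maxAnchors budget, so that receiving limit + 1 URLs is the signal that more links existed than were kept; the surplus entry is then removed. The hypothetical helper below shows the same pattern in isolation; none of these names exist in YaCy.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class OverflowCheckSketch {

    /** Copies at most limit items into sink and reports whether the source held more than limit. */
    static <T> boolean collectWithOverflowCheck(final List<T> source, final List<T> sink, final int limit) {
        // ask for one extra item so that exceeding the limit becomes observable
        final int budget = limit < Integer.MAX_VALUE ? limit + 1 : limit;
        for (final T item : source) {
            if (sink.size() >= budget) {
                break;
            }
            sink.add(item);
        }
        final boolean exceeded = sink.size() > limit;
        if (exceeded) {
            sink.remove(sink.size() - 1); // drop the sentinel entry, keeping exactly limit items
        }
        return exceeded;
    }

    public static void main(final String[] args) {
        final List<String> urls = Arrays.asList("a", "b", "c", "d");
        final List<String> kept = new ArrayList<>();
        System.out.println(collectWithOverflowCheck(urls, kept, 2)); // true
        System.out.println(kept);                                    // [a, b]
    }
}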
@@ -890,9 +920,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
      * @param anchor anchor to add. Must not be null.
      */
     protected void addAnchor(AnchorURL anchor) {
+        if(this.anchors.size() >= this.maxAnchors) {
+            this.maxAnchorsExceeded = true;
+        } else {
             this.anchors.add(anchor);
             this.fireAddAnchor(anchor.toNormalform(false));
         }
+    }
@@ -1095,7 +1129,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
 
     /**
-     * @return true when a limit on content size scraped has been exceeded
+     * @return true when the limit on content size scraped has been exceeded
      */
     public boolean isContentSizeLimitExceeded() {
         return this.contentSizeLimitExceeded;
@@ -1108,6 +1142,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         this.contentSizeLimitExceeded = contentSizeLimitExceeded;
     }
 
+    /**
+     * @return true when the maxAnchors limit has been exceeded
+     */
+    public boolean isMaxAnchorsExceeded() {
+        return this.maxAnchorsExceeded;
+    }
+
+    /**
+     * @return true when at least one limit on content size, anchors number or links number has been exceeded
+     */
+    public boolean isLimitsExceeded() {
+        return this.contentSizeLimitExceeded || this.maxAnchorsExceeded || this.css.isLimitExceeded()
+                || this.rss.isLimitExceeded() || this.embeds.isLimitExceeded() || this.metas.isLimitExceeded()
+                || this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded()
+                || this.frames.isLimitExceeded() || this.iframes.isLimitExceeded();
+    }
+
     /*
     DC in html example:
       <meta name="DC.title" lang="en" content="Expressing Dublin Core in HTML/XHTML meta and link elements" />

@@ -28,6 +28,8 @@ import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
@@ -36,13 +38,15 @@ import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.LinkedHashMap;
 
+import org.apache.commons.io.IOUtils;
+
+import com.ibm.icu.text.CharsetDetector;
+
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.util.CommonPattern;
-import net.yacy.cora.util.StreamLimitException;
-import net.yacy.cora.util.StrictLimitInputStream;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -51,14 +55,11 @@ import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.html.ScraperInputStream;
 import net.yacy.document.parser.html.TransformerWriter;
-import net.yacy.kelondro.util.FileUtils;
-
-import com.ibm.icu.text.CharsetDetector;
 
 public class htmlParser extends AbstractParser implements Parser {
 
-    /** The default maximum number of links to add to a parsed document */
+    /** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */
     private static final int DEFAULT_MAX_LINKS = 10000;
 
     public htmlParser() {
@@ -103,7 +104,7 @@ public class htmlParser extends AbstractParser implements Parser {
             final int timezoneOffset,
             final InputStream sourceStream) throws Parser.Failure, InterruptedException {
-        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
+        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
     }
 
     @Override
@@ -115,10 +116,16 @@ public class htmlParser extends AbstractParser implements Parser {
     public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
             final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
             throws Failure {
+        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
+    }
+
+    private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
+            final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
+            throws Failure {
         try {
             // first get a document from the parsed html
             Charset[] detectedcharsetcontainer = new Charset[]{null};
-            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, maxBytes);
+            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
             // parseToScraper also detects/corrects/sets charset from html content tag
             final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
             Document documentSnapshot = null;
@@ -127,10 +134,10 @@ public class htmlParser extends AbstractParser implements Parser {
             // and create a sub-document for snapshot page (which will be merged by loader)
             // TODO: as a crawl request removes anchor part from original url getRef() is never successful - considere other handling as removeRef() in crawler
             if (location.getRef() != null && location.getRef().startsWith("!")) {
-                documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
+                documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
             } else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
                 if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
-                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
+                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
                 }
             }
         } catch (Exception ex1) { // ignore any exception for any issue with snapshot
@@ -190,12 +197,12 @@ public class htmlParser extends AbstractParser implements Parser {
                 scraper.getDate());
         ppd.setScraperObject(scraper);
         ppd.setIcons(scraper.getIcons());
-        ppd.setPartiallyParsed(scraper.isContentSizeLimitExceeded());
+        ppd.setPartiallyParsed(scraper.isLimitsExceeded());
         return ppd;
     }
 
-    public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxLinks) throws IOException {
+    public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
         Charset[] detectedcharsetcontainer = new Charset[]{null};
         InputStream sourceStream;
         try {
@@ -205,7 +212,7 @@ public class htmlParser extends AbstractParser implements Parser {
         }
         ContentScraper scraper; // for this static methode no need to init local this.scraperObject
         try {
-            scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, Long.MAX_VALUE);
+            scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
         } catch (Failure e) {
             throw new IOException(e.getMessage());
         }
@@ -220,7 +227,8 @@ public class htmlParser extends AbstractParser implements Parser {
      * @param detectedcharsetcontainer a mutable array of Charsets : filled with the charset detected when parsing
      * @param timezoneOffset the local time zone offset
      * @param sourceStream an open stream on the resource to parse
-     * @param maxLinks the maximum number of links to store in the scraper
+     * @param maxAnchors the maximum number of URLs to process and store in the in the scraper's anchors property
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store in the scraper
      * @param maxBytes the maximum number of content bytes to process
      * @return a scraper containing parsed information
      * @throws Parser.Failure when an error occurred while parsing
@@ -233,13 +241,10 @@ public class htmlParser extends AbstractParser implements Parser {
             final Charset[] detectedcharsetcontainer,
             final int timezoneOffset,
             InputStream sourceStream,
+            final int maxAnchors,
             final int maxLinks,
             final long maxBytes) throws Parser.Failure, IOException {
 
-        if(maxBytes >= 0 && maxBytes < Long.MAX_VALUE) {
-            sourceStream = new StrictLimitInputStream(sourceStream, maxBytes);
-        }
-
         // make a scraper
         String charset = null;
@@ -286,22 +291,24 @@ public class htmlParser extends AbstractParser implements Parser {
         }
 
         // parsing the content
-        // for this static methode no need to init local this.scraperObject here
-        final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset);
+        // for this static method no need to init local this.scraperObject here
+        final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, vocabularyScraper, timezoneOffset);
         final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
         try {
-            FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
-        } catch(StreamLimitException e) {
-            /* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
-            scraper.setContentSizeLimitExceeded(true);
-        } catch (final IOException e) {
-            /* A StreamLimitException may be itself wrapped in an IOException by a InputStreamReader */
-            if(e.getCause() instanceof StreamLimitException) {
-                scraper.setContentSizeLimitExceeded(true);
-            } else {
-                throw new Parser.Failure("IO error:" + e.getMessage(), location);
-            }
+            final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
+            final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);
+            final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars);
+            if(copiedChars > maxChars) {
+                /* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */
+                scraper.setContentSizeLimitExceeded(true);
+            } else if(copiedChars == maxChars) {
+                /* Exactly maxChars limit reached : let's check if more to read remain. */
+                if(sourceReader.read() >= 0) {
+                    /* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
+                    scraper.setContentSizeLimitExceeded(true);
+                }
+            }
+        } catch (final IOException e) {
+            throw new Parser.Failure("IO error:" + e.getMessage(), location);
         } finally {
             writer.flush();
             //sourceStream.close(); keep open for multipe parsing (close done by caller)
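Illustration (not part of the commit): instead of wrapping the stream in a StrictLimitInputStream, the new code converts the byte budget into an approximate character budget via CharsetDecoder.averageCharsPerByte() and lets commons-io copy at most that many characters. A standalone sketch of the same idea follows, assuming commons-io is on the classpath; BoundedCopySketch is an illustrative name, not YaCy code.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;

public class BoundedCopySketch {

    public static void main(final String[] args) throws IOException {
        final Charset charset = StandardCharsets.UTF_8;
        final long maxBytes = 16;
        // translate the byte budget into an approximate character budget, as the parser now does
        final long maxChars = (long) (maxBytes * charset.newDecoder().averageCharsPerByte());

        final Reader source = new InputStreamReader(
                new ByteArrayInputStream("some html content longer than the budget".getBytes(charset)), charset);
        final StringWriter sink = new StringWriter();

        // copy at most maxChars characters; copyLarge stops once that length is reached
        final long copied = IOUtils.copyLarge(source, sink, 0, maxChars);

        // if exactly maxChars were copied and more input remains, the content was truncated
        final boolean truncated = copied == maxChars && source.read() >= 0;
        System.out.println("copied=" + copied + " truncated=" + truncated);
        System.out.println(sink.toString());
    }
}

Since averageCharsPerByte() is only an estimate, the character budget approximates maxBytes rather than enforcing it exactly, which is acceptable here because the limit acts as a safety cap rather than a strict contract.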
@@ -407,12 +414,13 @@ public class htmlParser extends AbstractParser implements Parser {
      * @param documentCharset
      * @param vocscraper
      * @param timezoneOffset
+     * @param maxAnchors the maximum number of URLs to process and store in the in the scraper's anchors property
      * @param maxLinks the maximum number of links to store in the document
      * @param maxBytes the maximum number of content bytes to process
      * @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
      */
     private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
-            final VocabularyScraper vocscraper, final int timezoneOffset, final int maxLinks, final long maxBytes) {
+            final VocabularyScraper vocscraper, final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
         Document documentSnapshot = null;
         try {
             // construct url for case (1) with anchor
@@ -431,7 +439,7 @@ public class htmlParser extends AbstractParser implements Parser {
             InputStream snapshotStream = null;
             try {
                 snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
-                ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks, maxBytes);
+                ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
                 documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
             } finally {
                 if(snapshotStream != null) {

@@ -1,14 +1,20 @@
 package net.yacy.document.parser;
 
+import static net.yacy.document.parser.htmlParser.parseToScraper;
+
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Locale;
 
+import org.junit.Test;
+
 import junit.framework.TestCase;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.document.Document;
@@ -16,8 +22,6 @@ import net.yacy.document.Parser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
-import static net.yacy.document.parser.htmlParser.parseToScraper;
-import org.junit.Test;
 
 public class htmlParserTest extends TestCase {
@@ -99,6 +103,117 @@ public class htmlParserTest extends TestCase {
        }
    }
/**
* Test the htmlParser.parseWithLimits() method with test content within bounds.
* @throws Exception when an unexpected error occurred
*/
@Test
public void testParseWithLimitsUnreached() throws Exception {
System.out.println("htmlParser.parse");
String[] testFiles = {
"umlaute_html_iso.html",
"umlaute_html_utf8.html",
"umlaute_html_namedentities.html"};
final String mimetype = "text/html";
//final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
htmlParser parser = new htmlParser();
for (final String testfile : testFiles) {
final String fileName = "test" + File.separator + "parsertest" + File.separator + testfile;
final File file = new File(fileName);
final AnchorURL url = new AnchorURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(file);) {
final Document[] docs = parser.parseWithLimits(url, mimetype, null, new VocabularyScraper(), 0, inStream, 1000, 10000);
final Document doc = docs[0];
assertNotNull("Parser result must not be null for file " + fileName, docs);
final String parsedText = doc.getTextString();
assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
parsedText.contains("Maßkrügen"));
assertEquals("Test anchor must have been parsed for file " + fileName, 1, doc.getAnchors().size());
assertFalse("Parsed document should not be marked as partially parsed for file " + fileName, doc.isPartiallyParsed());
}
}
}
/**
* Test the htmlParser.parseWithLimits() method, with various maxLinks values
* ranging from zero to the exact anchors number contained in the test content.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsOnAnchors() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final String charset = StandardCharsets.UTF_8.name();
final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body><p>");
testHtml.append("<a href=\"http://localhost/doc1.html\">First link</a>");
testHtml.append("<a href=\"http://localhost/doc2.html\">Second link</a>");
testHtml.append("<a href=\"http://localhost/doc3.html\">Third link</a>");
testHtml.append("</p></body></html>");
final htmlParser parser = new htmlParser();
for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
sourceStream, maxLinks, Long.MAX_VALUE);
final Document doc = docs[0];
assertEquals(maxLinks, doc.getAnchors().size());
assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
maxLinks < 3, doc.isPartiallyParsed());
}
}
}
/**
* Test the htmlParser.parseWithLimits() method, with various maxLinks values
* ranging from zero the exact RSS feed links number contained in the test
* content.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsOnRSSFeeds() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final String charset = StandardCharsets.UTF_8.name();
final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html>");
testHtml.append("<head>");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed1\" href=\"http://localhost/rss1.xml\" />");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed2\" href=\"http://localhost/rss2.xml\" />");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed3\" href=\"http://localhost/rss3.xml\" />");
testHtml.append("</head>");
testHtml.append("<body><p>HTML test content</p></body></html>");
final htmlParser parser = new htmlParser();
for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
sourceStream, maxLinks, Long.MAX_VALUE);
final Document doc = docs[0];
assertEquals(maxLinks, doc.getRSS().size());
assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
maxLinks < 3, doc.isPartiallyParsed());
}
}
}
    /**
     * Test of parseToScraper method, of class htmlParser.
     */
@@ -115,7 +230,7 @@ public class htmlParserTest extends TestCase {
                 + "<figure><img width=\"550px\" title=\"image as exemple\" alt=\"image as exemple\" src=\"./img/my_image.png\"></figrue>" // + img width 550 (+html5 figure)
                 + "</body></html>";
-        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
 
         List<AnchorURL> anchorlist = scraper.getAnchors();
         String linktxt = anchorlist.get(0).getTextProperty();
@@ -157,7 +272,7 @@ public class htmlParserTest extends TestCase {
         }
         testHtml.append("</p></body></html>");
 
-        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), 10);
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), Integer.MAX_VALUE, Integer.MAX_VALUE);
         assertEquals(nestingDepth, scraper.getAnchors().size());
         assertEquals(1, scraper.getImages().size());
@@ -178,7 +293,7 @@ public class htmlParserTest extends TestCase {
                 + "<p>" + textSource + "</p>"
                 + "</body></html>";
-        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
         String txt = scraper.getText();
         System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
@@ -207,7 +322,7 @@ public class htmlParserTest extends TestCase {
                 + "</head>\n"
                 + "<body>" + textSource + "</body>\n"
                 + "</html>";
-        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
         String txt = scraper.getText();
         System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]");

@@ -6,5 +6,6 @@
 <body>
 In M&uuml;nchen steht ein Hofbr&auml;uhaus.
 Dort gibt es Bier aus Ma&szlig;kr&uuml;gen.<br>
+<a href="http://localhost/umlaute_html_namedentities.html">Example link in HTML with named entities</a>
 </body>
 </html>
