diff --git a/source/net/yacy/cora/storage/SizeLimitedMap.java b/source/net/yacy/cora/storage/SizeLimitedMap.java
index 1e885213e..0d15d7a77 100644
--- a/source/net/yacy/cora/storage/SizeLimitedMap.java
+++ b/source/net/yacy/cora/storage/SizeLimitedMap.java
@@ -28,13 +28,28 @@ public class SizeLimitedMap<K, V> extends LinkedHashMap<K, V> implements Map<K, V> {
 
     final int sizeLimit;
 
+    /** Set to true when the size limit has been exceeded at least one time */
+    private boolean limitExceeded = false;
+
     @Override protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
-        return size() > this.sizeLimit;
+        boolean res = size() > this.sizeLimit;
+        if(res) {
+            this.limitExceeded = true;
+        }
+        return res;
     }
+
+    /**
+     * @return true when the size limit has been exceeded at least one time
+     */
+    public boolean isLimitExceeded() {
+        return this.limitExceeded;
+    }
 }
diff --git a/source/net/yacy/cora/storage/SizeLimitedSet.java b/source/net/yacy/cora/storage/SizeLimitedSet.java
index 110de87a6..ecb3701b9 100644
--- a/source/net/yacy/cora/storage/SizeLimitedSet.java
+++ b/source/net/yacy/cora/storage/SizeLimitedSet.java
@@ -71,6 +71,13 @@ public class SizeLimitedSet<E> extends AbstractSet<E> implements Set<E>, Cloneable {
     public void clear() {
         map.clear();
     }
+
+    /**
+     * @return true when the size limit has been exceeded at least one time
+     */
+    public boolean isLimitExceeded() {
+        return this.map.isLimitExceeded();
+    }
 
     @Override
     @SuppressWarnings("unchecked")
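Review note (not part of the patch): a minimal sketch of how the new isLimitExceeded() flag behaves, assuming the existing SizeLimitedMap(int sizeLimit) constructor. Once the limit has been hit, the flag stays set even though eviction keeps the map at its size limit:

import net.yacy.cora.storage.SizeLimitedMap;

public class SizeLimitedMapDemo {
    public static void main(final String[] args) {
        // evicts its eldest entry as soon as more than 2 entries are present
        final SizeLimitedMap<String, String> map = new SizeLimitedMap<>(2);
        map.put("a", "1");
        map.put("b", "2");
        System.out.println(map.isLimitExceeded()); // false : limit never exceeded
        map.put("c", "3"); // removeEldestEntry() now sees size() > sizeLimit
        System.out.println(map.isLimitExceeded()); // true, and stays true from now on
    }
}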
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 7f36b52a5..f99678885 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -187,12 +187,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
     // class variables: collectors for links
     private final List<AnchorURL> anchors;
-    private final LinkedHashMap<DigestURL, String> rss, css;
-    private final LinkedHashMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
+    private final SizeLimitedMap<DigestURL, String> rss, css;
+    private final SizeLimitedMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
     private final List<ImageEntry> images;
-    private final Set<AnchorURL> script, frames, iframes;
-    private final Map<String, String> metas;
-    private final Map<String, DigestURL> hreflang, navigation;
+    private final SizeLimitedSet<AnchorURL> script, frames, iframes;
+    private final SizeLimitedMap<String, String> metas;
+    private final SizeLimitedMap<String, DigestURL> hreflang, navigation;
     private LinkedHashSet<String> titles;
     private final List<String> articles;
     private final List<Date> startDates, endDates;
@@ -204,7 +204,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private final EventListenerList htmlFilterEventListeners;
     private double lon, lat;
     private AnchorURL canonical, publisher;
-    private final int maxLinks;
+
+    /** The maximum number of URLs to process and store in the anchors property. */
+    private final int maxAnchors;
+
     private final VocabularyScraper vocabularyScraper;
     private final int timezoneOffset;
     private int breadcrumbs;
@@ -226,21 +229,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     /** Set to true when a limit on content size scraped has been exceeded */
     private boolean contentSizeLimitExceeded;
 
+    /** Set to true when the maxAnchors limit has been exceeded */
+    private boolean maxAnchorsExceeded;
+
     /**
-     * scrape a document
+     * Create a ContentScraper instance
      * @param root the document root url
-     * @param maxLinks the maximum number of links to scrape
+     * @param maxAnchors the maximum number of URLs to process and store in the anchors property
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
      * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
      * @param timezoneOffset local time zone offset
      */
     @SuppressWarnings("unchecked")
-    public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
         // the root value here will not be used to load the resource.
         // it is only the reference for relative links
         super(linkTags0, linkTags1);
         assert root != null;
         this.root = root;
-        this.maxLinks = maxLinks;
         this.vocabularyScraper = vocabularyScraper;
         this.timezoneOffset = timezoneOffset;
         this.evaluationScores = new Evaluation();
@@ -277,6 +283,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         this.publisher = null;
         this.breadcrumbs = 0;
         this.contentSizeLimitExceeded = false;
+        this.maxAnchorsExceeded = false;
+        this.maxAnchors = maxAnchors;
+    }
+
+    /**
+     * Create a ContentScraper instance
+     * @param root the document root url
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
+     * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
+     * @param timezoneOffset local time zone offset
+     */
+    public ContentScraper(final DigestURL root, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+        this(root, Integer.MAX_VALUE, maxLinks, vocabularyScraper, timezoneOffset);
     }
 
     @Override
@@ -366,7 +385,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             }
         }
 
-        findAbsoluteURLs(b, this.anchors, anchorListeners);
+        if(!this.maxAnchorsExceeded) {
+            int maxLinksToDetect = this.maxAnchors - this.anchors.size();
+            if(maxLinksToDetect < Integer.MAX_VALUE) {
+                /* Add one to the anchors limit to detect when the limit is exceeded */
+                maxLinksToDetect++;
+            }
+            findAbsoluteURLs(b, this.anchors, anchorListeners, maxLinksToDetect);
+            if(this.anchors.size() > this.maxAnchors) {
+                this.maxAnchorsExceeded = true;
+                this.anchors.remove(this.anchors.size() - 1);
+            }
+        }
 
         // append string to content
         if (!b.isEmpty()) {
@@ -890,8 +920,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
      * @param anchor anchor to add. Must not be null.
      */
     protected void addAnchor(AnchorURL anchor) {
-        this.anchors.add(anchor);
-        this.fireAddAnchor(anchor.toNormalform(false));
+        if(this.anchors.size() >= this.maxAnchors) {
+            this.maxAnchorsExceeded = true;
+        } else {
+            this.anchors.add(anchor);
+            this.fireAddAnchor(anchor.toNormalform(false));
+        }
     }
 
 
@@ -1095,7 +1129,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
 
     /**
-     * @return true when a limit on content size scraped has been exceeded
+     * @return true when the limit on content size scraped has been exceeded
      */
     public boolean isContentSizeLimitExceeded() {
         return this.contentSizeLimitExceeded;
@@ -1108,6 +1142,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         this.contentSizeLimitExceeded = contentSizeLimitExceeded;
     }
 
+    /**
+     * @return true when the maxAnchors limit has been exceeded
+     */
+    public boolean isMaxAnchorsExceeded() {
+        return this.maxAnchorsExceeded;
+    }
+
+    /**
+     * @return true when at least one limit on content size, anchors number or links number has been exceeded
+     */
+    public boolean isLimitsExceeded() {
+        return this.contentSizeLimitExceeded || this.maxAnchorsExceeded || this.css.isLimitExceeded()
+                || this.rss.isLimitExceeded() || this.embeds.isLimitExceeded() || this.metas.isLimitExceeded()
+                || this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded()
+                || this.frames.isLimitExceeded() || this.iframes.isLimitExceeded();
+    }
+
     /*
      DC in html example:
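Review note (not part of the patch): a hedged sketch of the caller-side contract introduced above; the root URL and the 100/10 limits are illustrative only, and a real caller would still feed the scraper through a TransformerWriter as htmlParser does below.

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;

public class ScraperLimitsDemo {
    public static void main(final String[] args) throws Exception {
        final DigestURL root = new DigestURL("http://localhost/index.html");
        // store at most 100 anchors, and at most 10 links of each other kind
        final ContentScraper scraper = new ContentScraper(root, 100, 10, new VocabularyScraper(), 0);

        // ... here the scraper would consume HTML through a TransformerWriter ...

        if (scraper.isMaxAnchorsExceeded()) {
            System.out.println("anchor list was truncated at 100 entries");
        }
        if (scraper.isLimitsExceeded()) {
            System.out.println("some limit was hit : the document should be flagged as partially parsed");
        }
    }
}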
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index ab4a7fa25..f06cf461e 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -28,6 +28,8 @@ import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
@@ -36,13 +38,15 @@ import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.LinkedHashMap;
 
+import org.apache.commons.io.IOUtils;
+
+import com.ibm.icu.text.CharsetDetector;
+
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.util.CommonPattern;
-import net.yacy.cora.util.StreamLimitException;
-import net.yacy.cora.util.StrictLimitInputStream;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -51,14 +55,11 @@ import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.html.ScraperInputStream;
 import net.yacy.document.parser.html.TransformerWriter;
-import net.yacy.kelondro.util.FileUtils;
-
-import com.ibm.icu.text.CharsetDetector;
 
 public class htmlParser extends AbstractParser implements Parser {
 
-    /** The default maximum number of links to add to a parsed document */
+    /** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */
    private static final int DEFAULT_MAX_LINKS = 10000;
 
     public htmlParser() {
@@ -103,7 +104,7 @@ public class htmlParser extends AbstractParser implements Parser {
             final int timezoneOffset,
             final InputStream sourceStream) throws Parser.Failure, InterruptedException {
 
-        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
+        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
     }
 
     @Override
@@ -115,10 +116,16 @@ public class htmlParser extends AbstractParser implements Parser {
     public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
             final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
             throws Failure {
+        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
+    }
+
+    private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
+            final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
+            throws Failure {
         try {
             // first get a document from the parsed html
             Charset[] detectedcharsetcontainer = new Charset[]{null};
-            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, maxBytes);
+            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
             // parseToScraper also detects/corrects/sets charset from html content tag
             final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
             Document documentSnapshot = null;
@@ -127,10 +134,10 @@ public class htmlParser extends AbstractParser implements Parser {
                 // and create a sub-document for snapshot page (which will be merged by loader)
                 // TODO: as a crawl request removes anchor part from original url getRef() is never successful - considere other handling as removeRef() in crawler
                 if (location.getRef() != null && location.getRef().startsWith("!")) {
-                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
+                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
                 } else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
                     if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
-                        documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
+                        documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
                     }
                 }
             } catch (Exception ex1) { // ignore any exception for any issue with snapshot
@@ -190,12 +197,12 @@ public class htmlParser extends AbstractParser implements Parser {
                 scraper.getDate());
         ppd.setScraperObject(scraper);
         ppd.setIcons(scraper.getIcons());
-        ppd.setPartiallyParsed(scraper.isContentSizeLimitExceeded());
+        ppd.setPartiallyParsed(scraper.isLimitsExceeded());
 
         return ppd;
     }
 
-    public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxLinks) throws IOException {
+    public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
         Charset[] detectedcharsetcontainer = new Charset[]{null};
         InputStream sourceStream;
         try {
             sourceStream = new ByteArrayInputStream(input.getBytes(documentCharset));
         } catch (UnsupportedEncodingException e) {
             sourceStream = new ByteArrayInputStream(UTF8.getBytes(input));
         }
         ContentScraper scraper; // for this static methode no need to init local this.scraperObject
         try {
-            scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, Long.MAX_VALUE);
+            scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
         } catch (Failure e) {
             throw new IOException(e.getMessage());
         }
@@ -220,7 +227,8 @@ public class htmlParser extends AbstractParser implements Parser {
      * @param detectedcharsetcontainer a mutable array of Charsets : filled with the charset detected when parsing
      * @param timezoneOffset the local time zone offset
      * @param sourceStream an open stream on the resource to parse
-     * @param maxLinks the maximum number of links to store in the scraper
+     * @param maxAnchors the maximum number of URLs to process and store in the scraper's anchors property
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store in the scraper
      * @param maxBytes the maximum number of content bytes to process
      * @return a scraper containing parsed information
      * @throws Parser.Failure when an error occurred while parsing
@@ -233,13 +241,10 @@ public class htmlParser extends AbstractParser implements Parser {
             final Charset[] detectedcharsetcontainer,
             final int timezoneOffset,
             InputStream sourceStream,
+            final int maxAnchors,
             final int maxLinks,
             final long maxBytes) throws Parser.Failure, IOException {
 
-        if(maxBytes >= 0 && maxBytes < Long.MAX_VALUE) {
-            sourceStream = new StrictLimitInputStream(sourceStream, maxBytes);
-        }
-
         // make a scraper
         String charset = null;
 
@@ -286,22 +291,24 @@ public class htmlParser extends AbstractParser implements Parser {
         }
 
         // parsing the content
-        // for this static methode no need to init local this.scraperObject here
-        final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset);
+        // for this static method no need to init local this.scraperObject here
+        final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, vocabularyScraper, timezoneOffset);
         final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
         try {
-            FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
-        } catch(StreamLimitException e) {
-            /* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
-            scraper.setContentSizeLimitExceeded(true);
-        } catch (final IOException e) {
-            /* A StreamLimitException may be itself wrapped in an IOException by a InputStreamReader */
-            if(e.getCause() instanceof StreamLimitException) {
-                /* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
+            final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
+            final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);
+            final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars);
+            if(copiedChars > maxChars) {
+                /* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */
                 scraper.setContentSizeLimitExceeded(true);
-            } else {
-                throw new Parser.Failure("IO error:" + e.getMessage(), location);
-            }
+            } else if(copiedChars == maxChars) {
+                /* Exactly maxChars limit reached : let's check if more to read remain. */
+                if(sourceReader.read() >= 0) {
+                    scraper.setContentSizeLimitExceeded(true);
+                }
+            }
+        } catch (final IOException e) {
+            throw new Parser.Failure("IO error:" + e.getMessage(), location);
         } finally {
             writer.flush();
             //sourceStream.close(); keep open for multipe parsing (close done by caller)
@@ -407,12 +414,13 @@ public class htmlParser extends AbstractParser implements Parser {
      * @param documentCharset
      * @param vocscraper
      * @param timezoneOffset
+     * @param maxAnchors the maximum number of URLs to process and store in the scraper's anchors property
      * @param maxLinks the maximum number of links to store in the document
      * @param maxBytes the maximum number of content bytes to process
      * @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
      */
     private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
-            final VocabularyScraper vocscraper, final int timezoneOffset, final int maxLinks, final long maxBytes) {
+            final VocabularyScraper vocscraper, final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
         Document documentSnapshot = null;
         try {
             // construct url for case (1) with anchor
@@ -431,7 +439,7 @@ public class htmlParser extends AbstractParser implements Parser {
             InputStream snapshotStream = null;
             try {
                 snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
-                ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks, maxBytes);
+                ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
                 documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
             } finally {
                 if(snapshotStream != null) {
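Review note (not part of the patch): the byte limit is now approximated at the character level, using the detected charset's average chars-per-byte ratio. A standalone sketch of the same technique, assuming only commons-io; the sample text and the 16-byte limit are illustrative:

import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;

public class CharLimitCopyDemo {
    public static void main(final String[] args) throws Exception {
        final long maxBytes = 16;
        // estimate how many chars fit into maxBytes for the given charset (ratio is 1.0 for UTF-8)
        final long maxChars = (long) (maxBytes * StandardCharsets.UTF_8.newDecoder().averageCharsPerByte());

        final Reader source = new StringReader("This content is longer than the limit");
        final StringWriter target = new StringWriter();
        // copyLarge(reader, writer, offset, length) copies at most maxChars characters
        final long copiedChars = IOUtils.copyLarge(source, target, 0, maxChars);

        if (copiedChars == maxChars && source.read() >= 0) {
            // one more readable char means the source was truncated at the limit
            System.out.println("limit exceeded, partial content : " + target);
        }
    }
}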
diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java
index 432dfd0e5..8d0f1a4f9 100644
--- a/test/java/net/yacy/document/parser/htmlParserTest.java
+++ b/test/java/net/yacy/document/parser/htmlParserTest.java
@@ -1,14 +1,20 @@
 package net.yacy.document.parser;
 
+import static net.yacy.document.parser.htmlParser.parseToScraper;
+
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Locale;
 
+import org.junit.Test;
+
 import junit.framework.TestCase;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.document.Document;
@@ -16,8 +22,6 @@ import net.yacy.document.Parser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
-import static net.yacy.document.parser.htmlParser.parseToScraper;
-import org.junit.Test;
 
 public class htmlParserTest extends TestCase {
 
@@ -98,7 +102,118 @@ public class htmlParserTest extends TestCase {
         }
     }
 
+    /**
+     * Test the htmlParser.parseWithLimits() method with test content within bounds.
+     * @throws Exception when an unexpected error occurred
+     */
+    @Test
+    public void testParseWithLimitsUnreached() throws Exception {
+        System.out.println("htmlParser.parse");
+
+        String[] testFiles = {
+                "umlaute_html_iso.html",
+                "umlaute_html_utf8.html",
+                "umlaute_html_namedentities.html"};
+
+        final String mimetype = "text/html";
+        //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
+
+        htmlParser parser = new htmlParser();
+        for (final String testfile : testFiles) {
+            final String fileName = "test" + File.separator + "parsertest" + File.separator + testfile;
+            final File file = new File(fileName);
+
+            final AnchorURL url = new AnchorURL("http://localhost/" + fileName);
+
+            try (final FileInputStream inStream = new FileInputStream(file);) {
+
+                final Document[] docs = parser.parseWithLimits(url, mimetype, null, new VocabularyScraper(), 0, inStream, 1000, 10000);
+                final Document doc = docs[0];
+                assertNotNull("Parser result must not be null for file " + fileName, docs);
+                final String parsedText = doc.getTextString();
+                assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                        parsedText.contains("Maßkrügen"));
+                assertEquals("Test anchor must have been parsed for file " + fileName, 1, doc.getAnchors().size());
+                assertFalse("Parsed document should not be marked as partially parsed for file " + fileName, doc.isPartiallyParsed());
+
+            }
+        }
+    }
+
+    /**
+     * Test the htmlParser.parseWithLimits() method, with various maxLinks values
+     * ranging from zero to the exact anchors number contained in the test content.
+     *
+     * @throws Exception
+     *             when an unexpected error occurred
+     */
+    @Test
+    public void testParseWithLimitsOnAnchors() throws Exception {
+        final AnchorURL url = new AnchorURL("http://localhost/test.html");
+        final String mimetype = "text/html";
+        final String charset = StandardCharsets.UTF_8.name();
+        final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body>");
+        testHtml.append("<a href=\"http://localhost/doc1.html\">First link</a>");
+        testHtml.append("<a href=\"http://localhost/doc2.html\">Second link</a>");
+        testHtml.append("<a href=\"http://localhost/doc3.html\">Third link</a>");
+        testHtml.append("</body></html>");
+        final htmlParser parser = new htmlParser();
+
+        for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
+            try (InputStream sourceStream = new ByteArrayInputStream(
+                    testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
+                final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
+                        sourceStream, maxLinks, Long.MAX_VALUE);
+                final Document doc = docs[0];
+                assertEquals(maxLinks, doc.getAnchors().size());
+                assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
+                        maxLinks < 3, doc.isPartiallyParsed());
+            }
+        }
+    }
+
+    /**
+     * Test the htmlParser.parseWithLimits() method, with various maxLinks values
+     * ranging from zero to the exact RSS feed links number contained in the test
+     * content.
+     *
+     * @throws Exception
+     *             when an unexpected error occurred
+     */
+    @Test
+    public void testParseWithLimitsOnRSSFeeds() throws Exception {
+        final AnchorURL url = new AnchorURL("http://localhost/test.html");
+        final String mimetype = "text/html";
+        final String charset = StandardCharsets.UTF_8.name();
+        final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html>");
+        testHtml.append("<head>");
+        testHtml.append(
+                "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"First feed\" href=\"http://localhost/rss1.xml\" />");
+        testHtml.append(
+                "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Second feed\" href=\"http://localhost/rss2.xml\" />");
+        testHtml.append(
+                "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Third feed\" href=\"http://localhost/rss3.xml\" />");
+        testHtml.append("</head>");
+        testHtml.append("<body>HTML test content</body></html>");
+
+        final htmlParser parser = new htmlParser();
+
+        for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
+            try (InputStream sourceStream = new ByteArrayInputStream(
+                    testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
+                final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
+                        sourceStream, maxLinks, Long.MAX_VALUE);
+                final Document doc = docs[0];
+                assertEquals(maxLinks, doc.getRSS().size());
+                assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
+                        maxLinks < 3, doc.isPartiallyParsed());
+            }
+        }
+    }
+
     /**
      * Test of parseToScraper method, of class htmlParser.
      */
@@ -115,7 +230,7 @@ public class htmlParserTest extends TestCase {
\"image" // + img width 550 (+html5 figure) + ""; - ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10); + ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10); List anchorlist = scraper.getAnchors(); String linktxt = anchorlist.get(0).getTextProperty(); @@ -157,7 +272,7 @@ public class htmlParserTest extends TestCase { } testHtml.append("

"); - ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), 10); + ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), Integer.MAX_VALUE, Integer.MAX_VALUE); assertEquals(nestingDepth, scraper.getAnchors().size()); assertEquals(1, scraper.getImages().size()); @@ -178,7 +293,7 @@ public class htmlParserTest extends TestCase { + "

" + textSource + "

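Review note (not part of the patch): the new tests cover the anchors and RSS limits; a similar hypothetical test could exercise the maxBytes limit, reusing the imports already present in htmlParserTest. The 3-byte limit and sample HTML are illustrative.

    /**
     * Hypothetical sketch : parseWithLimits() should flag the document as
     * partially parsed when maxBytes is smaller than the content.
     */
    @Test
    public void testParseWithLimitsOnBytes() throws Exception {
        final AnchorURL url = new AnchorURL("http://localhost/test.html");
        final String testHtml = "<html><body><p>Some text longer than the limit</p></body></html>";
        final htmlParser parser = new htmlParser();
        try (InputStream sourceStream = new ByteArrayInputStream(testHtml.getBytes(StandardCharsets.UTF_8))) {
            final Document[] docs = parser.parseWithLimits(url, "text/html", StandardCharsets.UTF_8.name(),
                    new VocabularyScraper(), 0, sourceStream, Integer.MAX_VALUE, 3);
            assertTrue("The document should be marked as partially parsed when maxBytes is exceeded",
                    docs[0].isPartiallyParsed());
        }
    }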
" + ""; - ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10); + ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10); String txt = scraper.getText(); System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]"); @@ -207,7 +322,7 @@ public class htmlParserTest extends TestCase { + "\n" + "" + textSource + "\n" + ""; - ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10); + ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10); String txt = scraper.getText(); System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]"); diff --git a/test/parsertest/umlaute_html_namedentities.html b/test/parsertest/umlaute_html_namedentities.html index 02f91f84a..2f9f8b9cc 100644 --- a/test/parsertest/umlaute_html_namedentities.html +++ b/test/parsertest/umlaute_html_namedentities.html @@ -6,5 +6,6 @@ In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.
+<a href="http://localhost/umlaute_html_namedentities.html">Example link in HTML with named entities</a>
 </body>
 </html>
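Review note (not part of the patch): end to end, the public API keeps its signature for regular callers, and crawl code can now rely on isPartiallyParsed() to tell whether any limit was hit. A hedged usage sketch; the URL, sample HTML and limits are illustrative.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;

public class ParseWithLimitsDemo {
    public static void main(final String[] args) throws Exception {
        final String html = "<html><body><a href=\"http://localhost/a.html\">a</a></body></html>";
        final htmlParser parser = new htmlParser();
        try (InputStream in = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))) {
            // maxLinks = 10 and maxBytes = 1024 : both well above what the sample needs
            final Document[] docs = parser.parseWithLimits(new AnchorURL("http://localhost/"),
                    "text/html", StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, in, 10, 1024);
            System.out.println(docs[0].isPartiallyParsed()); // false : no limit was hit
        }
    }
}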