Finer control over the maximum number of links to parse in the HTML parser.

pull/93/merge
luccioman 8 years ago
parent 4743a104b5
commit 169ffdd1c7
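
For orientation, a minimal caller-side sketch of the extended parseToScraper signature introduced below, which adds a maxAnchors limit alongside the existing maxLinks limit. The URL, content and limit values are illustrative only and not part of this commit; exception handling is omitted.

import java.nio.charset.StandardCharsets;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import static net.yacy.document.parser.htmlParser.parseToScraper;

// Hypothetical limits: at most 100 anchors, and a cap of 50 on the other link collections.
final AnchorURL location = new AnchorURL("http://localhost/page.html");
final String html = "<html><body><a href=\"http://localhost/a.html\">a</a></body></html>";
final ContentScraper scraper = parseToScraper(location, StandardCharsets.UTF_8.name(),
        new VocabularyScraper(), 0 /* timezoneOffset */, html, 100 /* maxAnchors */, 50 /* maxLinks */);
if (scraper.isLimitsExceeded()) {
    // at least one configured limit was reached: the results are usable but partial
}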

@ -28,13 +28,28 @@ public class SizeLimitedMap<K, V> extends LinkedHashMap<K, V> implements Map<K,
private static final long serialVersionUID = 6088727126150060068L;
final int sizeLimit;
private final int sizeLimit;
/** Set to true when at least one eldest entry has been removed because the map size exceeded the size limit. */
private boolean limitExceeded;
public SizeLimitedMap(int sizeLimit) {
this.sizeLimit = sizeLimit;
this.limitExceeded = false;
}
@Override protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
return size() > this.sizeLimit;
boolean res = size() > this.sizeLimit;
if(res) {
this.limitExceeded = true;
}
return res;
}
/**
* @return true when the size limit has been exceeded at least one time
*/
public boolean isLimitExceeded() {
return this.limitExceeded;
}
}
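
A minimal usage sketch of the new flag (the size limit and entries are illustrative):

final SizeLimitedMap<String, String> map = new SizeLimitedMap<>(2);
map.put("a", "1");
map.put("b", "2");
map.put("c", "3"); // size() becomes 3 > sizeLimit: the eldest entry "a" is evicted and the flag is raised
// map now contains {b=2, c=3}, and map.isLimitExceeded() returns true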

@ -71,6 +71,13 @@ public class SizeLimitedSet<E> extends AbstractSet<E> implements Set<E>, Cloneab
public void clear() {
map.clear();
}
/**
* @return true when the size limit has been exceeded at least one time
*/
public boolean isLimitExceeded() {
return this.map.isLimitExceeded();
}
@Override
@SuppressWarnings("unchecked")

@ -187,12 +187,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// class variables: collectors for links
private final List<AnchorURL> anchors;
private final LinkedHashMap<DigestURL, String> rss, css;
private final LinkedHashMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
private final SizeLimitedMap<DigestURL, String> rss, css;
private final SizeLimitedMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
private final List<ImageEntry> images;
private final Set<AnchorURL> script, frames, iframes;
private final Map<String, String> metas;
private final Map<String, DigestURL> hreflang, navigation;
private final SizeLimitedSet<AnchorURL> script, frames, iframes;
private final SizeLimitedMap<String, String> metas;
private final SizeLimitedMap<String, DigestURL> hreflang, navigation;
private LinkedHashSet<String> titles;
private final List<String> articles;
private final List<Date> startDates, endDates;
@ -204,7 +204,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final EventListenerList htmlFilterEventListeners;
private double lon, lat;
private AnchorURL canonical, publisher;
private final int maxLinks;
/** The maximum number of URLs to process and store in the anchors property. */
private final int maxAnchors;
private final VocabularyScraper vocabularyScraper;
private final int timezoneOffset;
private int breadcrumbs;
@ -226,21 +229,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/** Set to true when a limit on content size scraped has been exceeded */
private boolean contentSizeLimitExceeded;
/** Set to true when the maxAnchors limit has been exceeded */
private boolean maxAnchorsExceeded;
/**
* scrape a document
* Create a ContentScraper instance
* @param root the document root url
* @param maxLinks the maximum number of links to scrape
* @param maxAnchors the maximum number of URLs to process and store in the anchors property.
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
@SuppressWarnings("unchecked")
public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.maxLinks = maxLinks;
this.vocabularyScraper = vocabularyScraper;
this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation();
@ -277,6 +283,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.publisher = null;
this.breadcrumbs = 0;
this.contentSizeLimitExceeded = false;
this.maxAnchorsExceeded = false;
this.maxAnchors = maxAnchors;
}
/**
* Create a ContentScraper instance
* @param root the document root url
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
public ContentScraper(final DigestURL root, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
this(root, Integer.MAX_VALUE, maxLinks, vocabularyScraper, timezoneOffset);
}
@Override
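
To make the distinction between the two limits of the new constructor concrete, a hedged sketch with hypothetical values (exception handling omitted); maxAnchors bounds the URLs stored in the anchors property, while maxLinks bounds the other SizeLimited link collections declared above.

final ContentScraper scraper = new ContentScraper(
        new DigestURL("http://localhost/"), // reference for relative links only
        100,                                // maxAnchors (hypothetical value)
        50,                                 // maxLinks (hypothetical value)
        new VocabularyScraper(),
        0);                                 // timezoneOffset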
@ -366,7 +385,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
findAbsoluteURLs(b, this.anchors, anchorListeners);
if(!this.maxAnchorsExceeded) {
int maxLinksToDetect = this.maxAnchors - this.anchors.size();
if(maxLinksToDetect < Integer.MAX_VALUE) {
/* Add one to the anchors limit to detect when the limit is exceeded */
maxLinksToDetect++;
}
findAbsoluteURLs(b, this.anchors, anchorListeners, maxLinksToDetect);
if(this.anchors.size() > this.maxAnchors) {
this.maxAnchorsExceeded = true;
this.anchors.remove(this.anchors.size() -1);
}
}
// append string to content
if (!b.isEmpty()) {
@ -890,8 +920,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param anchor anchor to add. Must not be null.
*/
protected void addAnchor(AnchorURL anchor) {
this.anchors.add(anchor);
this.fireAddAnchor(anchor.toNormalform(false));
if(this.anchors.size() >= this.maxAnchors) {
this.maxAnchorsExceeded = true;
} else {
this.anchors.add(anchor);
this.fireAddAnchor(anchor.toNormalform(false));
}
}
@ -1095,7 +1129,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
/**
* @return true when a limit on content size scraped has been exceeded
* @return true when the limit on content size scraped has been exceeded
*/
public boolean isContentSizeLimitExceeded() {
return this.contentSizeLimitExceeded;
@ -1108,6 +1142,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.contentSizeLimitExceeded = contentSizeLimitExceeded;
}
/**
* @return true when the maxAnchors limit has been exceeded
*/
public boolean isMaxAnchorsExceeded() {
return this.maxAnchorsExceeded;
}
/**
* @return true when at least one limit on content size, number of anchors, or number of links has been exceeded
*/
public boolean isLimitsExceeded() {
return this.contentSizeLimitExceeded || this.maxAnchorsExceeded || this.css.isLimitExceeded()
|| this.rss.isLimitExceeded() || this.embeds.isLimitExceeded() || this.metas.isLimitExceeded()
|| this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded()
|| this.frames.isLimitExceeded() || this.iframes.isLimitExceeded();
}
/*
DC in html example:
<meta name="DC.title" lang="en" content="Expressing Dublin Core in HTML/XHTML meta and link elements" />

@ -28,6 +28,8 @@ import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
@ -36,13 +38,15 @@ import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.LinkedHashMap;
import org.apache.commons.io.IOUtils;
import com.ibm.icu.text.CharsetDetector;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.StreamLimitException;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -51,14 +55,11 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
/** The default maximum number of links to add to a parsed document */
/** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */
private static final int DEFAULT_MAX_LINKS = 10000;
public htmlParser() {
@ -103,7 +104,7 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
}
@Override
@ -115,10 +116,16 @@ public class htmlParser extends AbstractParser implements Parser {
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Failure {
return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
}
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, maxBytes);
ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -127,10 +134,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes the anchor part from the original url, getRef() is never successful - consider other handling such as removeRef() in crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that the existence of a hashfragment anchor takes precedence (meaning both are allowed)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@ -190,12 +197,12 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getDate());
ppd.setScraperObject(scraper);
ppd.setIcons(scraper.getIcons());
ppd.setPartiallyParsed(scraper.isContentSizeLimitExceeded());
ppd.setPartiallyParsed(scraper.isLimitsExceeded());
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxLinks) throws IOException {
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -205,7 +212,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static method no need to init local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, Long.MAX_VALUE);
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -220,7 +227,8 @@ public class htmlParser extends AbstractParser implements Parser {
* @param detectedcharsetcontainer a mutable array of Charsets : filled with the charset detected when parsing
* @param timezoneOffset the local time zone offset
* @param sourceStream an open stream on the resource to parse
* @param maxLinks the maximum number of links to store in the scraper
* @param maxAnchors the maximum number of URLs to process and store in the scraper's anchors property
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store in the scraper
* @param maxBytes the maximum number of content bytes to process
* @return a scraper containing parsed information
* @throws Parser.Failure when an error occurred while parsing
@ -233,13 +241,10 @@ public class htmlParser extends AbstractParser implements Parser {
final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
InputStream sourceStream,
final int maxAnchors,
final int maxLinks,
final long maxBytes) throws Parser.Failure, IOException {
if(maxBytes >= 0 && maxBytes < Long.MAX_VALUE) {
sourceStream = new StrictLimitInputStream(sourceStream, maxBytes);
}
// make a scraper
String charset = null;
@ -286,22 +291,24 @@ public class htmlParser extends AbstractParser implements Parser {
}
// parsing the content
// for this static methode no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset);
// for this static method no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, vocabularyScraper, timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
} catch(StreamLimitException e) {
/* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
scraper.setContentSizeLimitExceeded(true);
} catch (final IOException e) {
/* A StreamLimitException may be itself wrapped in an IOException by a InputStreamReader */
if(e.getCause() instanceof StreamLimitException) {
/* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);
final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars);
if(copiedChars > maxChars) {
/* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */
scraper.setContentSizeLimitExceeded(true);
} else {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
}
} else if(copiedChars == maxChars) {
/* Exactly maxChars limit reached : let's check whether more input remains to be read. */
if(sourceReader.read() >= 0) {
scraper.setContentSizeLimitExceeded(true);
}
}
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
writer.flush();
//sourceStream.close(); keep open for multiple parsing (close done by caller)
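
For reference, a self-contained sketch of the character-budget copy pattern used above, assuming Apache commons-io on the classpath; the byte budget and input are illustrative and exception handling is omitted.

import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;

final long maxBytes = 1024;
final Reader reader = new InputStreamReader(
        new ByteArrayInputStream(new byte[2048]), StandardCharsets.UTF_8);
final StringWriter writer = new StringWriter();
// translate the byte budget into an approximate character budget
final long maxChars = (long) (maxBytes * StandardCharsets.UTF_8.newDecoder().averageCharsPerByte());
final long copied = IOUtils.copyLarge(reader, writer, 0, maxChars);
if (copied == maxChars && reader.read() >= 0) {
    // input remained after the character budget was spent: mark the result as partial
}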
@ -407,12 +414,13 @@ public class htmlParser extends AbstractParser implements Parser {
* @param documentCharset
* @param vocscraper
* @param timezoneOffset
* @param maxAnchors the maximum number of URLs to process and store in the scraper's anchors property
* @param maxLinks the maximum number of links to store in the document
* @param maxBytes the maximum number of content bytes to process
* @return the document resulting from the parsed snapshot, or null if it does not exist or on any other issue with the snapshot
*/
private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
final VocabularyScraper vocscraper, final int timezoneOffset, final int maxLinks, final long maxBytes) {
final VocabularyScraper vocscraper, final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
// construct url for case (1) with anchor
@ -431,7 +439,7 @@ public class htmlParser extends AbstractParser implements Parser {
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks, maxBytes);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {

@ -1,14 +1,20 @@
package net.yacy.document.parser;
import static net.yacy.document.parser.htmlParser.parseToScraper;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;
import org.junit.Test;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
@ -16,8 +22,6 @@ import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import static net.yacy.document.parser.htmlParser.parseToScraper;
import org.junit.Test;
public class htmlParserTest extends TestCase {
@ -98,7 +102,118 @@ public class htmlParserTest extends TestCase {
}
}
/**
* Test the htmlParser.parseWithLimits() method with test content within bounds.
* @throws Exception when an unexpected error occurred
*/
@Test
public void testParseWithLimitsUnreached() throws Exception {
System.out.println("htmlParser.parse");
String[] testFiles = {
"umlaute_html_iso.html",
"umlaute_html_utf8.html",
"umlaute_html_namedentities.html"};
final String mimetype = "text/html";
//final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
htmlParser parser = new htmlParser();
for (final String testfile : testFiles) {
final String fileName = "test" + File.separator + "parsertest" + File.separator + testfile;
final File file = new File(fileName);
final AnchorURL url = new AnchorURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(file);) {
final Document[] docs = parser.parseWithLimits(url, mimetype, null, new VocabularyScraper(), 0, inStream, 1000, 10000);
final Document doc = docs[0];
assertNotNull("Parser result must not be null for file " + fileName, docs);
final String parsedText = doc.getTextString();
assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
parsedText.contains("Maßkrügen"));
assertEquals("Test anchor must have been parsed for file " + fileName, 1, doc.getAnchors().size());
assertFalse("Parsed document should not be marked as partially parsed for file " + fileName, doc.isPartiallyParsed());
}
}
}
/**
* Test the htmlParser.parseWithLimits() method, with various maxLinks values
* ranging from zero to the exact number of anchors contained in the test content.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsOnAnchors() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final String charset = StandardCharsets.UTF_8.name();
final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body><p>");
testHtml.append("<a href=\"http://localhost/doc1.html\">First link</a>");
testHtml.append("<a href=\"http://localhost/doc2.html\">Second link</a>");
testHtml.append("<a href=\"http://localhost/doc3.html\">Third link</a>");
testHtml.append("</p></body></html>");
final htmlParser parser = new htmlParser();
for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
sourceStream, maxLinks, Long.MAX_VALUE);
final Document doc = docs[0];
assertEquals(maxLinks, doc.getAnchors().size());
assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
maxLinks < 3, doc.isPartiallyParsed());
}
}
}
/**
* Test the htmlParser.parseWithLimits() method, with various maxLinks values
* ranging from zero to the exact number of RSS feed links contained in the test
* content.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsOnRSSFeeds() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final String charset = StandardCharsets.UTF_8.name();
final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html>");
testHtml.append("<head>");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed1\" href=\"http://localhost/rss1.xml\" />");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed2\" href=\"http://localhost/rss2.xml\" />");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed3\" href=\"http://localhost/rss3.xml\" />");
testHtml.append("</head>");
testHtml.append("<body><p>HTML test content</p></body></html>");
final htmlParser parser = new htmlParser();
for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
sourceStream, maxLinks, Long.MAX_VALUE);
final Document doc = docs[0];
assertEquals(maxLinks, doc.getRSS().size());
assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
maxLinks < 3, doc.isPartiallyParsed());
}
}
}
/**
* Test of parseToScraper method, of class htmlParser.
*/
@ -115,7 +230,7 @@ public class htmlParserTest extends TestCase {
+ "<figure><img width=\"550px\" title=\"image as exemple\" alt=\"image as exemple\" src=\"./img/my_image.png\"></figrue>" // + img width 550 (+html5 figure)
+ "</body></html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
List<AnchorURL> anchorlist = scraper.getAnchors();
String linktxt = anchorlist.get(0).getTextProperty();
@ -157,7 +272,7 @@ public class htmlParserTest extends TestCase {
}
testHtml.append("</p></body></html>");
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), 10);
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), Integer.MAX_VALUE, Integer.MAX_VALUE);
assertEquals(nestingDepth, scraper.getAnchors().size());
assertEquals(1, scraper.getImages().size());
@ -178,7 +293,7 @@ public class htmlParserTest extends TestCase {
+ "<p>" + textSource + "</p>"
+ "</body></html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
String txt = scraper.getText();
System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
@ -207,7 +322,7 @@ public class htmlParserTest extends TestCase {
+ "</head>\n"
+ "<body>" + textSource + "</body>\n"
+ "</html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
String txt = scraper.getText();
System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]");

@ -6,5 +6,6 @@
<body>
In M&uuml;nchen steht ein Hofbr&auml;uhaus.
Dort gibt es Bier aus Ma&szlig;kr&uuml;gen.<br>
<a href="http://localhost/umlaute_html_namedentities.html">Example link in HTML with named entities</a>
</body>
</html>
