Finer control on max links to parse in the html parser.

Branch: pull/93/merge
Author: luccioman, 8 years ago
Parent: 4743a104b5
Commit: 169ffdd1c7

@@ -28,13 +28,28 @@ public class SizeLimitedMap<K, V> extends LinkedHashMap<K, V> implements Map<K,
 
     private static final long serialVersionUID = 6088727126150060068L;
 
-    final int sizeLimit;
+    private final int sizeLimit;
+
+    /** Set to true when at least one eldest entry has been removed because the map size exceeded the size limit. */
+    private boolean limitExceeded;
 
     public SizeLimitedMap(int sizeLimit) {
         this.sizeLimit = sizeLimit;
+        this.limitExceeded = false;
     }
 
     @Override protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
-        return size() > this.sizeLimit;
+        boolean res = size() > this.sizeLimit;
+        if(res) {
+            this.limitExceeded = true;
+        }
+        return res;
+    }
+
+    /**
+     * @return true when the size limit has been exceeded at least one time
+     */
+    public boolean isLimitExceeded() {
+        return this.limitExceeded;
     }
 }

@@ -72,6 +72,13 @@ public class SizeLimitedSet<E> extends AbstractSet<E> implements Set<E>, Cloneab
         map.clear();
     }
 
+    /**
+     * @return true when the size limit has been exceeded at least one time
+     */
+    public boolean isLimitExceeded() {
+        return this.map.isLimitExceeded();
+    }
+
     @Override
     @SuppressWarnings("unchecked")
     public Object clone() {
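Illustration (not part of the commit): the flag added above hooks into LinkedHashMap.removeEldestEntry(), which the JDK calls after every put; returning true evicts the eldest entry. The following minimal, self-contained sketch reproduces that mechanism. BoundedMapSketch is a hypothetical stand-in for illustration only, not the actual net.yacy.cora.storage class.

import java.util.LinkedHashMap;
import java.util.Map;

/** Minimal sketch of the eviction-flag idea; a hypothetical class, not YaCy's SizeLimitedMap itself. */
public class BoundedMapSketch<K, V> extends LinkedHashMap<K, V> {

    private static final long serialVersionUID = 1L;

    private final int sizeLimit;

    /** Becomes true once an eldest entry has been evicted because the limit was exceeded. */
    private boolean limitExceeded = false;

    public BoundedMapSketch(final int sizeLimit) {
        this.sizeLimit = sizeLimit;
    }

    /** Called by LinkedHashMap after each put; returning true evicts the eldest entry. */
    @Override
    protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
        final boolean evict = size() > this.sizeLimit;
        if (evict) {
            this.limitExceeded = true;
        }
        return evict;
    }

    public boolean isLimitExceeded() {
        return this.limitExceeded;
    }

    public static void main(final String[] args) {
        final BoundedMapSketch<Integer, String> map = new BoundedMapSketch<>(2);
        map.put(1, "a");
        map.put(2, "b");
        System.out.println(map.isLimitExceeded()); // false: still within the limit
        map.put(3, "c");                           // eldest entry (key 1) is evicted
        System.out.println(map.isLimitExceeded()); // true
        System.out.println(map.size());            // 2
    }
}

The commit applies this pattern inside SizeLimitedMap and exposes it through SizeLimitedSet.isLimitExceeded(), which simply delegates to the backing map.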

@@ -187,12 +187,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     // class variables: collectors for links
     private final List<AnchorURL> anchors;
-    private final LinkedHashMap<DigestURL, String> rss, css;
-    private final LinkedHashMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
+    private final SizeLimitedMap<DigestURL, String> rss, css;
+    private final SizeLimitedMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
     private final List<ImageEntry> images;
-    private final Set<AnchorURL> script, frames, iframes;
-    private final Map<String, String> metas;
-    private final Map<String, DigestURL> hreflang, navigation;
+    private final SizeLimitedSet<AnchorURL> script, frames, iframes;
+    private final SizeLimitedMap<String, String> metas;
+    private final SizeLimitedMap<String, DigestURL> hreflang, navigation;
     private LinkedHashSet<String> titles;
     private final List<String> articles;
     private final List<Date> startDates, endDates;
@@ -204,7 +204,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private final EventListenerList htmlFilterEventListeners;
     private double lon, lat;
     private AnchorURL canonical, publisher;
-    private final int maxLinks;
+
+    /** The maximum number of URLs to process and store in the anchors property. */
+    private final int maxAnchors;
+
     private final VocabularyScraper vocabularyScraper;
     private final int timezoneOffset;
     private int breadcrumbs;
@@ -226,21 +229,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     /** Set to true when a limit on content size scraped has been exceeded */
     private boolean contentSizeLimitExceeded;
 
+    /** Set to true when the maxAnchors limit has been exceeded */
+    private boolean maxAnchorsExceeded;
+
     /**
-     * scrape a document
+     * Create an ContentScraper instance
      * @param root the document root url
-     * @param maxLinks the maximum number of links to scrape
+     * @param maxAnchors the maximum number of URLs to process and store in the anchors property.
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
      * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
      * @param timezoneOffset local time zone offset
      */
     @SuppressWarnings("unchecked")
-    public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
         // the root value here will not be used to load the resource.
         // it is only the reference for relative links
         super(linkTags0, linkTags1);
         assert root != null;
         this.root = root;
-        this.maxLinks = maxLinks;
         this.vocabularyScraper = vocabularyScraper;
         this.timezoneOffset = timezoneOffset;
         this.evaluationScores = new Evaluation();
@@ -277,6 +283,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         this.publisher = null;
         this.breadcrumbs = 0;
         this.contentSizeLimitExceeded = false;
+        this.maxAnchorsExceeded = false;
+        this.maxAnchors = maxAnchors;
+    }
+
+    /**
+     * Create an ContentScraper instance
+     * @param root the document root url
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
+     * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
+     * @param timezoneOffset local time zone offset
+     */
+    public ContentScraper(final DigestURL root, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+        this(root, Integer.MAX_VALUE, maxLinks, vocabularyScraper, timezoneOffset);
     }
 
     @Override
@@ -366,7 +385,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             }
         }
 
-        findAbsoluteURLs(b, this.anchors, anchorListeners);
+        if(!this.maxAnchorsExceeded) {
+            int maxLinksToDetect = this.maxAnchors - this.anchors.size();
+            if(maxLinksToDetect < Integer.MAX_VALUE) {
+                /* Add one to the anchors limit to detect when the limit is exceeded */
+                maxLinksToDetect++;
+            }
+            findAbsoluteURLs(b, this.anchors, anchorListeners, maxLinksToDetect);
+            if(this.anchors.size() > this.maxAnchors) {
+                this.maxAnchorsExceeded = true;
+                this.anchors.remove(this.anchors.size() -1);
+            }
+        }
 
         // append string to content
         if (!b.isEmpty()) {
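Illustration (not part of the commit): the block above requests one link more than the remaining maxAnchors budget, so that receiving limit + 1 URLs is the signal that more links existed than were kept; the surplus entry is then removed. The hypothetical helper below shows the same pattern in isolation; none of these names exist in YaCy.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class OverflowCheckSketch {

    /** Copies at most limit items into sink and reports whether the source held more than limit. */
    static <T> boolean collectWithOverflowCheck(final List<T> source, final List<T> sink, final int limit) {
        // ask for one extra item so that exceeding the limit becomes observable
        final int budget = limit < Integer.MAX_VALUE ? limit + 1 : limit;
        for (final T item : source) {
            if (sink.size() >= budget) {
                break;
            }
            sink.add(item);
        }
        final boolean exceeded = sink.size() > limit;
        if (exceeded) {
            sink.remove(sink.size() - 1); // drop the sentinel entry, keeping exactly limit items
        }
        return exceeded;
    }

    public static void main(final String[] args) {
        final List<String> urls = Arrays.asList("a", "b", "c", "d");
        final List<String> kept = new ArrayList<>();
        System.out.println(collectWithOverflowCheck(urls, kept, 2)); // true
        System.out.println(kept);                                    // [a, b]
    }
}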
@@ -890,9 +920,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
      * @param anchor anchor to add. Must not be null.
      */
     protected void addAnchor(AnchorURL anchor) {
+        if(this.anchors.size() >= this.maxAnchors) {
+            this.maxAnchorsExceeded = true;
+        } else {
             this.anchors.add(anchor);
             this.fireAddAnchor(anchor.toNormalform(false));
         }
+    }
@@ -1095,7 +1129,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
 
     /**
-     * @return true when a limit on content size scraped has been exceeded
+     * @return true when the limit on content size scraped has been exceeded
      */
     public boolean isContentSizeLimitExceeded() {
         return this.contentSizeLimitExceeded;
@@ -1108,6 +1142,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         this.contentSizeLimitExceeded = contentSizeLimitExceeded;
     }
 
+    /**
+     * @return true when the maxAnchors limit has been exceeded
+     */
+    public boolean isMaxAnchorsExceeded() {
+        return this.maxAnchorsExceeded;
+    }
+
+    /**
+     * @return true when at least one limit on content size, anchors number or links number has been exceeded
+     */
+    public boolean isLimitsExceeded() {
+        return this.contentSizeLimitExceeded || this.maxAnchorsExceeded || this.css.isLimitExceeded()
+                || this.rss.isLimitExceeded() || this.embeds.isLimitExceeded() || this.metas.isLimitExceeded()
+                || this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded()
+                || this.frames.isLimitExceeded() || this.iframes.isLimitExceeded();
+    }
+
     /*
     DC in html example:
       <meta name="DC.title" lang="en" content="Expressing Dublin Core in HTML/XHTML meta and link elements" />

@@ -28,6 +28,8 @@ import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
@@ -36,13 +38,15 @@ import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.LinkedHashMap;
 
+import org.apache.commons.io.IOUtils;
+
+import com.ibm.icu.text.CharsetDetector;
+
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.util.CommonPattern;
-import net.yacy.cora.util.StreamLimitException;
-import net.yacy.cora.util.StrictLimitInputStream;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -51,14 +55,11 @@ import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.html.ScraperInputStream;
 import net.yacy.document.parser.html.TransformerWriter;
-import net.yacy.kelondro.util.FileUtils;
-
-import com.ibm.icu.text.CharsetDetector;
 
 public class htmlParser extends AbstractParser implements Parser {
 
-    /** The default maximum number of links to add to a parsed document */
+    /** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */
     private static final int DEFAULT_MAX_LINKS = 10000;
 
     public htmlParser() {
@@ -103,7 +104,7 @@ public class htmlParser extends AbstractParser implements Parser {
             final int timezoneOffset,
             final InputStream sourceStream) throws Parser.Failure, InterruptedException {
-        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
+        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
     }
 
     @Override
@@ -115,10 +116,16 @@ public class htmlParser extends AbstractParser implements Parser {
     public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
             final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
             throws Failure {
+        return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
+    }
+
+    private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
+            final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
+            throws Failure {
         try {
             // first get a document from the parsed html
             Charset[] detectedcharsetcontainer = new Charset[]{null};
-            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, maxBytes);
+            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
             // parseToScraper also detects/corrects/sets charset from html content tag
             final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
             Document documentSnapshot = null;
@@ -127,10 +134,10 @@ public class htmlParser extends AbstractParser implements Parser {
             // and create a sub-document for snapshot page (which will be merged by loader)
             // TODO: as a crawl request removes anchor part from original url getRef() is never successful - considere other handling as removeRef() in crawler
             if (location.getRef() != null && location.getRef().startsWith("!")) {
-                documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
+                documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
             } else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
                 if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
-                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxLinks, maxBytes);
+                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
                 }
             }
         } catch (Exception ex1) { // ignore any exception for any issue with snapshot
@@ -190,12 +197,12 @@ public class htmlParser extends AbstractParser implements Parser {
                 scraper.getDate());
         ppd.setScraperObject(scraper);
         ppd.setIcons(scraper.getIcons());
-        ppd.setPartiallyParsed(scraper.isContentSizeLimitExceeded());
+        ppd.setPartiallyParsed(scraper.isLimitsExceeded());
         return ppd;
     }
 
-    public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxLinks) throws IOException {
+    public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
         Charset[] detectedcharsetcontainer = new Charset[]{null};
         InputStream sourceStream;
         try {
@@ -205,7 +212,7 @@ public class htmlParser extends AbstractParser implements Parser {
         }
         ContentScraper scraper; // for this static methode no need to init local this.scraperObject
         try {
-            scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks, Long.MAX_VALUE);
+            scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
         } catch (Failure e) {
             throw new IOException(e.getMessage());
         }
@@ -220,7 +227,8 @@ public class htmlParser extends AbstractParser implements Parser {
      * @param detectedcharsetcontainer a mutable array of Charsets : filled with the charset detected when parsing
      * @param timezoneOffset the local time zone offset
      * @param sourceStream an open stream on the resource to parse
-     * @param maxLinks the maximum number of links to store in the scraper
+     * @param maxAnchors the maximum number of URLs to process and store in the in the scraper's anchors property
+     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store in the scraper
      * @param maxBytes the maximum number of content bytes to process
      * @return a scraper containing parsed information
      * @throws Parser.Failure when an error occurred while parsing
@@ -233,13 +241,10 @@ public class htmlParser extends AbstractParser implements Parser {
             final Charset[] detectedcharsetcontainer,
             final int timezoneOffset,
             InputStream sourceStream,
+            final int maxAnchors,
             final int maxLinks,
             final long maxBytes) throws Parser.Failure, IOException {
 
-        if(maxBytes >= 0 && maxBytes < Long.MAX_VALUE) {
-            sourceStream = new StrictLimitInputStream(sourceStream, maxBytes);
-        }
-
         // make a scraper
         String charset = null;
@@ -286,22 +291,24 @@ public class htmlParser extends AbstractParser implements Parser {
         }
 
         // parsing the content
-        // for this static methode no need to init local this.scraperObject here
-        final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset);
+        // for this static method no need to init local this.scraperObject here
+        final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, vocabularyScraper, timezoneOffset);
         final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
         try {
-            FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
-        } catch(StreamLimitException e) {
-            /* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
-            scraper.setContentSizeLimitExceeded(true);
-        } catch (final IOException e) {
-            /* A StreamLimitException may be itself wrapped in an IOException by a InputStreamReader */
-            if(e.getCause() instanceof StreamLimitException) {
-                scraper.setContentSizeLimitExceeded(true);
-            } else {
-                throw new Parser.Failure("IO error:" + e.getMessage(), location);
-            }
+            final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
+            final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);
+            final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars);
+            if(copiedChars > maxChars) {
+                /* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */
+                scraper.setContentSizeLimitExceeded(true);
+            } else if(copiedChars == maxChars) {
+                /* Exactly maxChars limit reached : let's check if more to read remain. */
+                if(sourceReader.read() >= 0) {
+                    /* maxBytes limit has been reached : do not fail here as we want to use the partially obtained results. */
+                    scraper.setContentSizeLimitExceeded(true);
+                }
+            }
+        } catch (final IOException e) {
+            throw new Parser.Failure("IO error:" + e.getMessage(), location);
         } finally {
             writer.flush();
             //sourceStream.close(); keep open for multipe parsing (close done by caller)
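Illustration (not part of the commit): instead of wrapping the stream in a StrictLimitInputStream, the new code converts the byte budget into an approximate character budget via CharsetDecoder.averageCharsPerByte() and lets commons-io copy at most that many characters. A standalone sketch of the same idea follows, assuming commons-io is on the classpath; BoundedCopySketch is an illustrative name, not YaCy code.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;

public class BoundedCopySketch {

    public static void main(final String[] args) throws IOException {
        final Charset charset = StandardCharsets.UTF_8;
        final long maxBytes = 16;
        // translate the byte budget into an approximate character budget, as the parser now does
        final long maxChars = (long) (maxBytes * charset.newDecoder().averageCharsPerByte());

        final Reader source = new InputStreamReader(
                new ByteArrayInputStream("some html content longer than the budget".getBytes(charset)), charset);
        final StringWriter sink = new StringWriter();

        // copy at most maxChars characters; copyLarge stops once that length is reached
        final long copied = IOUtils.copyLarge(source, sink, 0, maxChars);

        // if exactly maxChars were copied and more input remains, the content was truncated
        final boolean truncated = copied == maxChars && source.read() >= 0;
        System.out.println("copied=" + copied + " truncated=" + truncated);
        System.out.println(sink.toString());
    }
}

Since averageCharsPerByte() is only an estimate, the character budget approximates maxBytes rather than enforcing it exactly, which is acceptable here because the limit acts as a safety cap rather than a strict contract.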
@@ -407,12 +414,13 @@ public class htmlParser extends AbstractParser implements Parser {
      * @param documentCharset
      * @param vocscraper
      * @param timezoneOffset
+     * @param maxAnchors the maximum number of URLs to process and store in the in the scraper's anchors property
      * @param maxLinks the maximum number of links to store in the document
      * @param maxBytes the maximum number of content bytes to process
      * @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
      */
     private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
-            final VocabularyScraper vocscraper, final int timezoneOffset, final int maxLinks, final long maxBytes) {
+            final VocabularyScraper vocscraper, final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
         Document documentSnapshot = null;
         try {
             // construct url for case (1) with anchor
@@ -431,7 +439,7 @@ public class htmlParser extends AbstractParser implements Parser {
             InputStream snapshotStream = null;
             try {
                 snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
-                ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks, maxBytes);
+                ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
                 documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
             } finally {
                 if(snapshotStream != null) {

@@ -1,14 +1,20 @@
 package net.yacy.document.parser;
 
+import static net.yacy.document.parser.htmlParser.parseToScraper;
+
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Locale;
 
+import org.junit.Test;
+
 import junit.framework.TestCase;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.document.Document;
@@ -16,8 +22,6 @@ import net.yacy.document.Parser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
-import static net.yacy.document.parser.htmlParser.parseToScraper;
-import org.junit.Test;
 
 public class htmlParserTest extends TestCase {
@@ -99,6 +103,117 @@ public class htmlParserTest extends TestCase {
        }
    }
/**
* Test the htmlParser.parseWithLimits() method with test content within bounds.
* @throws Exception when an unexpected error occurred
*/
@Test
public void testParseWithLimitsUnreached() throws Exception {
System.out.println("htmlParser.parse");
String[] testFiles = {
"umlaute_html_iso.html",
"umlaute_html_utf8.html",
"umlaute_html_namedentities.html"};
final String mimetype = "text/html";
//final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
htmlParser parser = new htmlParser();
for (final String testfile : testFiles) {
final String fileName = "test" + File.separator + "parsertest" + File.separator + testfile;
final File file = new File(fileName);
final AnchorURL url = new AnchorURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(file);) {
final Document[] docs = parser.parseWithLimits(url, mimetype, null, new VocabularyScraper(), 0, inStream, 1000, 10000);
final Document doc = docs[0];
assertNotNull("Parser result must not be null for file " + fileName, docs);
final String parsedText = doc.getTextString();
assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
parsedText.contains("Maßkrügen"));
assertEquals("Test anchor must have been parsed for file " + fileName, 1, doc.getAnchors().size());
assertFalse("Parsed document should not be marked as partially parsed for file " + fileName, doc.isPartiallyParsed());
}
}
}
/**
* Test the htmlParser.parseWithLimits() method, with various maxLinks values
* ranging from zero to the exact anchors number contained in the test content.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsOnAnchors() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final String charset = StandardCharsets.UTF_8.name();
final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html><body><p>");
testHtml.append("<a href=\"http://localhost/doc1.html\">First link</a>");
testHtml.append("<a href=\"http://localhost/doc2.html\">Second link</a>");
testHtml.append("<a href=\"http://localhost/doc3.html\">Third link</a>");
testHtml.append("</p></body></html>");
final htmlParser parser = new htmlParser();
for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
sourceStream, maxLinks, Long.MAX_VALUE);
final Document doc = docs[0];
assertEquals(maxLinks, doc.getAnchors().size());
assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
maxLinks < 3, doc.isPartiallyParsed());
}
}
}
/**
* Test the htmlParser.parseWithLimits() method, with various maxLinks values
* ranging from zero the exact RSS feed links number contained in the test
* content.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsOnRSSFeeds() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final String charset = StandardCharsets.UTF_8.name();
final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><html>");
testHtml.append("<head>");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed1\" href=\"http://localhost/rss1.xml\" />");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed2\" href=\"http://localhost/rss2.xml\" />");
testHtml.append(
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Feed3\" href=\"http://localhost/rss3.xml\" />");
testHtml.append("</head>");
testHtml.append("<body><p>HTML test content</p></body></html>");
final htmlParser parser = new htmlParser();
for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
sourceStream, maxLinks, Long.MAX_VALUE);
final Document doc = docs[0];
assertEquals(maxLinks, doc.getRSS().size());
assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
maxLinks < 3, doc.isPartiallyParsed());
}
}
}
    /**
     * Test of parseToScraper method, of class htmlParser.
     */
@@ -115,7 +230,7 @@ public class htmlParserTest extends TestCase {
                 + "<figure><img width=\"550px\" title=\"image as exemple\" alt=\"image as exemple\" src=\"./img/my_image.png\"></figrue>" // + img width 550 (+html5 figure)
                 + "</body></html>";
-        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
 
         List<AnchorURL> anchorlist = scraper.getAnchors();
         String linktxt = anchorlist.get(0).getTextProperty();
@@ -157,7 +272,7 @@ public class htmlParserTest extends TestCase {
         }
         testHtml.append("</p></body></html>");
 
-        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), 10);
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), Integer.MAX_VALUE, Integer.MAX_VALUE);
         assertEquals(nestingDepth, scraper.getAnchors().size());
         assertEquals(1, scraper.getImages().size());
@@ -178,7 +293,7 @@ public class htmlParserTest extends TestCase {
                 + "<p>" + textSource + "</p>"
                 + "</body></html>";
-        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
         String txt = scraper.getText();
         System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
@@ -207,7 +322,7 @@ public class htmlParserTest extends TestCase {
                 + "</head>\n"
                 + "<body>" + textSource + "</body>\n"
                 + "</html>";
-        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
         String txt = scraper.getText();
         System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]");

@@ -6,5 +6,6 @@
 <body>
 In M&uuml;nchen steht ein Hofbr&auml;uhaus.
 Dort gibt es Bier aus Ma&szlig;kr&uuml;gen.<br>
+<a href="http://localhost/umlaute_html_namedentities.html">Example link in HTML with named entities</a>
 </body>
 </html>
