From 90a7c1affa9d8db341b1373f232934cec54d45b6 Mon Sep 17 00:00:00 2001 From: luccioman Date: Mon, 3 Jul 2017 10:00:53 +0200 Subject: [PATCH] HTML parser : removed unnecessary remaining recursive processing Recursive processing was removed in commit 67beef657f82e92f48dd8425073ad81896a2ff4b, but one remained for anchors content (likely omitted from refactoring). It is no longer necessary: other links such as images embedded in anchors are currently correctly detected by the parser. More annoyingly, that remaining recursive processing could lead to almost endless processing when encountering some (invalid) HTML structures involving nested anchors, as detected and reported by lucipher on YaCy forum ( http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005 ). --- .../document/parser/html/ContentScraper.java | 60 +++---------------- .../yacy/document/parser/htmlParserTest.java | 54 ++++++++++++++++- 2 files changed, 62 insertions(+), 52 deletions(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 2d655c050..e83190ae9 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -22,7 +22,6 @@ package net.yacy.document.parser.html; import java.awt.Dimension; import java.io.ByteArrayInputStream; -import java.io.CharArrayReader; import java.io.File; import java.io.IOException; import java.io.Writer; @@ -78,13 +77,21 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final char[] minuteCharsHTML = "'".toCharArray(); // statics: for initialization of the HTMLFilterAbstractScraper + /** Set of tag names processed as singletons (no end tag, or not processing the eventual end tag) */ private static final Set linkTags0 = new HashSet(12,0.99f); + + /** Set of tag names processed by pairs of start and end tag */ private static final Set linkTags1 = new HashSet(15,0.99f); private static final
Pattern LB = Pattern.compile("\n"); public enum TagType { - singleton, pair; + /** Tag with no end tag (see https://www.w3.org/TR/html51/syntax.html#void-elements), + * optional end tag (see https://www.w3.org/TR/html51/syntax.html#optional-tags), + * or where processing directly only the start tag is desired. */ + singleton, + /** Paired tag : has a start tag and an end tag (https://www.w3.org/TR/html51/syntax.html#normal-elements) */ + pair; } public enum TagName { @@ -764,7 +771,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like " test " tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute url.setAll(tag.opts); - recursiveParse(url, tag.content.getChars()); this.addAnchor(url); } this.evaluationScores.match(Element.apath, href); @@ -866,54 +872,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" ")); } - private String recursiveParse(final AnchorURL linkurl, final char[] inlineHtml) { - if (inlineHtml.length < 14) return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml))); - - // start a new scraper to parse links inside this text - // parsing the content - final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper, this.timezoneOffset); - final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); - try { - FileUtils.copy(new CharArrayReader(inlineHtml), writer); - } catch (final IOException e) { - ConcurrentLog.logException(e); - return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml))); - } finally { - try { - writer.close(); - } catch (final IOException e) { - } - } - for (final AnchorURL entry: scraper.getAnchors()) { - 
this.addAnchor(entry); - } - String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); - StringBuilder altakk = new StringBuilder(); - for (ImageEntry ie: scraper.images) { - if (linkurl != null) { - if (ie.alt() != null) altakk.append(ie.alt().trim()).append(' '); - linkurl.setImageURL(ie.url()); - AnchorURL a = new AnchorURL(linkurl); - a.setTextProperty(line); - a.setImageAlt(ie.alt()); - a.setImageURL(ie.url()); - ie.setLinkurl(a); - } - // this image may have been added recently from the same location (as this is a recursive parse) - // we want to keep only one of them, check if they are equal - if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) { - this.images.remove(this.images.size() - 1); - } - this.images.add(ie); - } - if (linkurl != null) { - linkurl.setImageAlt(altakk.toString().trim()); - } - - scraper.close(); - return line; - } - public List getTitles() { // some documents have a title tag as meta tag diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java index beb554a80..20ba4de77 100644 --- a/test/java/net/yacy/document/parser/htmlParserTest.java +++ b/test/java/net/yacy/document/parser/htmlParserTest.java @@ -1,5 +1,6 @@ package net.yacy.document.parser; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -12,8 +13,12 @@ import java.util.Locale; import junit.framework.TestCase; import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.HeaderFramework; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.Parser.Failure; import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; @@ -133,6 
+138,36 @@ public class htmlParserTest extends TestCase { ImageEntry img = scraper.getImages().get(1); assertEquals(550,img.width()); } + + /** + * Test parser resistance against nested anchors pattern + * ( tag embedding other tags : invalid HTML, but occasionally encountered in some real-world Internet resources. + * See case reported at http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005). + * The parser must be able to terminate in a finite time. + * @throws IOException when an unexpected error occurred + */ + @Test + public void testParseToScraperNestedAnchors() throws IOException { + final AnchorURL url = new AnchorURL("http://localhost/"); + final String charset = StandardCharsets.UTF_8.name(); + final StringBuilder testHtml = new StringBuilder("

"); + /* With prior recursive processing implementation and an average 2017 desktop computer, + * computing time started to be problematic over a nesting depth of 21 */ + final int nestingDepth = 30; + for (int count = 0; count < nestingDepth; count++) { + testHtml.append(""); + } + testHtml.append(""); + for (int count = 0; count < nestingDepth; count++) { + testHtml.append(""); + } + testHtml.append("

"); + + ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), 10); + assertEquals(nestingDepth, scraper.getAnchors().size()); + assertEquals(1, scraper.getImages().size()); + + } /** * Test of parseToScraper method, of class htmlParser @@ -162,7 +197,7 @@ public class htmlParserTest extends TestCase { * like "