HTML parser : removed unnecessary remaining recursive processing

Recursive processing was removed in commit
67beef657f, but one remained for anchors
content (likely omitted from the refactoring). It is no longer necessary:
other links such as images embedded in anchors are currently correctly
detected by the parser.

More annoyingly, that remaining recursive processing could lead to almost
endless processing when encountering some (invalid) HTML structures
involving nested anchors, as detected and reported by lucipher on YaCy
forum ( http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005 ).
pull/127/head
luccioman 8 years ago
parent e6e20dab52
commit 90a7c1affa

@ -22,7 +22,6 @@ package net.yacy.document.parser.html;
import java.awt.Dimension;
import java.io.ByteArrayInputStream;
import java.io.CharArrayReader;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
@ -78,13 +77,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final char[] minuteCharsHTML = "'".toCharArray();
// statics: for initialization of the HTMLFilterAbstractScraper
/** Set of tag names processed as singletons (no end tag, or not processing the eventual end tag) */
private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f);
/** Set of tag names processed by pairs of start and end tag */
private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f);
private static final Pattern LB = Pattern.compile("\n");
public enum TagType {
singleton, pair;
/** Tag with no end tag (see https://www.w3.org/TR/html51/syntax.html#void-elements),
* optional end tag (see https://www.w3.org/TR/html51/syntax.html#optional-tags),
* or where processing directly only the start tag is desired. */
singleton,
/** Paired tag : has a start tag and an end tag (https://www.w3.org/TR/html51/syntax.html#normal-elements) */
pair;
}
public enum TagName {
@ -764,7 +771,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.addAnchor(url);
}
this.evaluationScores.match(Element.apath, href);
@ -866,54 +872,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" "));
}
/**
 * Parses the given inline HTML fragment (the content found inside an anchor tag)
 * with a fresh, nested {@link ContentScraper}, promotes every anchor and image it
 * finds to this (outer) scraper, and returns the fragment's plain text with all
 * tags stripped and HTML entities decoded.
 * NOTE(review): because this spawns a full nested scraper per anchor, (invalid)
 * HTML with nested anchor tags triggers repeated re-parsing of the same content
 * and processing time can explode — see
 * http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005.
 * @param linkurl the anchor URL whose inline content is parsed; may be null
 * @param inlineHtml the raw characters between the anchor's start and end tags
 * @return the fragment rendered as a cleaned plain-text line
 */
private String recursiveParse(final AnchorURL linkurl, final char[] inlineHtml) {
/* Fragments under 14 chars are presumably too short to hold any link tag — TODO confirm threshold; just strip and return */
if (inlineHtml.length < 14) return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml)));
// start a new scraper to parse links inside this text
// parsing the content
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper, this.timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
} catch (final IOException e) {
ConcurrentLog.logException(e);
// on copy failure, fall back to a plain tag-stripped version of the fragment
return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml)));
} finally {
try {
writer.close();
} catch (final IOException e) {
// close failure deliberately ignored : nothing more can be done at this point
}
}
// promote every anchor found inside the fragment to this (outer) scraper
for (final AnchorURL entry: scraper.getAnchors()) {
this.addAnchor(entry);
}
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
// accumulate the alt texts of all embedded images, and link each image back to the enclosing anchor
StringBuilder altakk = new StringBuilder();
for (ImageEntry ie: scraper.images) {
if (linkurl != null) {
if (ie.alt() != null) altakk.append(ie.alt().trim()).append(' ');
linkurl.setImageURL(ie.url());
AnchorURL a = new AnchorURL(linkurl);
a.setTextProperty(line);
a.setImageAlt(ie.alt());
a.setImageURL(ie.url());
ie.setLinkurl(a);
}
// this image may have been added recently from the same location (as this is a recursive parse)
// we want to keep only one of them, check if they are equal
if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) {
this.images.remove(this.images.size() - 1);
}
this.images.add(ie);
}
if (linkurl != null) {
linkurl.setImageAlt(altakk.toString().trim());
}
scraper.close();
return line;
}
public List<String> getTitles() {
// some documents have a title tag as meta tag

@ -1,5 +1,6 @@
package net.yacy.document.parser;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@ -12,8 +13,12 @@ import java.util.Locale;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.Parser.Failure;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
@ -133,6 +138,36 @@ public class htmlParserTest extends TestCase {
ImageEntry img = scraper.getImages().get(1);
assertEquals(550,img.width());
}
/**
 * Checks parser resistance against the nested anchors pattern: an {@code <a>} tag
 * embedding other {@code <a>} tags. This is invalid HTML, but occasionally found in
 * real-world Internet resources (case reported at
 * http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005).
 * The parser must be able to terminate in a finite time.
 * @throws IOException when an unexpected error occurred
 */
@Test
public void testParseToScraperNestedAnchors() throws IOException {
final AnchorURL url = new AnchorURL("http://localhost/");
final String charset = StandardCharsets.UTF_8.name();
/* With prior recursive processing implementation and an average 2017 desktop computer,
 * computing time started to be problematic over a nesting depth of 21 */
final int depth = 30;
final StringBuilder html = new StringBuilder("<!DOCTYPE html><html><body><p>");
/* Open `depth` anchors, each pointing at a distinct document */
int i = 0;
while (i < depth) {
html.append("<a href=\"http://localhost/doc").append(i).append(".html\">");
i++;
}
/* One image at the innermost nesting level */
html.append("<img src=\"./img/my_image.png\">");
/* Close every anchor again */
for (i = 0; i < depth; i++) {
html.append("</a>");
}
html.append("</p></body></html>");
final ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, html.toString(), 10);
assertEquals(depth, scraper.getAnchors().size());
assertEquals(1, scraper.getImages().size());
}
/**
* Test of parseToScraper method, of class htmlParser
@ -162,7 +197,7 @@ public class htmlParserTest extends TestCase {
* like "<a " see https://github.com/yacy/yacy_search_server/issues/109
*/
@Test
public void testParteToScraper_ScriptTag() throws MalformedURLException, IOException {
public void testParseToScraper_ScriptTag() throws MalformedURLException, IOException {
final AnchorURL url = new AnchorURL("http://localhost/");
final String charset = StandardCharsets.UTF_8.name();
final String textSource = "test text";
@ -184,4 +219,21 @@ public class htmlParserTest extends TestCase {
System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]");
assertEquals(txt, textSource);
}
/**
 * Manual test entry point : parses a local copy of a problematic HTML page and
 * prints the extracted text and anchor count.
 * NOTE(review): the input path is hard-coded to a developer-local file — adjust it
 * before running on another machine.
 */
public static void main(String args[]) throws FileNotFoundException, IOException, Failure, InterruptedException {
try (BufferedInputStream inStream = new BufferedInputStream(
new FileInputStream(new File("/home/luc/dev/documents/endless_loop_htmlparser/test.html")));) {
final Document[] parsedDocs = new htmlParser().parse(
new DigestURL("http://www.prawo.vulcan.edu.pl/przegdok.asp?qdatprz=12-09-2016&qplikid=2"),
"text/html", HeaderFramework.getCharacterEncoding("text/html"), new VocabularyScraper(), 0,
inStream);
if (parsedDocs != null && parsedDocs.length > 0) {
System.out.println("text : " + parsedDocs[0].getTextString());
System.out.println("anchors.size : " + parsedDocs[0].getAnchors().size());
} else {
System.out.println("No result");
}
} finally {
ConcurrentLog.shutdown();
}
}
}

Loading…
Cancel
Save