From 90a7c1affa9d8db341b1373f232934cec54d45b6 Mon Sep 17 00:00:00 2001 From: luccioman Date: Mon, 3 Jul 2017 10:00:53 +0200 Subject: [PATCH] HTML parser : removed unnecessary remaining recursive processing Recursive processing was removed in commit 67beef657f82e92f48dd8425073ad81896a2ff4b, but one remained for anchors content (likely omitted from refactoring). It is no longer necessary: other links such as images embedded in anchors are currently correctly detected by the parser. More annoyingly, that remaining recursive processing could lead to almost endless processing when encountering some (invalid) HTML structures involving nested anchors, as detected and reported by lucipher on YaCy forum ( http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005 ). --- .../document/parser/html/ContentScraper.java | 60 +++---------------- .../yacy/document/parser/htmlParserTest.java | 54 ++++++++++++++++- 2 files changed, 62 insertions(+), 52 deletions(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 2d655c050..e83190ae9 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -22,7 +22,6 @@ package net.yacy.document.parser.html; import java.awt.Dimension; import java.io.ByteArrayInputStream; -import java.io.CharArrayReader; import java.io.File; import java.io.IOException; import java.io.Writer; @@ -78,13 +77,21 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final char[] minuteCharsHTML = "'".toCharArray(); // statics: for initialization of the HTMLFilterAbstractScraper + /** Set of tag names processed as singletons (no end tag, or not processing the eventual end tag) */ private static final Set linkTags0 = new HashSet(12,0.99f); + + /** Set of tag names processed by pairs of start and end tag */ private static final Set linkTags1 = new HashSet(15,0.99f); private static final
Pattern LB = Pattern.compile("\n"); public enum TagType { - singleton, pair; + /** Tag with no end tag (see https://www.w3.org/TR/html51/syntax.html#void-elements), + * optional end tag (see https://www.w3.org/TR/html51/syntax.html#optional-tags), + * or where processing directly only the start tag is desired. */ + singleton, + /** Paired tag : has a start tag and an end tag (https://www.w3.org/TR/html51/syntax.html#normal-elements) */ + pair; } public enum TagName { @@ -764,7 +771,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like " test " tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute url.setAll(tag.opts); - recursiveParse(url, tag.content.getChars()); this.addAnchor(url); } this.evaluationScores.match(Element.apath, href); @@ -866,54 +872,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" ")); } - private String recursiveParse(final AnchorURL linkurl, final char[] inlineHtml) { - if (inlineHtml.length < 14) return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml))); - - // start a new scraper to parse links inside this text - // parsing the content - final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper, this.timezoneOffset); - final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); - try { - FileUtils.copy(new CharArrayReader(inlineHtml), writer); - } catch (final IOException e) { - ConcurrentLog.logException(e); - return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml))); - } finally { - try { - writer.close(); - } catch (final IOException e) { - } - } - for (final AnchorURL entry: scraper.getAnchors()) { - 
this.addAnchor(entry); - } - String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); - StringBuilder altakk = new StringBuilder(); - for (ImageEntry ie: scraper.images) { - if (linkurl != null) { - if (ie.alt() != null) altakk.append(ie.alt().trim()).append(' '); - linkurl.setImageURL(ie.url()); - AnchorURL a = new AnchorURL(linkurl); - a.setTextProperty(line); - a.setImageAlt(ie.alt()); - a.setImageURL(ie.url()); - ie.setLinkurl(a); - } - // this image may have been added recently from the same location (as this is a recursive parse) - // we want to keep only one of them, check if they are equal - if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) { - this.images.remove(this.images.size() - 1); - } - this.images.add(ie); - } - if (linkurl != null) { - linkurl.setImageAlt(altakk.toString().trim()); - } - - scraper.close(); - return line; - } - public List getTitles() { // some documents have a title tag as meta tag diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java index beb554a80..20ba4de77 100644 --- a/test/java/net/yacy/document/parser/htmlParserTest.java +++ b/test/java/net/yacy/document/parser/htmlParserTest.java @@ -1,5 +1,6 @@ package net.yacy.document.parser; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -12,8 +13,12 @@ import java.util.Locale; import junit.framework.TestCase; import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.HeaderFramework; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.Parser.Failure; import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; @@ -133,6 
+138,36 @@ public class htmlParserTest extends TestCase { ImageEntry img = scraper.getImages().get(1); assertEquals(550,img.width()); } + + /** + * Test parser resistance against nested anchors pattern + * ( tag embedding other tags : invalid HTML, but occasionally encountered in some real-world Internet resources. + * See case reported at http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005). + * The parser must be able to terminate in a finite time. + * @throws IOException when an unexpected error occurred + */ + @Test + public void testParseToScraperNestedAnchors() throws IOException { + final AnchorURL url = new AnchorURL("http://localhost/"); + final String charset = StandardCharsets.UTF_8.name(); + final StringBuilder testHtml = new StringBuilder("

"); + /* With prior recursive processing implementation and an average 2017 desktop computer, + * computing time started to be problematic over a nesting depth of 21 */ + final int nestingDepth = 30; + for (int count = 0; count < nestingDepth; count++) { + testHtml.append(""); + } + testHtml.append(""); + for (int count = 0; count < nestingDepth; count++) { + testHtml.append(""); + } + testHtml.append("

"); + + ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), 10); + assertEquals(nestingDepth, scraper.getAnchors().size()); + assertEquals(1, scraper.getImages().size()); + + } /** * Test of parseToScraper method, of class htmlParser @@ -162,7 +197,7 @@ public class htmlParserTest extends TestCase { * like "