HTML parser : removed unnecessary remaining recursive processing

Recursive processing was removed in commit
67beef657f, but one remained for anchors
content (likely omitted from the refactoring). It is no longer necessary:
other links such as images embedded in anchors are currently correctly
detected by the parser.

More annoyingly, that remaining recursive processing could lead to almost
endless processing when encountering some (invalid) HTML structures
involving nested anchors, as detected and reported by lucipher on YaCy
forum ( http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005 ).
pull/127/head
luccioman 8 years ago
parent e6e20dab52
commit 90a7c1affa

@ -22,7 +22,6 @@ package net.yacy.document.parser.html;
import java.awt.Dimension;
import java.io.ByteArrayInputStream;
import java.io.CharArrayReader;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
@ -78,13 +77,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final char[] minuteCharsHTML = "'".toCharArray();
// statics: for initialization of the HTMLFilterAbstractScraper
/** Set of tag names processed as singletons (no end tag, or not processing the eventual end tag) */
private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f);
/** Set of tag names processed by pairs of start and end tag */
private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f);
private static final Pattern LB = Pattern.compile("\n");
public enum TagType {
singleton, pair;
/** Tag with no end tag (see https://www.w3.org/TR/html51/syntax.html#void-elements),
* optional end tag (see https://www.w3.org/TR/html51/syntax.html#optional-tags),
* or where processing directly only the start tag is desired. */
singleton,
/** Paired tag : has a start tag and an end tag (https://www.w3.org/TR/html51/syntax.html#normal-elements) */
pair;
}
public enum TagName {
@ -764,7 +771,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.addAnchor(url);
}
this.evaluationScores.match(Element.apath, href);
@ -866,54 +872,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" "));
}
/**
 * Parses the given inline HTML fragment (the content found inside an anchor tag)
 * with a fresh, nested {@link ContentScraper}, promotes every anchor and image it
 * finds to this (outer) scraper, and returns the fragment's plain text with all
 * tags stripped and HTML entities decoded.
 * NOTE(review): because this spawns a full nested scraper per anchor, (invalid)
 * HTML with nested anchor tags triggers repeated re-parsing of the same content
 * and processing time can explode — see
 * http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005.
 * @param linkurl the anchor URL whose inline content is parsed; may be null
 * @param inlineHtml the raw characters between the anchor's start and end tags
 * @return the fragment rendered as a cleaned plain-text line
 */
private String recursiveParse(final AnchorURL linkurl, final char[] inlineHtml) {
/* Fragments under 14 chars are presumably too short to hold any link tag — TODO confirm threshold; just strip and return */
if (inlineHtml.length < 14) return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml)));
// start a new scraper to parse links inside this text
// parsing the content
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper, this.timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
} catch (final IOException e) {
ConcurrentLog.logException(e);
// on copy failure, fall back to a plain tag-stripped version of the fragment
return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml)));
} finally {
try {
writer.close();
} catch (final IOException e) {
// close failure deliberately ignored : nothing more can be done at this point
}
}
// promote every anchor found inside the fragment to this (outer) scraper
for (final AnchorURL entry: scraper.getAnchors()) {
this.addAnchor(entry);
}
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
// accumulate the alt texts of all embedded images, and link each image back to the enclosing anchor
StringBuilder altakk = new StringBuilder();
for (ImageEntry ie: scraper.images) {
if (linkurl != null) {
if (ie.alt() != null) altakk.append(ie.alt().trim()).append(' ');
linkurl.setImageURL(ie.url());
AnchorURL a = new AnchorURL(linkurl);
a.setTextProperty(line);
a.setImageAlt(ie.alt());
a.setImageURL(ie.url());
ie.setLinkurl(a);
}
// this image may have been added recently from the same location (as this is a recursive parse)
// we want to keep only one of them, check if they are equal
if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) {
this.images.remove(this.images.size() - 1);
}
this.images.add(ie);
}
if (linkurl != null) {
linkurl.setImageAlt(altakk.toString().trim());
}
scraper.close();
return line;
}
public List<String> getTitles() {
// some documents have a title tag as meta tag

@ -1,5 +1,6 @@
package net.yacy.document.parser;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@ -12,8 +13,12 @@ import java.util.Locale;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.Parser.Failure;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
@ -133,6 +138,36 @@ public class htmlParserTest extends TestCase {
ImageEntry img = scraper.getImages().get(1);
assertEquals(550,img.width());
}
/**
 * Checks parser resistance against the nested anchors pattern: an {@code <a>} tag
 * embedding other {@code <a>} tags. This is invalid HTML, but occasionally found in
 * real-world Internet resources (case reported at
 * http://forum.yacy-websuche.de/viewtopic.php?f=23&t=6005).
 * The parser must be able to terminate in a finite time.
 * @throws IOException when an unexpected error occurred
 */
@Test
public void testParseToScraperNestedAnchors() throws IOException {
final AnchorURL url = new AnchorURL("http://localhost/");
final String charset = StandardCharsets.UTF_8.name();
/* With prior recursive processing implementation and an average 2017 desktop computer,
 * computing time started to be problematic over a nesting depth of 21 */
final int depth = 30;
final StringBuilder html = new StringBuilder("<!DOCTYPE html><html><body><p>");
/* Open `depth` anchors, each pointing at a distinct document */
int i = 0;
while (i < depth) {
html.append("<a href=\"http://localhost/doc").append(i).append(".html\">");
i++;
}
/* One image at the innermost nesting level */
html.append("<img src=\"./img/my_image.png\">");
/* Close every anchor again */
for (i = 0; i < depth; i++) {
html.append("</a>");
}
html.append("</p></body></html>");
final ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, html.toString(), 10);
assertEquals(depth, scraper.getAnchors().size());
assertEquals(1, scraper.getImages().size());
}
/**
* Test of parseToScraper method, of class htmlParser
@ -162,7 +197,7 @@ public class htmlParserTest extends TestCase {
* like "<a " see https://github.com/yacy/yacy_search_server/issues/109
*/
@Test
public void testParteToScraper_ScriptTag() throws MalformedURLException, IOException {
public void testParseToScraper_ScriptTag() throws MalformedURLException, IOException {
final AnchorURL url = new AnchorURL("http://localhost/");
final String charset = StandardCharsets.UTF_8.name();
final String textSource = "test text";
@ -184,4 +219,21 @@ public class htmlParserTest extends TestCase {
System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]");
assertEquals(txt, textSource);
}
/**
 * Manual test entry point : parses a local copy of a problematic HTML page and
 * prints the extracted text and anchor count.
 * NOTE(review): the input path is hard-coded to a developer-local file — adjust it
 * before running on another machine.
 */
public static void main(String args[]) throws FileNotFoundException, IOException, Failure, InterruptedException {
try (BufferedInputStream inStream = new BufferedInputStream(
new FileInputStream(new File("/home/luc/dev/documents/endless_loop_htmlparser/test.html")));) {
final Document[] parsedDocs = new htmlParser().parse(
new DigestURL("http://www.prawo.vulcan.edu.pl/przegdok.asp?qdatprz=12-09-2016&qplikid=2"),
"text/html", HeaderFramework.getCharacterEncoding("text/html"), new VocabularyScraper(), 0,
inStream);
if (parsedDocs != null && parsedDocs.length > 0) {
System.out.println("text : " + parsedDocs[0].getTextString());
System.out.println("anchors.size : " + parsedDocs[0].getAnchors().size());
} else {
System.out.println("No result");
}
} finally {
ConcurrentLog.shutdown();
}
}
}

Loading…
Cancel
Save