From 23775e76e2901f2e303e0d4ffea2fb5b216bf97a Mon Sep 17 00:00:00 2001 From: luccioman Date: Wed, 12 Apr 2017 17:17:03 +0200 Subject: [PATCH] Fixed endless loop case in wikicode processing. Detected when importing recent MediaWiki dumps containing some pages with script content in plain text format (see Scribunto extension https://www.mediawiki.org/wiki/Extension:Scribunto ). Further improvement: modify the MediawikiImporter to prevent processing revisions whose content is not wikitext. --- source/net/yacy/data/wiki/WikiCode.java | 13 +++-- .../java/net/yacy/data/wiki/WikiCodeTest.java | 47 +++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/source/net/yacy/data/wiki/WikiCode.java b/source/net/yacy/data/wiki/WikiCode.java index 582f61c77..ab4323866 100644 --- a/source/net/yacy/data/wiki/WikiCode.java +++ b/source/net/yacy/data/wiki/WikiCode.java @@ -589,8 +589,9 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { int p; int positionOfOpeningTag; int positionOfClosingTag; + int fromIndex = 0; // internal links and images - while ((positionOfOpeningTag = line.indexOf(WIKI_OPEN_LINK)) >= 0) { + while ((positionOfOpeningTag = line.indexOf(WIKI_OPEN_LINK, fromIndex)) >= 0) { positionOfClosingTag = line.indexOf(WIKI_CLOSE_LINK, positionOfOpeningTag + LEN_WIKI_OPEN_LINK); if (positionOfClosingTag <= positionOfOpeningTag) { break; @@ -640,16 +641,19 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { } line = line.substring(0, positionOfOpeningTag) + "" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_LINK); + fromIndex = positionOfClosingTag + LEN_WIKI_CLOSE_LINK; } // this is the part of the code that is responsible for Youtube video links supporting only the video ID as parameter else if (kl.startsWith(WIKI_VIDEO_YOUTUBE)) { kl = kl.substring(LEN_WIKI_VIDEO_YOUTUBE); line = line.substring(0, positionOfOpeningTag) + "" + ""; + break; } // this is the part of the code that is responsible for 
Vimeo video links supporting only the video ID as parameter else if (kl.startsWith(WIKI_VIDEO_VIMEO)) { kl = kl.substring(LEN_WIKI_VIDEO_VIMEO); line = line.substring(0, positionOfOpeningTag) + "" + ""; + break; } // if it's no image, it might be an internal link else { @@ -660,11 +664,13 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { kv = kl; } line = line.substring(0, positionOfOpeningTag) + "" + kv + "" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_LINK); // oob exception in append() ! + fromIndex = positionOfClosingTag + LEN_WIKI_CLOSE_LINK; } } - + + fromIndex = 0; // external links - while ((positionOfOpeningTag = line.indexOf(WIKI_OPEN_EXTERNAL_LINK)) >= 0) { + while ((positionOfOpeningTag = line.indexOf(WIKI_OPEN_EXTERNAL_LINK, fromIndex)) >= 0) { positionOfClosingTag = line.indexOf(WIKI_CLOSE_EXTERNAL_LINK, positionOfOpeningTag + LEN_WIKI_OPEN_EXTERNAL_LINK); if (positionOfClosingTag <= positionOfOpeningTag) { break; @@ -686,6 +692,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { kl = "http://" + hostport + "/" + kl; } line = line.substring(0, positionOfOpeningTag) + "" + kv + "" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_EXTERNAL_LINK); + fromIndex = positionOfClosingTag + LEN_WIKI_CLOSE_EXTERNAL_LINK; } return line; } diff --git a/test/java/net/yacy/data/wiki/WikiCodeTest.java b/test/java/net/yacy/data/wiki/WikiCodeTest.java index ef25bf7ee..e7d3c49dd 100644 --- a/test/java/net/yacy/data/wiki/WikiCodeTest.java +++ b/test/java/net/yacy/data/wiki/WikiCodeTest.java @@ -54,4 +54,51 @@ public class WikiCodeTest { assertFalse("no header tag expected:"+erg, erg.contains("

")); } } + + /** + * Test internal link markup processing + */ + @Test + public void testInternalLink() { + WikiCode wc = new WikiCode(); + + /* Link to another wiki article */ + String result = wc.transform("http://wiki:8080", "[[article]]"); + assertTrue(result.contains("renamed article<")); + + /* Multiple links on the same line */ + result = wc.transform("http://wiki:8080", "[[article1]] [[article2]]"); + assertTrue(result.contains("YaCy<")); + + /* Lua Script array parameter : should not crash the transform process */ + result = wc.transform("http://wiki:8080", "'[[[[2,1],[4,3],[6,5],[2,1]],[[12,11],[14,13],[16,15],[12,11]]]]'"); + } }