- added support for [[Bookmark:$bookmarkTag|description]]-link-listings (requested by theli) to wiki-parser

- added support for <pre>-tags to wiki-parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3393 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent 619653c054
commit 39a2000d8b

@ -0,0 +1,20 @@
package de.anomic.data.wiki;
/**
 * Unchecked exception thrown by the wiki parser and its tokens when a piece
 * of wiki text cannot be converted to markup.
 */
public class WikiParserException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception with a detail message and the underlying cause.
     *
     * @param message description of what went wrong
     * @param cause the exception that triggered this one
     */
    public WikiParserException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception that wraps an underlying cause.
     *
     * @param cause the exception that triggered this one
     */
    public WikiParserException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with a detail message.
     *
     * @param message description of what went wrong
     */
    public WikiParserException(String message) {
        super(message);
    }

    /** Creates an exception without detail message or cause. */
    public WikiParserException() {
        super();
    }
}

@ -53,12 +53,12 @@ public abstract class AbstractToken implements Token {
protected String markup = null;
protected boolean parsed = false;
protected abstract boolean parse();
protected abstract void parse();
public String getMarkup() {
if (this.text == null)
throw new IllegalArgumentException();
if (!this.parsed && !parse()) return this.text;
if (!this.parsed) parse();
return this.markup;
}

@ -47,14 +47,23 @@
package de.anomic.data.wiki.tokens;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.data.bookmarksDB;
import de.anomic.data.bookmarksDB.Bookmark;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.data.wiki.WikiParserException;
import de.anomic.plasma.plasmaSwitchboard;
public class LinkToken extends AbstractToken {
private static final int IMG = 0;
private static final int INT = 1;
private static final int EXT = 2;
private static final int BKM = 1;
private static final int INT = 2;
private static final int EXT = 3;
private static final Pattern imgPattern = Pattern.compile(
"\\[\\[" + // begin
@ -64,6 +73,12 @@ public class LinkToken extends AbstractToken {
"(\\|(([^\\]]|\\][^\\]])*))" + // description
")?" + // </optional>
"\\]\\]"); // end
/**
 * Matches a bookmark-listing link of the form [[Bookmark:tag]] or
 * [[Bookmark:tag|description]].
 * Group 1 is "Bookmark:" followed by the tag name, group 2 is the complete
 * tag name, group 3 the optional "|description" part and group 4 the
 * description text.
 */
private static final Pattern bkmPattern = Pattern.compile(
        "\\[\\[" +                                  // begin
        // The tag characters are matched by a non-capturing alternation that is
        // wrapped as a whole into group 2. A repeated capturing group would only
        // retain its last iteration, i.e. the final character of the tag name.
        "(Bookmark:((?:[^\\]|]|\\][^\\]])*))" +     // "Bookmark:" + tag name
        "(\\|(([^\\]]|\\][^\\]])*?))?" +            // optional description
        "\\]\\]");                                  // end
private static final Pattern intPattern = Pattern.compile(
"\\[\\[" + // begin
@ -78,66 +93,114 @@ public class LinkToken extends AbstractToken {
"\\]"); // end
private static final Pattern[] patterns = new Pattern[] {
imgPattern, intPattern, extPattern };
imgPattern, bkmPattern, intPattern, extPattern };
private final String localhost;
private final String wikiPath;
private final plasmaSwitchboard sb;
private int patternNr = 0;
public LinkToken(String localhost, String wikiPath) {
public LinkToken(String localhost, String wikiPath, plasmaSwitchboard sb) {
this.localhost = localhost;
this.wikiPath = wikiPath;
this.sb = sb;
}
protected boolean parse() {
protected void parse() {
StringBuffer sb = new StringBuffer();
Matcher m;
switch (this.patternNr) {
if (this.patternNr < 0 || this.patternNr >= patterns.length)
throw new WikiParserException("patternNr was not set correctly: " + this.patternNr);
Matcher m = patterns[this.patternNr].matcher(this.text);
if (!m.find())
throw new WikiParserException("Didn't find match for: (" + this.patternNr + ") " + this.text);
switch (this.patternNr) {
case IMG:
m = imgPattern.matcher(this.text);
if (!m.find()) return false;
sb.append("<img src=\"").append(formatLink(m.group(1))).append("\"");
sb.append("<img src=\"").append(formatHref(m.group(1).substring(6))).append("\"");
if (m.group(5) != null) sb.append(" align=\"").append(m.group(5)).append("\"");
if (m.group(7) != null) sb.append(" alt=\"").append(m.group(7)).append("\"");
sb.append(" alt=\"").append((m.group(7) == null) ? formatHref(m.group(1).substring(6)) : m.group(7)).append("\"");
sb.append(" />");
break;
case BKM:
Link[] links = getLinksFromBookmarkTag(m.group(2));
if (links == null) {
sb.append("<span class=\"error\">Couldn't find Bookmark-Tag '").append(m.group(2)).append("'.</span>");
} else {
appendLinks(links, sb);
}
break;
case INT:
m = intPattern.matcher(this.text);
if (!m.find()) return false;
sb.append("<a href=\"").append("http://").append(this.localhost)
.append("/").append(this.wikiPath).append(m.group(1))
.append("\"");
if (m.group(4) != null) sb.append(" title=\"").append(m.group(3)).append("\"");
sb.append(">");
if (m.group(4) != null) sb.append(m.group(4)); else sb.append(m.group(1));
sb.append("</a>");
sb.append(new Link(
"http://" + this.localhost + "/" + this.wikiPath + m.group(1),
m.group(4),
(m.group(4) == null) ? m.group(1) : m.group(4)
).toString());
break;
case EXT:
m = extPattern.matcher(this.text);
if (!m.find()) return false;
sb.append("<a href=\"").append(formatLink(m.group(1))).append("\"");
if (m.group(3) != null) sb.append(" title=\"").append(m.group(3)).append("\"");
sb.append(">");
if (m.group(3) != null) sb.append(m.group(3)); else sb.append(m.group(1));
sb.append("</a>");
sb.append(new Link(
m.group(1),
m.group(3),
(m.group(3) == null) ? m.group(1) : m.group(3)
).toString());
break;
default: return false;
}
this.parsed = true;
this.markup = new String(sb);
return true;
}
/**
 * Returns the given link as an absolute address.
 *
 * @param link link target as written in the wiki text
 * @return the link itself if it contains "://", otherwise the link
 *         prefixed with "http://" + localhost + "/"
 */
private String formatLink(String link) {
    final boolean hasProtocol = link.indexOf("://") != -1;
    // links without a protocol are treated as DATA/HTDOCS-links, everything else is a 'normal' link
    return hasProtocol ? link : "http://" + this.localhost + "/" + link;
}
/**
 * Returns the given link as an absolute address, resolving protocol-less
 * links below the "/share/" path of the local host.
 *
 * @param link link target as written in the wiki text
 * @return the link itself if it contains "://", otherwise the link
 *         prefixed with "http://" + localhost + "/share/"
 */
private String formatHref(String link) {
    final boolean hasProtocol = link.indexOf("://") != -1;
    // links without a protocol are treated as DATA/HTDOCS-links, everything else is a 'normal' link
    return hasProtocol ? link : "http://" + this.localhost + "/share/" + link;
}
/**
 * Appends the markup of every link to the given buffer, in array order.
 *
 * @param links the links to render, must not be null
 * @param sb buffer that receives the markup
 * @return the buffer passed in as <code>sb</code>
 */
private StringBuffer appendLinks(Link[] links, StringBuffer sb) {
    final int count = links.length;
    int pos = 0;
    while (pos < count) {
        sb.append(links[pos].toString());
        pos++;
    }
    return sb;
}
/**
 * Collects a link for every bookmark stored under the given bookmark tag.
 * URL hashes that are null or for which no bookmark is found are skipped.
 *
 * @param tagName name of the bookmark tag to look up
 * @return one Link per bookmark found (possibly an empty array), or
 *         <code>null</code> if the bookmarks DB has no tag of that name
 */
private Link[] getLinksFromBookmarkTag(String tagName) {
    final Tag tag = this.sb.bookmarksDB.getTag(bookmarksDB.tagHash(tagName));
    if (tag == null) {
        return null;
    }

    final ArrayList found = new ArrayList();
    for (Iterator hashes = tag.getUrlHashes().iterator(); hashes.hasNext(); ) {
        final String urlHash = (String)hashes.next();
        if (urlHash == null) {
            continue;
        }
        final Bookmark bookmark = this.sb.bookmarksDB.getBookmark(urlHash);
        if (bookmark == null) {
            continue;
        }
        found.add(new Link(bookmark.getUrl(), bookmark.getTitle(), bookmark.getDescription()));
    }
    return (Link[])found.toArray(new Link[found.size()]);
}
/**
 * Immutable value holder for a hyperlink that renders itself as an
 * HTML anchor element via {@link #toString()}.
 */
private static class Link {

    private final String href;
    private final String title;
    private final String desc;

    /**
     * @param href target address, written into the href attribute
     * @param title value of the title attribute, or null to omit the attribute
     * @param desc visible link text, or null to show the href instead
     */
    public Link(String href, String title, String desc) {
        this.href = href;
        this.title = title;
        this.desc = desc;
    }

    /**
     * Renders this link as an anchor element. Values are written as they
     * are; no escaping is applied here.
     *
     * @return the anchor element as a string
     */
    public String toString() {
        final StringBuffer html = new StringBuffer("<a href=\"");
        html.append(this.href).append("\"");
        if (this.title != null) {
            html.append(" title=\"").append(this.title).append("\"");
        }
        html.append(">");
        // fall back to the address when no link text was given
        html.append((this.desc != null) ? this.desc : this.href);
        html.append("</a>");
        return html.toString();
    }
}
public String[] getBlockElementNames() { return null; }
public Pattern[] getRegex() { return patterns; }

@ -104,12 +104,11 @@ public class ListToken extends AbstractToken {
blockElements = (String[])r.toArray(new String[r.size()]);
}
protected boolean parse() {
protected void parse() {
StringBuffer sb = new StringBuffer(this.text.length());
parse(this.text.split("\n"), 0, sb);
this.markup = new String(sb);
this.parsed = true;
return true;
}
protected StringBuffer parse(String[] t, int depth, StringBuffer sb) {

@ -51,6 +51,8 @@ import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.data.wiki.WikiParserException;
public class SimpleToken extends AbstractToken {
protected String content = null;
@ -94,19 +96,16 @@ public class SimpleToken extends AbstractToken {
setText(this.text, 0);
}
}
if (!this.parsed && !parse()) return this.text;
if (!this.parsed) try { parse(); } catch (WikiParserException e) { return this.text; }
return this.markup;
}
protected boolean parse() {
protected void parse() {
String[] e;
if ((e = definitionList[this.grade]) == null || definitionList.length <= this.grade) {
System.err.println("token not defined for grade: " + this.grade);
return false;
}
if (this.grade >= this.definitionList.length || (e = this.definitionList[this.grade]) == null)
throw new WikiParserException("Token not defined for grade: " + this.grade);
this.markup = getMarkup(e);
this.parsed = true;
return true;
}
protected String getMarkup(String[] es) {

@ -48,7 +48,6 @@
package de.anomic.data.wiki.tokens;
import java.util.HashMap;
import java.util.Iterator;
import java.util.regex.Pattern;
public class TableToken extends AbstractToken {
@ -61,7 +60,7 @@ public class TableToken extends AbstractToken {
};
private static final String[] blockElementNames = new String[] { "table", "tr", "td" };
protected boolean parse() {
protected void parse() {
String[] t = text.split("\n");
String[] tds;
StringBuffer sb = new StringBuffer();
@ -87,7 +86,6 @@ public class TableToken extends AbstractToken {
if (trOpen) sb.append("\t</tr>\n");
this.markup = new String(sb.append("</table>"));
this.parsed = true;
return true;
}
// from de.anomic.data.wikiCode.java.parseTableProperties, modified by [FB]
@ -105,45 +103,38 @@ public class TableToken extends AbstractToken {
* Valid in this case means if they are a property for the table, tr or td
* tag as stated in the HTML Pocket Reference by Jennifer Niederst (1st edition)
* The method is important to avoid XSS attacks on the wiki via table properties.
* @param str A string that may contain several table properties and/or junk.
* @param properties A string that may contain several table properties and/or junk.
* @return A string that only contains table properties.
*/
private static StringBuffer parseTableProperties(final String properties){
String[] values = properties.replaceAll("&quot;", "").split("[= ]"); //splitting the string at = and blanks
StringBuffer sb = new StringBuffer(properties.length());
Iterator it;
String key, valkey, value;
String key, value;
String[] posVals;
int numberofvalues = values.length;
main: for (int i=0; i<numberofvalues; i++) {
valkey = values[i].trim();
for (int i=0; i<numberofvalues; i++) {
key = values[i].trim();
if (i + 1 < numberofvalues) {
value = values[++i].trim();
if (
valkey.equals("summary") ||
(valkey.equals("bgcolor") && value.matches("#{0,1}[0-9a-fA-F]{1,6}|[a-zA-Z]{3,}")) ||
((valkey.equals("width") || valkey.equals("height")) && value.matches("\\d+%{0,1}")) ||
(isInArray(tps, valkey) && value.matches("\\d+"))
(key.equals("summary")) ||
(key.equals("bgcolor") && value.matches("#{0,1}[0-9a-fA-F]{1,6}|[a-zA-Z]{3,}")) ||
((key.equals("width") || key.equals("height")) && value.matches("\\d+%{0,1}")) ||
((posVals = (String[])ps.get(key)) != null && isInArray(posVals, value)) ||
(isInArray(tps, key) && value.matches("\\d+"))
) {
addPair(valkey, value, sb);
addPair(key, value, sb);
continue;
}
it = ps.keySet().iterator();
while (it.hasNext()) {
key = (String)it.next();
if (valkey.equals(key) && isInArray((String[])ps.get(key), (String)value)) {
addPair(valkey, value, sb);
continue main;
}
}
}
if (valkey.equals("nowrap"))
sb.append(" nowrap");
if (key.equals("nowrap"))
addPair("nowrap", "nowrap", sb);
}
return sb;
}
private static StringBuffer addPair(String val1, String val2, StringBuffer sb) {
return sb.append(" ").append(val1).append("=\"").append(val2).append("\"");
private static StringBuffer addPair(String key, String value, StringBuffer sb) {
return sb.append(" ").append(key).append("=\"").append(value).append("\"");
}
private static boolean isInArray(Object[] array, Object find) {

@ -56,43 +56,46 @@ import de.anomic.data.wiki.tokens.ListToken;
import de.anomic.data.wiki.tokens.SimpleToken;
import de.anomic.data.wiki.tokens.TableToken;
import de.anomic.data.wiki.tokens.Token;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.yacy.yacyCore;
public class wikiParser {
public static final Token[] tokens = {
new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
new LinkToken("localhost:8080", "Wiki.html?page="),
new ListToken('*', "ul"),
new ListToken('#', "ol"),
new ListToken(':', "blockquote", null),
new ListToken(' ', null, "tt", false),
new DefinitionListToken(),
new TableToken()
};
private static final String[] BEs;
static {
ArrayList r = new ArrayList();
for (int i=0, k, j; i<tokens.length; i++)
if (tokens[i].getBlockElementNames() != null)
for (j=0; j<tokens[i].getBlockElementNames().length; j++) {
if (tokens[i].getBlockElementNames()[j] == null) continue;
if ((k = tokens[i].getBlockElementNames()[j].indexOf(' ')) > 1) {
r.add(tokens[i].getBlockElementNames()[j].substring(0, k));
} else {
r.add(tokens[i].getBlockElementNames()[j]);
}
}
r.add("hr");
BEs = (String[])r.toArray(new String[r.size()]);
}
public final Token[] tokens;
private final String[] BEs;
/**
 * Creates a parser with its set of tokens and collects the names of all
 * block elements these tokens declare (plus "hr") into <code>BEs</code>.
 *
 * @param sb the switchboard, handed on to the LinkToken
 */
public wikiParser(plasmaSwitchboard sb) {
    tokens = new Token[] {
            new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
            new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
            new LinkToken("localhost:8080"/*yacyCore.seedDB.mySeed.getAddress()*/, "Wiki.html?page=", sb),
            new ListToken('*', "ul"),
            new ListToken('#', "ol"),
            new ListToken(':', "blockquote", null),
            new ListToken(' ', null, "tt", false),
            new DefinitionListToken(),
            new TableToken()
    };

    final ArrayList names = new ArrayList();
    for (int t = 0; t < tokens.length; t++) {
        final String[] declared = tokens[t].getBlockElementNames();
        if (declared == null) {
            continue;
        }
        for (int n = 0; n < declared.length; n++) {
            final String name = declared[n];
            if (name == null) {
                continue;
            }
            // keep only the part before the first blank if that blank is found beyond index 1
            final int blank = name.indexOf(' ');
            names.add((blank > 1) ? name.substring(0, blank) : name);
        }
    }
    names.add("hr");
    BEs = (String[])names.toArray(new String[names.size()]);
}
public static void main(String[] args) {
String text = "===Title===\n" +
"==blubb[== was ==ein '''shice'''==...och.bla\n" +
String text = "===T<pre>itle===\n" +
"==blubb== was ==ein '''shice'''==...och.bla\n" +
"* ein \n" +
"*==test==\n" +
"*==test=</pre>=\n" +
"** doppelt\n" +
"* ''tess*sst''\n" +
"*** xyz\n" +
@ -118,29 +121,31 @@ public class wikiParser {
":doppel-blubb[= huch =]\n" +
";hier:da\n" +
";dort:und so\n" +
";;und:doppelt";
";;und:doppelt\n\n\n\n" +
"[[Image:blubb|BLA]]";
// text = "[=\n=]* bla";
String t = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar," +
"[=denk ich=] mal =]";
long l = System.currentTimeMillis();
t = parse((args.length > 0) ? args[0] : text);
t = new wikiParser(null).parse((args.length > 0) ? args[0] : text);
System.out.println("parsing time: " + (System.currentTimeMillis() - l) + " ms");
System.out.println("--- --- ---");
System.out.println(t);
}
// TODO:
// - preParse:
// - <pre>~</pre>
public static String parse(String text) {
public String parse(String text) {
Text[] tt = Text.split2Texts(text, "[=", "=]");
for (int i=0; i<tt.length; i+=2)
tt[i].setText(parseUnescaped(tt[i].getText()));
return replaceBRs(Text.mergeTexts(tt));
text = Text.mergeTexts(tt);
tt = Text.split2Texts(text, "<pre>", "</pre>");
for (int i=0; i<tt.length; i+=2)
tt[i].setText(replaceBRs(tt[i].getText()));
return Text.mergeTexts(tt);
}
public static String parseUnescaped(String text) {
public String parseUnescaped(String text) {
Token st;
Matcher m;
StringBuffer sb;
@ -166,7 +171,7 @@ public class wikiParser {
return text.replaceAll("----", "<hr />");
}
private static String replaceBRs(String text) {
private String replaceBRs(String text) {
StringBuffer sb = new StringBuffer(text.length());
String[] tt = text.split("\n");
boolean replace;
@ -175,8 +180,10 @@ public class wikiParser {
for (j=0; j<BEs.length; j++)
if (tt[i].endsWith(BEs[j] + ">")) { replace = false; break; }
sb.append(tt[i]);
if (replace && i < tt.length - 1) sb.append("<br />");
if (i < tt.length - 1) sb.append("\n");
if (i < tt.length - 1) {
if (replace) sb.append("<br />");
sb.append("\n");
}
}
return new String(sb);
}
@ -193,7 +200,7 @@ public class wikiParser {
this.text = text;
this.escaped = escaped;
this.nl = newLineBefore;
}
}
public String setTextPlain(String text) { return this.text = text; }
public String setText(String text) {
@ -215,22 +222,23 @@ public class wikiParser {
public String toString() { return this.text; }
public boolean isEscaped() { return this.escaped; }
public boolean isNewLineBefore() { return this.nl; }
private static Text[] split2Texts(String text, String escapeBegin, String escapeEnd) {
if (text == null) return null;
if (text.length() < 2) return new Text[] { new Text(text, false, true) };
int startLen = escapeBegin.length();
int endLen = escapeEnd.length();
ArrayList r = new ArrayList();
boolean escaped = text.startsWith(escapeBegin);
if (escaped) r.add(new Text("", false, true));
int i, j = 0;
while ((i = text.indexOf((escaped) ? escapeEnd : escapeBegin, j)) > -1) {
r.add(resolve2Text(text, escaped, (j > 0) ? j + startLen : 0, i, escapeEnd));
r.add(resolve2Text(text, escaped, (j > 0) ? j + ((escaped) ? startLen : endLen) : 0, i, escapeEnd));
j = i;
escaped = !escaped;
}
r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? j + startLen : 0, -1, escapeEnd));
r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? j + endLen : 0, -1, escapeEnd));
return (Text[])r.toArray(new Text[r.size()]);
}

Loading…
Cancel
Save