- added support for [[Bookmark:$bookmarkTag|description]]-link-listings (requested by theli) to wiki-parser

- added support for <pre>-tags to wiki-parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3393 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent 619653c054
commit 39a2000d8b

@ -0,0 +1,20 @@
package de.anomic.data.wiki;
public class WikiParserException extends RuntimeException {
private static final long serialVersionUID = 1L;
public WikiParserException() { }
public WikiParserException(String message) {
super(message);
}
public WikiParserException(Throwable cause) {
super(cause);
}
public WikiParserException(String message, Throwable cause) {
super(message, cause);
}
}

@ -53,12 +53,12 @@ public abstract class AbstractToken implements Token {
protected String markup = null; protected String markup = null;
protected boolean parsed = false; protected boolean parsed = false;
protected abstract boolean parse(); protected abstract void parse();
public String getMarkup() { public String getMarkup() {
if (this.text == null) if (this.text == null)
throw new IllegalArgumentException(); throw new IllegalArgumentException();
if (!this.parsed && !parse()) return this.text; if (!this.parsed) parse();
return this.markup; return this.markup;
} }

@ -47,14 +47,23 @@
package de.anomic.data.wiki.tokens; package de.anomic.data.wiki.tokens;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import de.anomic.data.bookmarksDB;
import de.anomic.data.bookmarksDB.Bookmark;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.data.wiki.WikiParserException;
import de.anomic.plasma.plasmaSwitchboard;
public class LinkToken extends AbstractToken { public class LinkToken extends AbstractToken {
private static final int IMG = 0; private static final int IMG = 0;
private static final int INT = 1; private static final int BKM = 1;
private static final int EXT = 2; private static final int INT = 2;
private static final int EXT = 3;
private static final Pattern imgPattern = Pattern.compile( private static final Pattern imgPattern = Pattern.compile(
"\\[\\[" + // begin "\\[\\[" + // begin
@ -64,6 +73,12 @@ public class LinkToken extends AbstractToken {
"(\\|(([^\\]]|\\][^\\]])*))" + // description "(\\|(([^\\]]|\\][^\\]])*))" + // description
")?" + // </optional> ")?" + // </optional>
"\\]\\]"); // end "\\]\\]"); // end
private static final Pattern bkmPattern = Pattern.compile(
"\\[\\[" + // begin
"(Bookmark:([^\\]|]|\\][^\\]])*)" + // "Bookmark:" + URL
"(\\|(([^\\]]|\\][^\\]])*?))?" + // optional description
"\\]\\]"); // end
private static final Pattern intPattern = Pattern.compile( private static final Pattern intPattern = Pattern.compile(
"\\[\\[" + // begin "\\[\\[" + // begin
@ -78,66 +93,114 @@ public class LinkToken extends AbstractToken {
"\\]"); // end "\\]"); // end
private static final Pattern[] patterns = new Pattern[] { private static final Pattern[] patterns = new Pattern[] {
imgPattern, intPattern, extPattern }; imgPattern, bkmPattern, intPattern, extPattern };
private final String localhost; private final String localhost;
private final String wikiPath; private final String wikiPath;
private final plasmaSwitchboard sb;
private int patternNr = 0; private int patternNr = 0;
public LinkToken(String localhost, String wikiPath) { public LinkToken(String localhost, String wikiPath, plasmaSwitchboard sb) {
this.localhost = localhost; this.localhost = localhost;
this.wikiPath = wikiPath; this.wikiPath = wikiPath;
this.sb = sb;
} }
protected boolean parse() { protected void parse() {
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
Matcher m; if (this.patternNr < 0 || this.patternNr >= patterns.length)
switch (this.patternNr) { throw new WikiParserException("patternNr was not set correctly: " + this.patternNr);
Matcher m = patterns[this.patternNr].matcher(this.text);
if (!m.find())
throw new WikiParserException("Didn't find match for: (" + this.patternNr + ") " + this.text);
switch (this.patternNr) {
case IMG: case IMG:
m = imgPattern.matcher(this.text); sb.append("<img src=\"").append(formatHref(m.group(1).substring(6))).append("\"");
if (!m.find()) return false;
sb.append("<img src=\"").append(formatLink(m.group(1))).append("\"");
if (m.group(5) != null) sb.append(" align=\"").append(m.group(5)).append("\""); if (m.group(5) != null) sb.append(" align=\"").append(m.group(5)).append("\"");
if (m.group(7) != null) sb.append(" alt=\"").append(m.group(7)).append("\""); sb.append(" alt=\"").append((m.group(7) == null) ? formatHref(m.group(1).substring(6)) : m.group(7)).append("\"");
sb.append(" />"); sb.append(" />");
break; break;
case BKM:
Link[] links = getLinksFromBookmarkTag(m.group(2));
if (links == null) {
sb.append("<span class=\"error\">Couldn't find Bookmark-Tag '").append(m.group(2)).append("'.</span>");
} else {
appendLinks(links, sb);
}
break;
case INT: case INT:
m = intPattern.matcher(this.text); sb.append(new Link(
if (!m.find()) return false; "http://" + this.localhost + "/" + this.wikiPath + m.group(1),
sb.append("<a href=\"").append("http://").append(this.localhost) m.group(4),
.append("/").append(this.wikiPath).append(m.group(1)) (m.group(4) == null) ? m.group(1) : m.group(4)
.append("\""); ).toString());
if (m.group(4) != null) sb.append(" title=\"").append(m.group(3)).append("\"");
sb.append(">");
if (m.group(4) != null) sb.append(m.group(4)); else sb.append(m.group(1));
sb.append("</a>");
break; break;
case EXT: case EXT:
m = extPattern.matcher(this.text); sb.append(new Link(
if (!m.find()) return false; m.group(1),
sb.append("<a href=\"").append(formatLink(m.group(1))).append("\""); m.group(3),
if (m.group(3) != null) sb.append(" title=\"").append(m.group(3)).append("\""); (m.group(3) == null) ? m.group(1) : m.group(3)
sb.append(">"); ).toString());
if (m.group(3) != null) sb.append(m.group(3)); else sb.append(m.group(1));
sb.append("</a>");
break; break;
default: return false;
} }
this.parsed = true; this.parsed = true;
this.markup = new String(sb); this.markup = new String(sb);
return true;
}
private String formatLink(String link) {
if (link.indexOf("://") == -1) { // DATA/HTDOCS-link
return "http://" + this.localhost + "/" + link;
} else { // 'normal' link
return link;
}
} }
private String formatHref(String link) {
if (link.indexOf("://") == -1) { // DATA/HTDOCS-link
return "http://" + this.localhost + "/share/" + link;
} else { // 'normal' link
return link;
}
}
private StringBuffer appendLinks(Link[] links, StringBuffer sb) {
for (int i=0; i<links.length; i++)
sb.append(links[i].toString());
return sb;
}
private Link[] getLinksFromBookmarkTag(String tagName) {
Tag tag = this.sb.bookmarksDB.getTag(bookmarksDB.tagHash(tagName));
if (tag == null) return null;
ArrayList r = new ArrayList();
Iterator it = tag.getUrlHashes().iterator();
String hash;
Bookmark bm;
while (it.hasNext())
if ((hash = (String)it.next()) != null)
if ((bm = this.sb.bookmarksDB.getBookmark(hash)) != null)
r.add(new Link(bm.getUrl(), bm.getTitle(), bm.getDescription()));
return (Link[])r.toArray(new Link[r.size()]);
}
private static class Link {
private final String href;
private final String title;
private final String desc;
public Link(String href, String title, String desc) {
this.href = href;
this.title = title;
this.desc = desc;
}
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append("<a href=\"").append(this.href).append("\"");
if (this.title != null) sb.append(" title=\"").append(this.title).append("\"");
sb.append(">");
if (this.desc == null) sb.append(this.href); else sb.append(this.desc);
sb.append("</a>");
return new String(sb);
}
}
public String[] getBlockElementNames() { return null; } public String[] getBlockElementNames() { return null; }
public Pattern[] getRegex() { return patterns; } public Pattern[] getRegex() { return patterns; }

@ -104,12 +104,11 @@ public class ListToken extends AbstractToken {
blockElements = (String[])r.toArray(new String[r.size()]); blockElements = (String[])r.toArray(new String[r.size()]);
} }
protected boolean parse() { protected void parse() {
StringBuffer sb = new StringBuffer(this.text.length()); StringBuffer sb = new StringBuffer(this.text.length());
parse(this.text.split("\n"), 0, sb); parse(this.text.split("\n"), 0, sb);
this.markup = new String(sb); this.markup = new String(sb);
this.parsed = true; this.parsed = true;
return true;
} }
protected StringBuffer parse(String[] t, int depth, StringBuffer sb) { protected StringBuffer parse(String[] t, int depth, StringBuffer sb) {

@ -51,6 +51,8 @@ import java.util.ArrayList;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import de.anomic.data.wiki.WikiParserException;
public class SimpleToken extends AbstractToken { public class SimpleToken extends AbstractToken {
protected String content = null; protected String content = null;
@ -94,19 +96,16 @@ public class SimpleToken extends AbstractToken {
setText(this.text, 0); setText(this.text, 0);
} }
} }
if (!this.parsed && !parse()) return this.text; if (!this.parsed) try { parse(); } catch (WikiParserException e) { return this.text; }
return this.markup; return this.markup;
} }
protected boolean parse() { protected void parse() {
String[] e; String[] e;
if ((e = definitionList[this.grade]) == null || definitionList.length <= this.grade) { if (this.grade >= this.definitionList.length || (e = this.definitionList[this.grade]) == null)
System.err.println("token not defined for grade: " + this.grade); throw new WikiParserException("Token not defined for grade: " + this.grade);
return false;
}
this.markup = getMarkup(e); this.markup = getMarkup(e);
this.parsed = true; this.parsed = true;
return true;
} }
protected String getMarkup(String[] es) { protected String getMarkup(String[] es) {

@ -48,7 +48,6 @@
package de.anomic.data.wiki.tokens; package de.anomic.data.wiki.tokens;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.regex.Pattern; import java.util.regex.Pattern;
public class TableToken extends AbstractToken { public class TableToken extends AbstractToken {
@ -61,7 +60,7 @@ public class TableToken extends AbstractToken {
}; };
private static final String[] blockElementNames = new String[] { "table", "tr", "td" }; private static final String[] blockElementNames = new String[] { "table", "tr", "td" };
protected boolean parse() { protected void parse() {
String[] t = text.split("\n"); String[] t = text.split("\n");
String[] tds; String[] tds;
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
@ -87,7 +86,6 @@ public class TableToken extends AbstractToken {
if (trOpen) sb.append("\t</tr>\n"); if (trOpen) sb.append("\t</tr>\n");
this.markup = new String(sb.append("</table>")); this.markup = new String(sb.append("</table>"));
this.parsed = true; this.parsed = true;
return true;
} }
// from de.anomic.data.wikiCode.java.parseTableProperties, modified by [FB] // from de.anomic.data.wikiCode.java.parseTableProperties, modified by [FB]
@ -105,45 +103,38 @@ public class TableToken extends AbstractToken {
* Valid in this case means if they are a property for the table, tr or td * Valid in this case means if they are a property for the table, tr or td
* tag as stated in the HTML Pocket Reference by Jennifer Niederst (1st edition) * tag as stated in the HTML Pocket Reference by Jennifer Niederst (1st edition)
* The method is important to avoid XSS attacks on the wiki via table properties. * The method is important to avoid XSS attacks on the wiki via table properties.
* @param str A string that may contain several table properties and/or junk. * @param properties A string that may contain several table properties and/or junk.
* @return A string that only contains table properties. * @return A string that only contains table properties.
*/ */
private static StringBuffer parseTableProperties(final String properties){ private static StringBuffer parseTableProperties(final String properties){
String[] values = properties.replaceAll("&quot;", "").split("[= ]"); //splitting the string at = and blanks String[] values = properties.replaceAll("&quot;", "").split("[= ]"); //splitting the string at = and blanks
StringBuffer sb = new StringBuffer(properties.length()); StringBuffer sb = new StringBuffer(properties.length());
Iterator it; String key, value;
String key, valkey, value; String[] posVals;
int numberofvalues = values.length; int numberofvalues = values.length;
main: for (int i=0; i<numberofvalues; i++) { for (int i=0; i<numberofvalues; i++) {
valkey = values[i].trim(); key = values[i].trim();
if (i + 1 < numberofvalues) { if (i + 1 < numberofvalues) {
value = values[++i].trim(); value = values[++i].trim();
if ( if (
valkey.equals("summary") || (key.equals("summary")) ||
(valkey.equals("bgcolor") && value.matches("#{0,1}[0-9a-fA-F]{1,6}|[a-zA-Z]{3,}")) || (key.equals("bgcolor") && value.matches("#{0,1}[0-9a-fA-F]{1,6}|[a-zA-Z]{3,}")) ||
((valkey.equals("width") || valkey.equals("height")) && value.matches("\\d+%{0,1}")) || ((key.equals("width") || key.equals("height")) && value.matches("\\d+%{0,1}")) ||
(isInArray(tps, valkey) && value.matches("\\d+")) ((posVals = (String[])ps.get(key)) != null && isInArray(posVals, value)) ||
(isInArray(tps, key) && value.matches("\\d+"))
) { ) {
addPair(valkey, value, sb); addPair(key, value, sb);
continue; continue;
} }
it = ps.keySet().iterator();
while (it.hasNext()) {
key = (String)it.next();
if (valkey.equals(key) && isInArray((String[])ps.get(key), (String)value)) {
addPair(valkey, value, sb);
continue main;
}
}
} }
if (valkey.equals("nowrap")) if (key.equals("nowrap"))
sb.append(" nowrap"); addPair("nowrap", "nowrap", sb);
} }
return sb; return sb;
} }
private static StringBuffer addPair(String val1, String val2, StringBuffer sb) { private static StringBuffer addPair(String key, String value, StringBuffer sb) {
return sb.append(" ").append(val1).append("=\"").append(val2).append("\""); return sb.append(" ").append(key).append("=\"").append(value).append("\"");
} }
private static boolean isInArray(Object[] array, Object find) { private static boolean isInArray(Object[] array, Object find) {

@ -56,43 +56,46 @@ import de.anomic.data.wiki.tokens.ListToken;
import de.anomic.data.wiki.tokens.SimpleToken; import de.anomic.data.wiki.tokens.SimpleToken;
import de.anomic.data.wiki.tokens.TableToken; import de.anomic.data.wiki.tokens.TableToken;
import de.anomic.data.wiki.tokens.Token; import de.anomic.data.wiki.tokens.Token;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.yacy.yacyCore;
public class wikiParser { public class wikiParser {
public static final Token[] tokens = { public final Token[] tokens;
new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true), private final String[] BEs;
new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
new LinkToken("localhost:8080", "Wiki.html?page="), public wikiParser(plasmaSwitchboard sb) {
new ListToken('*', "ul"), tokens = new Token[] {
new ListToken('#', "ol"), new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
new ListToken(':', "blockquote", null), new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
new ListToken(' ', null, "tt", false), new LinkToken("localhost:8080"/*yacyCore.seedDB.mySeed.getAddress()*/, "Wiki.html?page=", sb),
new DefinitionListToken(), new ListToken('*', "ul"),
new TableToken() new ListToken('#', "ol"),
}; new ListToken(':', "blockquote", null),
new ListToken(' ', null, "tt", false),
private static final String[] BEs; new DefinitionListToken(),
static { new TableToken()
ArrayList r = new ArrayList(); };
for (int i=0, k, j; i<tokens.length; i++) ArrayList r = new ArrayList();
if (tokens[i].getBlockElementNames() != null) for (int i=0, k, j; i<tokens.length; i++)
for (j=0; j<tokens[i].getBlockElementNames().length; j++) { if (tokens[i].getBlockElementNames() != null)
if (tokens[i].getBlockElementNames()[j] == null) continue; for (j=0; j<tokens[i].getBlockElementNames().length; j++) {
if ((k = tokens[i].getBlockElementNames()[j].indexOf(' ')) > 1) { if (tokens[i].getBlockElementNames()[j] == null) continue;
r.add(tokens[i].getBlockElementNames()[j].substring(0, k)); if ((k = tokens[i].getBlockElementNames()[j].indexOf(' ')) > 1) {
} else { r.add(tokens[i].getBlockElementNames()[j].substring(0, k));
r.add(tokens[i].getBlockElementNames()[j]); } else {
} r.add(tokens[i].getBlockElementNames()[j]);
} }
r.add("hr"); }
BEs = (String[])r.toArray(new String[r.size()]); r.add("hr");
} BEs = (String[])r.toArray(new String[r.size()]);
}
public static void main(String[] args) { public static void main(String[] args) {
String text = "===Title===\n" + String text = "===T<pre>itle===\n" +
"==blubb[== was ==ein '''shice'''==...och.bla\n" + "==blubb== was ==ein '''shice'''==...och.bla\n" +
"* ein \n" + "* ein \n" +
"*==test==\n" + "*==test=</pre>=\n" +
"** doppelt\n" + "** doppelt\n" +
"* ''tess*sst''\n" + "* ''tess*sst''\n" +
"*** xyz\n" + "*** xyz\n" +
@ -118,29 +121,31 @@ public class wikiParser {
":doppel-blubb[= huch =]\n" + ":doppel-blubb[= huch =]\n" +
";hier:da\n" + ";hier:da\n" +
";dort:und so\n" + ";dort:und so\n" +
";;und:doppelt"; ";;und:doppelt\n\n\n\n" +
"[[Image:blubb|BLA]]";
// text = "[=\n=]* bla"; // text = "[=\n=]* bla";
String t = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar," + String t = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar," +
"[=denk ich=] mal =]"; "[=denk ich=] mal =]";
long l = System.currentTimeMillis(); long l = System.currentTimeMillis();
t = parse((args.length > 0) ? args[0] : text); t = new wikiParser(null).parse((args.length > 0) ? args[0] : text);
System.out.println("parsing time: " + (System.currentTimeMillis() - l) + " ms"); System.out.println("parsing time: " + (System.currentTimeMillis() - l) + " ms");
System.out.println("--- --- ---"); System.out.println("--- --- ---");
System.out.println(t); System.out.println(t);
} }
// TODO: public String parse(String text) {
// - preParse:
// - <pre>~</pre>
public static String parse(String text) {
Text[] tt = Text.split2Texts(text, "[=", "=]"); Text[] tt = Text.split2Texts(text, "[=", "=]");
for (int i=0; i<tt.length; i+=2) for (int i=0; i<tt.length; i+=2)
tt[i].setText(parseUnescaped(tt[i].getText())); tt[i].setText(parseUnescaped(tt[i].getText()));
return replaceBRs(Text.mergeTexts(tt)); text = Text.mergeTexts(tt);
tt = Text.split2Texts(text, "<pre>", "</pre>");
for (int i=0; i<tt.length; i+=2)
tt[i].setText(replaceBRs(tt[i].getText()));
return Text.mergeTexts(tt);
} }
public static String parseUnescaped(String text) { public String parseUnescaped(String text) {
Token st; Token st;
Matcher m; Matcher m;
StringBuffer sb; StringBuffer sb;
@ -166,7 +171,7 @@ public class wikiParser {
return text.replaceAll("----", "<hr />"); return text.replaceAll("----", "<hr />");
} }
private static String replaceBRs(String text) { private String replaceBRs(String text) {
StringBuffer sb = new StringBuffer(text.length()); StringBuffer sb = new StringBuffer(text.length());
String[] tt = text.split("\n"); String[] tt = text.split("\n");
boolean replace; boolean replace;
@ -175,8 +180,10 @@ public class wikiParser {
for (j=0; j<BEs.length; j++) for (j=0; j<BEs.length; j++)
if (tt[i].endsWith(BEs[j] + ">")) { replace = false; break; } if (tt[i].endsWith(BEs[j] + ">")) { replace = false; break; }
sb.append(tt[i]); sb.append(tt[i]);
if (replace && i < tt.length - 1) sb.append("<br />"); if (i < tt.length - 1) {
if (i < tt.length - 1) sb.append("\n"); if (replace) sb.append("<br />");
sb.append("\n");
}
} }
return new String(sb); return new String(sb);
} }
@ -193,7 +200,7 @@ public class wikiParser {
this.text = text; this.text = text;
this.escaped = escaped; this.escaped = escaped;
this.nl = newLineBefore; this.nl = newLineBefore;
} }
public String setTextPlain(String text) { return this.text = text; } public String setTextPlain(String text) { return this.text = text; }
public String setText(String text) { public String setText(String text) {
@ -215,22 +222,23 @@ public class wikiParser {
public String toString() { return this.text; } public String toString() { return this.text; }
public boolean isEscaped() { return this.escaped; } public boolean isEscaped() { return this.escaped; }
public boolean isNewLineBefore() { return this.nl; } public boolean isNewLineBefore() { return this.nl; }
private static Text[] split2Texts(String text, String escapeBegin, String escapeEnd) { private static Text[] split2Texts(String text, String escapeBegin, String escapeEnd) {
if (text == null) return null; if (text == null) return null;
if (text.length() < 2) return new Text[] { new Text(text, false, true) }; if (text.length() < 2) return new Text[] { new Text(text, false, true) };
int startLen = escapeBegin.length(); int startLen = escapeBegin.length();
int endLen = escapeEnd.length();
ArrayList r = new ArrayList(); ArrayList r = new ArrayList();
boolean escaped = text.startsWith(escapeBegin); boolean escaped = text.startsWith(escapeBegin);
if (escaped) r.add(new Text("", false, true)); if (escaped) r.add(new Text("", false, true));
int i, j = 0; int i, j = 0;
while ((i = text.indexOf((escaped) ? escapeEnd : escapeBegin, j)) > -1) { while ((i = text.indexOf((escaped) ? escapeEnd : escapeBegin, j)) > -1) {
r.add(resolve2Text(text, escaped, (j > 0) ? j + startLen : 0, i, escapeEnd)); r.add(resolve2Text(text, escaped, (j > 0) ? j + ((escaped) ? startLen : endLen) : 0, i, escapeEnd));
j = i; j = i;
escaped = !escaped; escaped = !escaped;
} }
r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? j + startLen : 0, -1, escapeEnd)); r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? j + endLen : 0, -1, escapeEnd));
return (Text[])r.toArray(new Text[r.size()]); return (Text[])r.toArray(new Text[r.size()]);
} }

Loading…
Cancel
Save