- added first version of new wiki-parser

- added blacklist support to manual URLFetcher stack fill - fix for NPE: http://www.yacy-forum.de/viewtopic.php?t=3559 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3385 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · 0c7b8cf632
parent f7803a6ce4
commit 0c7b8cf632
11 changed files with 779 additions and 11 deletions
--- a/htroot/CrawlURLFetchStack_p.html
+++ b/htroot/CrawlURLFetchStack_p.html
@ -53,6 +53,7 @@
            <input type="file" name="upload" id="upload" /> #(uploadError)#::&nbsp;<span class="error">No file entered for upload</span>#(/uploadError)#<br />
            <input type="radio" name="uploadType" id="plain" value="plain" checked="checked" /> <label for="plain">Plain text, line-seperated</label><br />
            <input type="radio" name="uploadType" id="html" value="html" /> <label for="html">HTML file, links will be added</label><br />
+            <input type="checkbox" name="blacklistCheck" id="blacklistCheck" checked="checked" /> <label for="blacklistCheck">Don't add URLs matching blacklists active for crawler</label><br />
            <input type="submit" name="subupload" value="Upload File" />
            #(upload)#::
            <span class="success">Added #[added]# and rejected #[failed]# URLs from uploaded file successfully</span>::
--- a/htroot/CrawlURLFetchStack_p.java
+++ b/htroot/CrawlURLFetchStack_p.java
@ -58,6 +58,7 @@ import de.anomic.http.httpHeader;
 import de.anomic.net.URL;
 import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverFileUtils;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@ -79,6 +80,7 @@ public class CrawlURLFetchStack_p {
    }
    
    public static final String STREAM_CMD_ADDURLS_      = "ADD URLS: ";
+    public static final String STREAM_CMD_ADDURLSBLCHK_ = "ADD URLS CHECK BLACKLIST: ";
    public static final String STREAM_CMD_END           = "END";
    public static final String STREAM_RESP_OK_ADDURLS_  = "FAILED URLS: ";
    public static final String STREAM_RESP_OK           = "OK";
@ -100,6 +102,7 @@ public class CrawlURLFetchStack_p {
            String line;
            int addurls = 0, cururl = 0;
            boolean[] status = new boolean[0];
+            boolean blchk = false;
            URLFetcherStack stack = getURLFetcherStack(env);
            try {
                while ((line = inrb.readLine()) != null) {
@ -109,6 +112,17 @@ public class CrawlURLFetchStack_p {
                            addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLS_.length()));
                            status = new boolean[addurls];
                            cururl = 0;
+                            blchk = false;
+                            outw.println(STREAM_RESP_OK);
+                        } catch (NumberFormatException e) {
+                            outw.println(STREAM_RESP_FAILED);
+                        }
+                    } else if (line.startsWith(STREAM_CMD_ADDURLSBLCHK_)) {
+                        try {
+                            addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLSBLCHK_.length()));
+                            status = new boolean[addurls];
+                            cururl = 0;
+                            blchk = true;
                            outw.println(STREAM_RESP_OK);
                        } catch (NumberFormatException e) {
                            outw.println(STREAM_RESP_FAILED);
@ -117,7 +131,7 @@ public class CrawlURLFetchStack_p {
                        break;
                    } else {
                        if (cururl < addurls)       // add url
-                            status[cururl++] = addURL(line, stack);
+                            status[cururl++] = addURL(line, blchk, stack);
                        
                        if (cururl > 0 && cururl == addurls ) {
                            // done with parsing the passed URL count, now some status output: i.e. 'FAILED URLS: 5 of 8'
@ -178,8 +192,9 @@ public class CrawlURLFetchStack_p {
                        final String content = new String((byte[])post.get("upload$file"));
                        
                        final String type = post.get("uploadType", "");
+                        final boolean blCheck = post.containsKey("blacklistCheck");
                        if (type.equals("plain")) {
-                            prop.put("upload_added", addURLs(content.split("\n"), getURLFetcherStack(env)));
+                            prop.put("upload_added", addURLs(content.split("\n"), blCheck, getURLFetcherStack(env)));
                            prop.put("upload_failed", 0);
                            prop.put("upload", 1);
                        } else if (type.equals("html")) {
@ -191,10 +206,14 @@ public class CrawlURLFetchStack_p {
                                
                                final Iterator it = ((HashMap)scraper.getAnchors()).keySet().iterator();
                                int added = 0, failed = 0;
-                                String url;
+                                URL url;
                                while (it.hasNext()) try {
-                                    url = (String)it.next();
-                                    getURLFetcherStack(env).push(new URL(url));
+                                    url = new URL((String)it.next());
+                                    if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url)) {
+                                        failed++;
+                                        continue;
+                                    }
+                                    getURLFetcherStack(env).push(url);
                                    added++;
                                } catch (MalformedURLException e) { failed++; }
                                prop.put("upload", 1);
@ -235,17 +254,19 @@ public class CrawlURLFetchStack_p {
        prop.put("peers", count);
    }
    
-    private static int addURLs(String[] urls, URLFetcherStack stack) {
+    private static int addURLs(String[] urls, boolean blCheck, URLFetcherStack stack) {
        int count = -1;
        for (int i=0; i<urls.length; i++)
-            if (addURL(urls[i], stack)) count++;
+            if (addURL(urls[i], blCheck, stack)) count++;
        return count;
    }
    
-    private static boolean addURL(String url, URLFetcherStack stack) {
+    private static boolean addURL(String url, boolean blCheck, URLFetcherStack stack) {
        try {
            if (url == null || url.length() == 0) return false;
-            stack.push(new URL(url));
+            URL u = new URL(url);
+            if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, u)) return false;
+            stack.push(u);
            return true;
        } catch (MalformedURLException e) { return false; }
    }
--- a/htroot/IndexCreateIndexingQueue_p.java
+++ b/htroot/IndexCreateIndexingQueue_p.java
@ -146,9 +146,9 @@ public class IndexCreateIndexingQueue_p {

                    boolean inProcess = i < inProcessCount;
                    pcentry = (plasmaSwitchboardQueue.Entry) entryList.get(i);
+                    if ((pcentry != null)&&(pcentry.url() != null)) {
                        long entrySize = pcentry.size();
                        totalSize += entrySize;
-                    if ((pcentry != null)&&(pcentry.url() != null)) {
                        initiator = yacyCore.seedDB.getConnected(pcentry.initiator());
                        prop.put("indexing-queue_list_"+entryCount+"_dark", (inProcess)? 2: ((dark) ? 1 : 0));
                        prop.put("indexing-queue_list_"+entryCount+"_initiator", ((initiator == null) ? "proxy" : wikiCode.replaceHTML(initiator.getName())));
--- a/source/de/anomic/data/wiki/tokens/AbstractToken.java
+++ b/source/de/anomic/data/wiki/tokens/AbstractToken.java
@ -0,0 +1,23 @@
+
+package de.anomic.data.wiki.tokens;
+
+/**
+ * Base class for wiki tokens: keeps the raw wiki text, the markup generated
+ * from it and a flag telling whether the markup is up to date.
+ */
+public abstract class AbstractToken implements Token {
+	
+	/** raw wiki text of this token, <code>null</code> if none has been set */
+	protected String text = null;
+	/** markup produced by {@link #parse()} */
+	protected String markup = null;
+	/** whether {@link #markup} has already been generated for the current text */
+	protected boolean parsed = false;
+	
+	/**
+	 * Generates {@link #markup} from {@link #text}.
+	 * @return <code>false</code> if no markup could be generated
+	 */
+	protected abstract boolean parse();
+	
+	/**
+	 * Returns the markup for the current text, parsing it first if necessary.
+	 * @return the generated markup, or the unchanged text if parsing failed
+	 * @throws IllegalArgumentException if no text has been set
+	 */
+	public String getMarkup() {
+		if (this.text == null) {
+			throw new IllegalArgumentException();
+		}
+		// parse() is only invoked when no parsed markup is available yet
+		if (this.parsed || parse()) {
+			return this.markup;
+		}
+		return this.text;
+	}
+	
+	/** @return the raw wiki text of this token */
+	public String getText() {
+		return this.text;
+	}
+	
+	/** @return the same value as {@link #getMarkup()} */
+	@Override
+	public String toString() {
+		return getMarkup();
+	}
+}
--- a/source/de/anomic/data/wiki/tokens/DefinitionListToken.java
+++ b/source/de/anomic/data/wiki/tokens/DefinitionListToken.java
@ -0,0 +1,42 @@
+
+package de.anomic.data.wiki.tokens;
+
+/**
+ * List token for lines starting with ';', rendered as a &lt;dl&gt; definition list.
+ * The first ':' of a line separates the term (&lt;dt&gt;) from its definition (&lt;dd&gt;).
+ */
+public class DefinitionListToken extends ListToken {
+	
+	// element names returned by getBlockElementNames()
+	private static final String[] blockElements = { "dl", "dt", "dd" };
+	
+	/** Creates the token with ';' as list character and no generic list elements. */
+	public DefinitionListToken() {
+		super(';', null, null);
+	}
+	
+	/**
+	 * Renders the lines of <code>t</code>, starting at the current line cursor
+	 * (<code>aktline</code>), as a definition list and appends it to <code>sb</code>.
+	 * Lines with a higher grade than <code>depth</code> are rendered as a nested list.
+	 * @param t the lines of the list
+	 * @param depth the nesting depth being rendered, 0 for the outermost list
+	 * @param sb the buffer the markup is appended to
+	 * @return <code>sb</code>
+	 */
+	@Override
+	protected StringBuffer parse(String[] t, int depth, StringBuffer sb) {
+		sb.append("<dl>\n");
+		while (super.aktline < t.length && getGrade(t[super.aktline]) >= depth) {
+			for (int j=0; j<depth + 1; j++) sb.append("\t");
+			sb.append("<dt>");
+			
+			if (getGrade(t[super.aktline]) > depth) {
+				// deeper nested line: render a sub-list inside the open <dt>
+				parse(t, depth + 1, sb);
+			} else {
+				// strip the leading list characters; the first ':' ends the term and starts the definition
+				sb.append(t[super.aktline].substring(depth + 1).replaceFirst(":", "</dt><dd>"));
+			}
+			
+			// close whichever element is still open: <dd> only if a ':' switched to the definition part
+			sb.append("</");
+			if (t[super.aktline].indexOf(':') == -1 || getGrade(t[super.aktline]) > depth)
+				sb.append("dt");
+			else
+				sb.append("dd");
+			sb.append(">\n");
+			super.aktline++;
+		}
+		for (int j=0; j<depth; j++) sb.append("\t");
+		sb.append("</dl>");
+		// step back one line; the enclosing loop increments the cursor again after this call
+		super.aktline--;
+		return sb;
+	}
+	
+	/** @return the element names "dl", "dt" and "dd" */
+	public String[] getBlockElementNames() {
+		return blockElements;
+	}
+}
--- a/source/de/anomic/data/wiki/tokens/LinkToken.java
+++ b/source/de/anomic/data/wiki/tokens/LinkToken.java
@ -0,0 +1,106 @@
+package de.anomic.data.wiki.tokens;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Token for wiki links: images (<code>[[Image:url|align|description]]</code>),
+ * internal links (<code>[[page|description]]</code>) and external links
+ * (<code>[url description]</code>).
+ */
+public class LinkToken extends AbstractToken {
+	
+	// pattern numbers, these are the indices of the respective pattern in 'patterns'
+	private static final int IMG = 0;
+	private static final int INT = 1;
+	private static final int EXT = 2;
+	
+	private static final Pattern imgPattern = Pattern.compile(
+			"\\[\\[" +											// begin
+			"(Image:([^\\]|]|\\][^\\]])*)" +					// "Image:" + URL
+			"(" +												// <optional>
+				"(\\|(bottom|left|center|right|middle|top))?" +	// optional align
+				"(\\|(([^\\]]|\\][^\\]])*))" +					// description
+			")?" +												// </optional>
+			"\\]\\]");											// end
+	
+	private static final Pattern intPattern = Pattern.compile(
+			"\\[\\[" +											// begin
+			"(([^\\]|]|\\][^\\]])*?)" +							// wiki-page
+			"(\\|(([^\\]]|\\][^\\]])*?))?" +					// optional description
+			"\\]\\]");											// end
+	
+	private static final Pattern extPattern = Pattern.compile(
+			"\\[" +												// begin
+			"([^\\] ]*)" +										// URL
+			"( ([^\\]]*))?" +									// optional description
+			"\\]");												// end
+	
+	private static final Pattern[] patterns = new Pattern[] {
+		imgPattern, intPattern, extPattern };
+	
+	private final String localhost;
+	private final String wikiPath;
+	private int patternNr = 0;
+	
+	/**
+	 * @param localhost host (and port) prepended to links that carry no protocol
+	 * @param wikiPath path appended to the host to build the URL of an internal wiki page
+	 */
+	public LinkToken(String localhost, String wikiPath) {
+		this.localhost = localhost;
+		this.wikiPath = wikiPath;
+	}
+	
+	/**
+	 * Builds the &lt;img&gt; or &lt;a&gt; element for the current text, using the
+	 * pattern selected by the pattern number passed to {@link #setText(String, int)}.
+	 * @return <code>false</code> if the text does not match the selected pattern
+	 * or the pattern number is unknown
+	 */
+	@Override
+	protected boolean parse() {
+		StringBuilder sb = new StringBuilder();
+		Matcher m;
+		switch (this.patternNr) {
+			case IMG:
+				m = imgPattern.matcher(this.text);
+				if (!m.find()) return false;
+				// NOTE(review): group 1 still contains the "Image:" prefix, so it becomes part of the src URL - confirm this is intended
+				sb.append("<img src=\"").append(formatLink(m.group(1))).append("\"");
+				if (m.group(5) != null) sb.append(" align=\"").append(m.group(5)).append("\"");
+				if (m.group(7) != null) sb.append(" alt=\"").append(m.group(7)).append("\"");
+				sb.append(" />");
+				break;
+				
+			case INT:
+				m = intPattern.matcher(this.text);
+				if (!m.find()) return false;
+				sb.append("<a href=\"").append("http://").append(this.localhost)
+						.append("/").append(this.wikiPath).append(m.group(1))
+						.append("\"");
+				// group 3 is "|description", group 4 the description without the separator
+				if (m.group(4) != null) sb.append(" title=\"").append(m.group(4)).append("\"");
+				sb.append(">");
+				if (m.group(4) != null) sb.append(m.group(4)); else sb.append(m.group(1));
+				sb.append("</a>");
+				break;
+				
+			case EXT:
+				m = extPattern.matcher(this.text);
+				if (!m.find()) return false;
+				sb.append("<a href=\"").append(formatLink(m.group(1))).append("\"");
+				if (m.group(3) != null) sb.append(" title=\"").append(m.group(3)).append("\"");
+				sb.append(">");
+				if (m.group(3) != null) sb.append(m.group(3)); else sb.append(m.group(1));
+				sb.append("</a>");
+				break;
+				
+			default: return false;
+		}
+		this.parsed = true;
+		this.markup = sb.toString();
+		return true;
+	}
+	
+	/**
+	 * Turns a link without protocol into an absolute URL on the local host.
+	 * @param link the link target as written in the wiki text
+	 * @return the link unchanged if it contains "://", otherwise prefixed with "http://" + localhost + "/"
+	 */
+	private String formatLink(String link) {
+		if (link.indexOf("://") == -1) {		// DATA/HTDOCS-link
+			return "http://" + this.localhost + "/" + link;
+		} else {								// 'normal' link
+			return link;
+		}
+	}
+	
+	/** @return always <code>null</code>, links produce no block elements */
+	public String[] getBlockElementNames() { return null; }
+	
+	/** @return the image, internal-link and external-link patterns, in this order */
+	public Pattern[] getRegex() { return patterns; }
+	
+	/**
+	 * Sets the text to convert and the pattern it was found with.
+	 * @param text the matched wiki text, <code>null</code> resets the token
+	 * @param patternNr index of the matching pattern in {@link #getRegex()}
+	 * @return always <code>true</code>
+	 */
+	public boolean setText(String text, int patternNr) {
+		this.text = text;
+		this.patternNr = patternNr;
+		this.parsed = false;
+		if (text == null) { this.markup = null; this.patternNr = -1; }
+		return true;
+	}
+}
--- a/source/de/anomic/data/wiki/tokens/ListToken.java
+++ b/source/de/anomic/data/wiki/tokens/ListToken.java
@ -0,0 +1,118 @@
+package de.anomic.data.wiki.tokens;
+
+
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+
+/**
+ * Token for blocks of consecutive lines which all start with the same character,
+ * e.g. '*' for unordered or '#' for ordered lists. The number of leading list
+ * characters of a line determines its nesting depth.
+ */
+public class ListToken extends AbstractToken {
+	
+	protected final String[] blockElements;
+	
+	protected final char firstChar;
+	protected final String listBlockElement;
+	protected final String listElement;
+	protected final boolean recursion;
+	protected final Pattern[] pattern;
+	
+	/** index of the line currently being processed by {@link #parse(String[], int, StringBuffer)} */
+	protected int aktline = 0;
+	
+	/**
+	 * Creates a nestable list whose entries are wrapped in &lt;li&gt;.
+	 * @param firstChar the character marking a line as part of this list
+	 * @param listBlockElement name of the element enclosing the whole list, may be <code>null</code>
+	 */
+	public ListToken(char firstChar, String listBlockElement) {
+		this(firstChar, listBlockElement, "li", true);
+	}
+	
+	/**
+	 * Creates a nestable list.
+	 * @param firstChar the character marking a line as part of this list
+	 * @param listBlockElement name of the element enclosing the whole list, may be <code>null</code>
+	 * @param listElement name of the element enclosing a single entry, may be <code>null</code>
+	 */
+	public ListToken(char firstChar, String listBlockElement, String listElement) {
+		this(firstChar, listBlockElement, listElement, true);
+	}
+	
+	/**
+	 * @param firstChar the character marking a line as part of this list
+	 * @param listBlockElement name of the element enclosing the whole list, may be <code>null</code>
+	 * @param listElement name of the element enclosing a single entry, may be <code>null</code>
+	 * @param recursion whether additional leading list characters open nested lists
+	 */
+	public ListToken(char firstChar, String listBlockElement, String listElement, boolean recursion) {
+		this.firstChar = firstChar;
+		this.listBlockElement = listBlockElement;
+		this.listElement = listElement;
+		this.recursion = recursion;
+		// a line starting with firstChar, followed by all directly succeeding lines starting with firstChar
+		this.pattern = new Pattern[] { Pattern.compile("^[" + firstChar + "]([^\n]|\n[" + firstChar + "])*", Pattern.MULTILINE) };
+		ArrayList<String> r = new ArrayList<String>();
+		if (this.listBlockElement != null) {
+			if (this.recursion) r.add(this.listBlockElement);
+			if (this.listElement != null) r.add(this.listElement);
+		}
+		this.blockElements = r.toArray(new String[r.size()]);
+	}
+	
+	/**
+	 * Converts the whole text into list markup.
+	 * @return always <code>true</code>
+	 */
+	@Override
+	protected boolean parse() {
+		StringBuffer sb = new StringBuffer(this.text.length());
+		parse(this.text.split("\n"), 0, sb);
+		this.markup = sb.toString();
+		this.parsed = true;
+		return true;
+	}
+	
+	/**
+	 * Renders the lines of <code>t</code>, starting at {@link #aktline}, as long as
+	 * their grade is at least <code>depth</code>. If recursion is enabled, lines with
+	 * a higher grade are rendered as a nested list.
+	 * @param t the lines of the list
+	 * @param depth the nesting depth being rendered, 0 for the outermost list
+	 * @param sb the buffer the markup is appended to
+	 * @return <code>sb</code>
+	 */
+	protected StringBuffer parse(String[] t, int depth, StringBuffer sb) {
+		if (this.listBlockElement != null) sb.append("<").append(this.listBlockElement).append(">\n");
+		while (this.aktline < t.length && getGrade(t[this.aktline]) >= depth) {
+			if (this.recursion) for (int j=0; j<depth + 1; j++) sb.append("\t");
+			if (this.listElement != null) sb.append("<").append(this.listElement).append(">");
+			
+			if (this.recursion && getGrade(t[this.aktline]) > depth) {
+				parse(t, depth + 1, sb);
+			} else {
+				// strip the leading list characters of this depth
+				sb.append(t[this.aktline].substring(depth + 1));
+			}
+			
+			if (this.listElement != null) sb.append("</").append(this.listElement).append(">");
+			sb.append("\n");
+			this.aktline++;
+		}
+		if (this.recursion) for (int j=0; j<depth; j++) sb.append("\t");
+		if (this.listBlockElement != null) sb.append("</").append(this.listBlockElement).append(">");
+		// step back one line; the enclosing loop increments the cursor again after this call
+		this.aktline--;
+		return sb;
+	}
+	
+	/**
+	 * @param t a line of the list
+	 * @return the number of leading list characters of <code>t</code> minus one,
+	 * i.e. 0 for a top-level entry and -1 if <code>t</code> does not start with the list character
+	 */
+	protected int getGrade(String t) {
+		int i = 0;
+		while (i < t.length() && t.charAt(i) == this.firstChar) i++;
+		return i - 1;
+	}
+	
+	/** @return the names of the elements this list generates, empty if no enclosing list element is set */
+	public String[] getBlockElementNames() {
+		return this.blockElements;
+	}
+	
+	/** @return the single pattern matching a block of list lines */
+	public Pattern[] getRegex() {
+		return this.pattern;
+	}
+	
+	/** @return the character marking a line as part of this list */
+	public char getFirstChar() {
+		return this.firstChar;
+	}
+	
+	/**
+	 * Sets the text to convert and resets the parsing state.
+	 * @param text the matched block of list lines
+	 * @param patternNr ignored, this token has only one pattern
+	 * @return always <code>true</code>
+	 */
+	public boolean setText(String text, int patternNr) {
+		this.text = text;
+		this.markup = null;
+		this.parsed = false;
+		this.aktline = 0;
+		return true;
+	}
+}
--- a/source/de/anomic/data/wiki/tokens/SimpleToken.java
+++ b/source/de/anomic/data/wiki/tokens/SimpleToken.java
@ -0,0 +1,120 @@
+
+package de.anomic.data.wiki.tokens;
+
+
+import java.util.ArrayList;
+import java.util.regex.MatchResult;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Token for text enclosed in an equal number of delimiter characters on both
+ * sides, e.g. <code>==heading==</code>. The number of delimiter characters
+ * (minus one) selects the entry of the definition list whose elements are
+ * wrapped around the content.
+ */
+public class SimpleToken extends AbstractToken {
+	
+	/** the text between the delimiters, <code>null</code> if the current text did not match */
+	protected String content = null;
+	/** index into the definition list: number of delimiter characters minus one */
+	protected int grade = 0;
+	
+	protected final Pattern[] pattern;
+	protected MatchResult mresult = null;
+	private final String[][] definitionList;
+	private final String[] blockElements;
+	
+	/**
+	 * @param firstChar the opening delimiter character
+	 * @param lastChar the closing delimiter character
+	 * @param definitionList for every delimiter count <code>n</code> the elements to wrap around the
+	 * content at index <code>n - 1</code>, <code>null</code> entries mark unsupported counts
+	 * @param isBlockElements whether the elements of the definition list are reported by
+	 * {@link #getBlockElementNames()}
+	 */
+	public SimpleToken(char firstChar, char lastChar, String[][] definitionList, boolean isBlockElements) {
+		this.definitionList = definitionList;
+		int i;
+		if (isBlockElements) {
+			ArrayList<String> r = new ArrayList<String>();
+			int j;
+			for (i=0; i<definitionList.length; i++)
+				if (definitionList[i] != null)
+					for (j=0; j<definitionList[i].length; j++)
+						r.add(definitionList[i][j]);
+			this.blockElements = r.toArray(new String[r.size()]);
+		} else {
+			this.blockElements = null;
+		}
+		
+		// smallest supported delimiter count: index of the first defined entry plus one
+		for (i=0; i<definitionList.length; i++)
+			if (definitionList[i] != null) {
+				i++;
+				break;
+			}
+		this.pattern = new Pattern[] { Pattern.compile(String.format(
+				"([\\%s]{%d,%d})(.*?)([\\%s]{%d,%d})",
+				firstChar, i, definitionList.length,
+				lastChar, i, definitionList.length)) };
+	}
+	
+	/**
+	 * Returns the markup for the current text.
+	 * @return the generated markup, or the unchanged text if it does not form a valid token
+	 * @throws IllegalArgumentException if no text has been set
+	 */
+	@Override
+	public String getMarkup() {
+		if (this.text == null) {
+			throw new IllegalArgumentException();
+		}
+		// the text may have been set without setText(), analyse it now; unusable text is returned as is
+		if (this.content == null && !setText(this.text, 0)) return this.text;
+		if (!this.parsed && !parse()) return this.text;
+		return this.markup;
+	}
+	
+	/**
+	 * Wraps the content into the elements defined for the current grade.
+	 * @return <code>false</code> if no elements are defined for the current grade
+	 */
+	@Override
+	protected boolean parse() {
+		// check the bounds before accessing the definition list
+		if (this.grade < 0 || this.grade >= definitionList.length || definitionList[this.grade] == null) {
+			System.err.println("token not defined for grade: " + this.grade);
+			return false;
+		}
+		this.markup = getMarkup(definitionList[this.grade]);
+		this.parsed = true;
+		return true;
+	}
+	
+	/**
+	 * @param es the elements to wrap around the content
+	 * @return the content enclosed in the opening and closing tags of <code>es</code>
+	 */
+	protected String getMarkup(String[] es) {
+		return getMarkup(es, false) + this.content + getMarkup(es, true);
+	}
+	
+	/**
+	 * @param es element definitions, optionally followed by attributes separated by a blank
+	 * @param closing whether to generate the closing instead of the opening tags
+	 * @return the opening tags in the given order or the closing tags (without attributes) in reverse order
+	 */
+	protected String getMarkup(String[] es, boolean closing) {
+		StringBuffer result = new StringBuffer();
+		if (closing) {
+			// backwards if closing, attributes are not repeated in the closing tag
+			for (int i = es.length - 1; i >= 0; i--) {
+				int j = es[i].indexOf(' ');
+				result.append("</").append((j > -1) ? es[i].substring(0, j) : es[i]).append(">");
+			}
+		} else {
+			for (int i = 0; i < es.length; i++) {
+				result.append("<").append(es[i]).append(">");
+			}
+		}
+		return result.toString();
+	}
+	
+	/**
+	 * Sets the text to convert and extracts its grade and content.
+	 * @param text the matched wiki text
+	 * @param patternNr ignored, this token has only one pattern
+	 * @return <code>true</code> if the text is enclosed in the same, supported number of
+	 * delimiter characters on both sides
+	 */
+	public boolean setText(String text, int patternNr) {
+		this.text = text;
+		this.markup = null;
+		this.parsed = false;
+		// forget the analysis of the previous text, otherwise a failed match would leave stale values
+		this.content = null;
+		this.grade = 0;
+		if (text != null) {
+			Matcher m = getRegex()[0].matcher(text);
+			if (
+					(m.matches()) &&
+					(m.group(1).length() == m.group(3).length()) &&
+					(definitionList.length >= m.group(1).length()) &&
+					(definitionList[m.group(1).length() - 1] != null)
+			) {
+				this.grade = m.group(1).length() - 1;
+				this.content = m.group(2);
+				return true;
+			}
+		}
+		return false;
+	}
+	
+	/** @return the single pattern matching text enclosed in delimiter characters */
+	public Pattern[] getRegex() { return this.pattern; }
+	
+	/** @return the elements of the definition list, or <code>null</code> if they are no block elements */
+	public String[] getBlockElementNames() { return this.blockElements; }
+}
--- a/source/de/anomic/data/wiki/tokens/TableToken.java
+++ b/source/de/anomic/data/wiki/tokens/TableToken.java
@ -0,0 +1,118 @@
+package de.anomic.data.wiki.tokens;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.regex.Pattern;
+
+/**
+ * Token for wiki tables: a block starting with "{|" and ending with "|}" whose
+ * lines in between start with "|-" (new row) or "||" (cells of a row).
+ */
+public class TableToken extends AbstractToken {
+	
+	private static final Pattern[] pattern = new Pattern[] {
+		Pattern.compile(
+				"\\{\\|" +					// "{|"
+				"([^\n]|\n\\|[|-])*\n" +	// new line must start with "||" or "|-"
+				"\\|\\}")					// "|}"
+	};
+	private static final String[] blockElementNames = new String[] { "table", "tr", "td" };
+	
+	/**
+	 * Converts the table text into &lt;table&gt; markup.
+	 * @return always <code>true</code>
+	 */
+	@Override
+	protected boolean parse() {
+		String[] t = text.split("\n");
+		String[] tds;
+		StringBuffer sb = new StringBuffer();
+		sb.append("<table");
+		// everything following "{|" on the first line is treated as table properties
+		if (t[0].length() > 2) sb.append(parseTableProperties(t[0].substring(2)));
+		sb.append(">\n");
+		boolean trOpen = false;
+		// the first ("{|...") and the last ("|}") line are not part of the table body
+		for (int i=1, j, a; i<t.length-1; i++) {
+			if (t[i].startsWith("|-")) {
+				if (trOpen) sb.append("\t</tr>\n");
+				// open a new row only if this is not the last line of the table body
+				if (trOpen = (i < t.length - 2)) sb.append("\t<tr>\n");
+			} else if (t[i].startsWith("||")) {
+				tds = t[i].split("\\|\\|");
+				for (j=0; j<tds.length; j++) {
+					// a single '|' inside a cell separates the cell properties from the cell content
+					if (tds[j].length() > (a = tds[j].indexOf('|')) + 1) {	// don't print empty td's
+						sb.append("\t\t<td");
+						if (a > -1) sb.append(parseTableProperties(tds[j].substring(0, a)));
+						sb.append(">").append(tds[j].substring(a + 1)).append("</td>\n");
+					}
+				}
+			}
+		}
+		if (trOpen) sb.append("\t</tr>\n");
+		this.markup =  new String(sb.append("</table>"));
+		this.parsed = true;
+		return true;
+	}
+	
+    // from de.anomic.data.wikiCode.java.parseTableProperties, modified by [FB]
+	// properties which are only accepted with a numeric value
+	private static final String[] tps = { "rowspan", "colspan", "vspace", "hspace", "cellspacing", "cellpadding", "border" };
+    // properties which are only accepted with one of the listed values
+    private static final HashMap<String,String[]> ps = new HashMap<String,String[]>();
+    static {
+    	ps.put("frame", 	new String[] { "void", "above", "below", "hsides", "lhs", "rhs", "vsides", "box", "border" });
+    	ps.put("rules", 	new String[] { "none", "groups", "rows", "cols", "all" });
+    	ps.put("valign", 	new String[] { "top", "middle", "bottom", "baseline" });
+    	ps.put("align", 	new String[] { "left", "right", "center" });
+    }
+    
+	// contributed by [MN]
+    /** This method takes possible table properties and tests if they are valid.
+      * Valid in this case means if they are a property for the table, tr or td
+      * tag as stated in the HTML Pocket Reference by Jennifer Niederst (1st edition)
+      * The method is important to avoid XSS attacks on the wiki via table properties.
+      * @param properties A string that may contain several table properties and/or junk.
+      * @return A buffer that only contains the accepted table properties, each one preceded by a blank.
+      */
+    private static StringBuffer parseTableProperties(final String properties){
+        String[] values = properties.replaceAll("&quot;", "").split("[= ]");     //splitting the string at = and blanks
+        StringBuffer sb = new StringBuffer(properties.length());
+        Iterator<String> it;
+        String key, valkey, value;
+        int numberofvalues = values.length;
+        main: for (int i=0; i<numberofvalues; i++) {
+        	valkey = values[i].trim();
+        	if (i + 1 < numberofvalues) {
+        		// the next element is taken as the value of this key, it is consumed even if the pair is rejected
+        		value = values[++i].trim();
+        		if (
+        				valkey.equals("summary") ||
+        				(valkey.equals("bgcolor") && value.matches("#{0,1}[0-9a-fA-F]{1,6}|[a-zA-Z]{3,}")) ||
+        				((valkey.equals("width") || valkey.equals("height")) && value.matches("\\d+%{0,1}")) ||
+        				(isInArray(tps, valkey) && value.matches("\\d+"))
+        		) {
+                	addPair(valkey, value, sb);
+                	continue;
+        		}
+        		// properties with a fixed set of allowed values
+        		it = ps.keySet().iterator();
+        		while (it.hasNext()) {
+        			key = it.next();
+        			if (valkey.equals(key) && isInArray(ps.get(key), value)) {
+        				addPair(valkey, value, sb);
+        				continue main;
+        			}
+        		}
+        	}
+            // the only property accepted without a value
+            if (valkey.equals("nowrap"))
+                sb.append(" nowrap");
+        }
+        return sb;
+    }
+    
+    /** Appends <code> val1="val2"</code> (with a leading blank) to <code>sb</code> and returns <code>sb</code>. */
+    private static StringBuffer addPair(String val1, String val2, StringBuffer sb) {
+    	return sb.append(" ").append(val1).append("=\"").append(val2).append("\"");
+    }
+    
+    /** @return whether <code>array</code> contains an element equal to <code>find</code> */
+    private static boolean isInArray(Object[] array, Object find) {
+    	for (int i=array.length-1; i>-1; i--)
+    		if (array[i].equals(find)) return true;
+    	return false;
+    }
+	
+	/** @return the single pattern matching a whole table */
+	public Pattern[] getRegex() { return pattern; }
+	/** @return the element names "table", "tr" and "td" */
+	public String[] getBlockElementNames() { return blockElementNames; }
+	
+	/**
+	 * Sets the text to convert and resets the parsing state.
+	 * @param text the matched table text
+	 * @param patternNr ignored, this token has only one pattern
+	 * @return always <code>true</code>
+	 */
+	public boolean setText(String text, int patternNr) {
+		this.text = text;
+		this.parsed = false;
+		this.markup = null;
+		return true;
+	}
+}
--- a/source/de/anomic/data/wiki/tokens/Token.java
+++ b/source/de/anomic/data/wiki/tokens/Token.java
@ -0,0 +1,13 @@
+
+package de.anomic.data.wiki.tokens;
+
+import java.util.regex.Pattern;
+
+/**
+ * A piece of wiki syntax which can be located by regular expressions and be
+ * converted into markup.
+ */
+public interface Token {
+	
+	/** @return the patterns used to find occurrences of this token in a text */
+	Pattern[] getRegex();
+	
+	/**
+	 * Sets the text which shall be converted.
+	 * @param text the text matched by one of the patterns of {@link #getRegex()}
+	 * @param patternNr the index of the matching pattern in {@link #getRegex()}
+	 * @return whether the token is able to handle the text
+	 */
+	boolean setText(String text, int patternNr);
+	
+	/** @return the text which has been set by {@link #setText(String, int)} */
+	String getText();
+	
+	/** @return the markup generated for the current text */
+	String getMarkup();
+	
+	/** @return the names of the block elements this token generates, may be <code>null</code> */
+	String[] getBlockElementNames();
+}
--- a/source/de/anomic/data/wiki/wikiParser.java
+++ b/source/de/anomic/data/wiki/wikiParser.java
@ -0,0 +1,206 @@
+package de.anomic.data.wiki;
+
+
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+
+import de.anomic.data.wiki.tokens.DefinitionListToken;
+import de.anomic.data.wiki.tokens.LinkToken;
+import de.anomic.data.wiki.tokens.ListToken;
+import de.anomic.data.wiki.tokens.SimpleToken;
+import de.anomic.data.wiki.tokens.TableToken;
+import de.anomic.data.wiki.tokens.Token;
+
+/**
+ * Converts wiki text into HTML markup by applying a fixed list of tokens.
+ * Text enclosed in "[=" and "=]" is excluded from the conversion.
+ */
+public class wikiParser {
+	
+	/** the supported tokens, they are applied to the text in this order */
+	public static final Token[] tokens = {
+		new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
+		new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
+		new LinkToken("localhost:8080", "Wiki.html?page="),
+		new ListToken('*', "ul"),
+		new ListToken('#', "ol"),
+		new ListToken(':', "blockquote", null),
+		new ListToken(' ', null, "tt", false),
+		new DefinitionListToken(),
+		new TableToken()
+	};
+	
+	/** names of all block elements generated by the tokens plus "hr"; lines ending with one of them get no line break */
+	private static final String[] BEs;
+	static {
+		ArrayList<String> r = new ArrayList<String>();
+		String[] names;
+		for (int i=0, k, j; i<tokens.length; i++) {
+			names = tokens[i].getBlockElementNames();
+			if (names == null) continue;
+			for (j=0; j<names.length; j++) {
+				if (names[j] == null) continue;
+				// only the element name is needed, cut off what follows a blank
+				if ((k = names[j].indexOf(' ')) > 1) {
+					r.add(names[j].substring(0, k));
+				} else {
+					r.add(names[j]);
+				}
+			}
+		}
+		r.add("hr");
+		BEs = r.toArray(new String[r.size()]);
+	}
+	
+	/**
+	 * Parses the first argument, or a built-in sample text if there is none,
+	 * and prints the needed time and the result to stdout.
+	 * @param args optionally the wiki text to parse
+	 */
+	public static void main(String[] args) {
+		String text = "===Title===\n" +
+				"==blubb[== was ==ein '''shice'''==...och.bla\n" +
+				"* ein \n" +
+				"*==test==\n" +
+				"** doppelt\n" +
+				"* ''tess*sst''\n" +
+				"*** xyz\n" +
+				"=]*** huch\n" +
+				"* ehehe***\n" +
+				"* blubb\n" +
+				"bliblablo\n\n\n" +
+				"* blubb\n" +
+				"{|border=-1\n" +
+				"|-\n" +
+				"||bla|| blubb\n" +
+				"|-\n" +
+				"||align center|och||huch||\n" +
+				"|}\n" +
+				"\n" +
+				"# bla\n" +
+				"# blubb\n" +
+				"'''''ehehehe''''', ne?!\n" +
+				"[http://www/index.html,ne?!] -\n" +
+				"[[Image:blubb|BLA]] ---- och\n" +
+				" blubb1\n" +
+				" blubb2\n" +
+				":doppel-blubb[= huch =]\n" +
+				";hier:da\n" +
+				";dort:und so\n" +
+				";;und:doppelt";
+		long l = System.currentTimeMillis();
+		String t = parse((args.length > 0) ? args[0] : text);
+		System.out.println("parsing time: " + (System.currentTimeMillis() - l) + " ms");
+		System.out.println("--- --- ---");
+		System.out.println(t);
+	}
+	
+	// TODO:
+	// - preParse:
+	//   - <pre>~</pre>
+	
+	/**
+	 * Converts wiki text into markup. Parts enclosed in "[=" and "=]" are not converted.
+	 * @param text the wiki text
+	 * @return the generated markup
+	 */
+	public static String parse(String text) {
+		Text[] tt = Text.split2Texts(text, "[=", "=]");
+		// escaped and unescaped parts alternate, the unescaped ones have even indices
+		for (int i=0; i<tt.length; i+=2)
+			tt[i].setText(parseUnescaped(tt[i].getText()));
+		return replaceBRs(Text.mergeTexts(tt));
+	}
+	
+	/**
+	 * Applies all patterns of all tokens to the text and replaces every match
+	 * accepted by its token with the token's markup. Afterwards "----" is
+	 * replaced with a horizontal rule.
+	 * @param text wiki text without escaped parts
+	 * @return the generated markup
+	 */
+	public static String parseUnescaped(String text) {
+		Token st;
+		Matcher m;
+		StringBuffer sb;
+		String markup;
+		for (int i=0; i<tokens.length; i++) {
+			st = tokens[i];
+			for (int j=0; j<st.getRegex().length; j++) {
+				m = st.getRegex()[j].matcher(text);
+				sb = new StringBuffer();
+				while (m.find()) {
+					// matches the token cannot use stay unchanged, they are copied by the next append
+					if (!st.setText(m.group(), j)) continue;
+					markup = st.getMarkup();
+					// the markup is literal text: '$' and '\' must not be interpreted as group references
+					m.appendReplacement(sb, Matcher.quoteReplacement((markup == null) ? m.group() : markup));
+				}
+				text = m.appendTail(sb).toString();
+			}
+		}
+		return text.replaceAll("----", "<hr />");
+	}
+	
+	/**
+	 * Appends "&lt;br /&gt;" to every line but the last one, unless the line
+	 * ends with the tag of a block element.
+	 * @param text the markup
+	 * @return the markup with line breaks inserted
+	 */
+	private static String replaceBRs(String text) {
+		StringBuffer sb = new StringBuffer(text.length());
+		String[] tt = text.split("\n");
+		boolean replace;
+		for (int i=0, j; i<tt.length; i++) {
+			replace = true;
+			for (j=0; j<BEs.length; j++)
+				if (tt[i].endsWith(BEs[j] + ">")) { replace = false; break; }
+			sb.append(tt[i]);
+			if (replace && i < tt.length - 1) sb.append("<br />");
+			if (i < tt.length - 1) sb.append("\n");
+		}
+		return sb.toString();
+	}
+	
+	/** A part of the wiki text which is either escaped (not to be converted) or unescaped. */
+	private static class Text {
+		
+		/** placeholder prepended by {@link #getText()} to parts flagged as starting on a new line */
+		public static final String escapeNewLine = "@";
+		
+		private String text;
+		private final boolean escaped;
+		private final boolean nl;
+		
+		public Text(String text, boolean escaped, boolean newLineBefore) {
+			this.text = text;
+			this.escaped = escaped;
+			this.nl = newLineBefore;
+		}
+		
+		public String setTextPlain(String text) { return this.text = text; }
+		
+		/** Sets the text, removing the leading placeholder added by {@link #getText()} if this part is flagged as new line. */
+		public String setText(String text) {
+			if (this.nl)
+				this.text = text.substring(escapeNewLine.length());
+			else
+				this.text = text;
+			return this.text;
+		}
+		
+		public String getTextPlain() { return this.text; }
+		
+		/** @return the text, preceded by the placeholder if this part is flagged as new line */
+		public String getText() {
+			if (this.nl)
+				return escapeNewLine + this.text;
+			else
+				return this.text;
+		}
+		
+		public String toString() { return this.text; }
+		public boolean isEscaped() { return this.escaped; }
+		public boolean isNewLineBefore() { return this.nl; }
+		
+		/**
+		 * Splits the text at the escape markers into alternating unescaped and escaped parts.
+		 * The first part is always an unescaped one, it is empty if the text starts with an escape.
+		 * @return the parts, or <code>null</code> if <code>text</code> is <code>null</code>
+		 */
+		private static Text[] split2Texts(String text, String escapeBegin, String escapeEnd) {
+			if (text == null) return null;
+			if (text.length() < 2) return new Text[] { new Text(text, false, true) };
+			
+			int startLen = escapeBegin.length();
+			ArrayList<Text> r = new ArrayList<Text>();
+			boolean escaped = text.startsWith(escapeBegin);
+			if (escaped) r.add(new Text("", false, true));
+			int i, j = 0;
+			// alternately search for the marker ending the current kind of part
+			while ((i = text.indexOf((escaped) ? escapeEnd : escapeBegin, j)) > -1) {
+				r.add(resolve2Text(text, escaped, (j > 0) ? j + startLen : 0, i, escapeEnd));
+				j = i;
+				escaped = !escaped;
+			}
+			r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? j + startLen : 0, -1, escapeEnd));
+			return r.toArray(new Text[r.size()]);
+		}
+		
+		/**
+		 * Creates the part <code>text[from, to)</code>; <code>to == -1</code> means up to the end of the text.
+		 * The part is flagged as new line if it starts near the beginning of the text or if it is
+		 * unescaped and the character at <code>from - escapeEnd.length() - 1</code> is a line feed.
+		 */
+		private static Text resolve2Text(String text, boolean escaped, int from, int to, String escapeEnd) {
+			if (to == -1) to = text.length();
+			return new Text(
+					text.substring(from, to),
+					escaped,
+					from < escapeEnd.length() + 2 || (!escaped && text.charAt(from - escapeEnd.length() - 1) == '\n'));
+		}
+		
+		/** @return the concatenated plain texts of all parts */
+		private static String mergeTexts(Text[] texts) {
+			StringBuffer sb = new StringBuffer();
+			for (int n=0; n < texts.length; n++)
+				sb.append(texts[n].getTextPlain());
+			return sb.toString();
+		}
+	}
+}