*) cleaned up code for better readability

*) added a few copyright notices
*) removed redundancy in constructors of ListToken

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6295 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
low012 16 years ago
parent eaddf2d464
commit 248f3fd9b5

@ -4,9 +4,9 @@
//
// (C) 2009 by Marc Nause
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -4,9 +4,9 @@
//
// (C) 2009 by Marc Nause
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -1,3 +1,28 @@
// abstractWikiParser.java
// ---------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2007
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data.wiki;
import java.io.BufferedReader;

@ -1,4 +1,4 @@
// wikiParser.java
// knwikiParser.java
// ---------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
@ -8,9 +8,9 @@
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -42,55 +42,55 @@ import de.anomic.search.Switchboard;
public class knwikiParser implements wikiParser {
public Token[] tokens;
private String[] BEs;
public Token[] tokens;
private String[] BEs;
private final Switchboard sb;
private knwikiParser(final Switchboard sb) {
this.sb = sb;
}
public static void main(final String[] args) {
final String text = "===T<pre>itle===\n" +
"==blubb== was ==ein '''shice'''==...och.bla\n" +
"* ein \n" +
"*==test=</pre>=\n" +
"** doppelt\n" +
"* ''tess*sst''\n" +
"*** xyz\n" +
"=]*** huch\n" +
"* ehehe***\n" +
"* blubb\n" +
"bliblablo\n\n\n" +
"* blubb\n" +
"{|border=-1\n" +
"|-\n" +
"||bla|| blubb\n" +
"|-\n" +
"||align center|och||huch||\n" +
"|}\n" +
"\n" +
"# bla\n" +
"# blubb\n" +
"'''''ehehehe''''', ne?!\n" +
"[http://www/index.html,ne?!] -\n" +
"[[Image:blubb|BLA]] ---- och\n" +
" blubb1\n" +
" blubb2\n" +
":doppel-blubb[= huch =]\n" +
";hier:da\n" +
";dort:und so\n" +
";;und:doppelt\n\n\n\n" +
"[[Image:blubb|BLA]]";
// text = "[=\n=]* bla";
String t = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar," +
"[=denk ich=] mal =]";
final long l = System.currentTimeMillis();
t = new knwikiParser(null).parse((args.length > 0) ? args[0] : text, "localhost:8080");
public static void main(final String[] args) {
final String text = "===T<pre>itle===\n" +
"==blubb== was ==ein '''shice'''==...och.bla\n" +
"* ein \n" +
"*==test=</pre>=\n" +
"** doppelt\n" +
"* ''tess*sst''\n" +
"*** xyz\n" +
"=]*** huch\n" +
"* ehehe***\n" +
"* blubb\n" +
"bliblablo\n\n\n" +
"* blubb\n" +
"{|border=-1\n" +
"|-\n" +
"||bla|| blubb\n" +
"|-\n" +
"||align center|och||huch||\n" +
"|}\n" +
"\n" +
"# bla\n" +
"# blubb\n" +
"'''''ehehehe''''', ne?!\n" +
"[http://www/index.html,ne?!] -\n" +
"[[Image:blubb|BLA]] ---- och\n" +
" blubb1\n" +
" blubb2\n" +
":doppel-blubb[= huch =]\n" +
";hier:da\n" +
";dort:und so\n" +
";;und:doppelt\n\n\n\n" +
"[[Image:blubb|BLA]]";
// text = "[=\n=]* bla";
String t = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar," +
"[=denk ich=] mal =]";
final long l = System.currentTimeMillis();
t = new knwikiParser(null).parse((args.length > 0) ? args[0] : text, "localhost:8080");
System.out.println("parsing time: " + (System.currentTimeMillis() - l) + " ms");
System.out.println("--- --- ---");
System.out.println(t);
}
}
public String transform(final String content) {
return parse(content, null);
@ -105,7 +105,7 @@ public class knwikiParser implements wikiParser {
return parse(new String(content, encoding), null);
}
private String parse(String text, final String publicAddress) {
private String parse(String text, final String publicAddress) {
tokens = new Token[] {
new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
@ -133,121 +133,127 @@ public class knwikiParser implements wikiParser {
Text[] tt = Text.split2Texts(text, "[=", "=]");
for (int i=0; i<tt.length; i+=2)
tt[i].setText(parseUnescaped(tt[i].getText()));
tt[i].setText(parseUnescaped(tt[i].getText()));
text = Text.mergeTexts(tt);
tt = Text.split2Texts(text, "<pre>", "</pre>");
for (int i=0; i<tt.length; i+=2)
tt[i].setText(replaceBRs(tt[i].getText()));
return Text.mergeTexts(tt);
}
private String parseUnescaped(String text) {
Token st;
Matcher m;
StringBuffer sb;
for (int i=0; i<tokens.length; i++) {
st = tokens[i];
for (int j=0; j<st.getRegex().length; j++) {
m = st.getRegex()[j].matcher(text);
sb = new StringBuffer();
while (m.find()) try {
//System.out.print("found " + st.getClass().getSimpleName() + ": " +
// m.group().replaceAll("\n", "\\\\n").replaceAll("\t", " ") + ", ");
if (!st.setText(m.group(), j)) {
// System.out.println("not usable");
continue;
//} else {
// System.out.println("usable");
}
m.appendReplacement(sb, (st.getMarkup() == null) ? m.group() : st.getMarkup());
} catch (final wikiParserException e) {
m.appendReplacement(sb, st.getText());
}
private String parseUnescaped(String text) {
Token st;
Matcher m;
StringBuffer stringBuffer;
for (int i=0; i<tokens.length; i++) {
st = tokens[i];
for (int j=0; j<st.getRegex().length; j++) {
m = st.getRegex()[j].matcher(text);
stringBuffer = new StringBuffer();
while (m.find()) try {
if (!st.setText(m.group(), j)) {
continue;
}
m.appendReplacement(stringBuffer, (st.getMarkup() == null) ? m.group() : st.getMarkup());
} catch (final wikiParserException e) {
m.appendReplacement(stringBuffer, st.getText());
}
text = new String(m.appendTail(sb));
}
}
return text.replaceAll("----", "<hr />");
}
private String replaceBRs(final String text) {
final StringBuilder sb = new StringBuilder(text.length());
final String[] tt = text.split("\n");
boolean replace;
for (int i=0, j; i<tt.length; i++) {
replace = true;
for (j=0; j<BEs.length; j++)
if (tt[i].endsWith(BEs[j] + ">")) { replace = false; break; }
sb.append(tt[i]);
text = new String(m.appendTail(stringBuffer));
}
}
return text.replaceAll("----", "<hr />");
}
private String replaceBRs(final String text) {
final StringBuilder stringBuffer = new StringBuilder(text.length());
final String[] tt = text.split("\n");
boolean replace;
for (int i=0, j; i<tt.length; i++) {
replace = true;
for (j=0; j<BEs.length; j++)
if (tt[i].endsWith(BEs[j] + ">")) { replace = false; break; }
stringBuffer.append(tt[i]);
if (i < tt.length - 1) {
if (replace) sb.append("<br />");
sb.append("\n");
if (replace) stringBuffer.append("<br />");
stringBuffer.append("\n");
}
}
return new String(sb);
}
}
return new String(stringBuffer);
}
private static class Text {
private static class Text {
public static final String escapeNewLine = "@";
public static final String escapeNewLine = "@";
private String text;
private final boolean nl;
private String text;
private final boolean nl;
public Text(final String text, final boolean escaped, final boolean newLineBefore) {
this.text = text;
this.nl = newLineBefore;
public Text(final String text, final boolean escaped, final boolean newLineBefore) {
this.text = text;
this.nl = newLineBefore;
}
public String setText(final String text) {
if (this.nl)
this.text = text.substring(escapeNewLine.length());
else
this.text = text;
return this.text;
}
/**
 * Replaces the content of this fragment.
 * When the fragment carries the new-line flag, the first
 * {@code escapeNewLine.length()} characters of the given text are dropped
 * before it is stored; otherwise the text is stored unchanged.
 *
 * @param text new content
 * @return the text that was actually stored
 */
public String setText(final String text) {
    this.text = this.nl ? text.substring(escapeNewLine.length()) : text;
    return this.text;
}
public String getTextPlain() { return this.text; }
public String getText() {
if (this.nl)
return escapeNewLine + this.text;
return this.text;
}
/**
 * Returns the stored text exactly as held by this fragment, i.e. without
 * the new-line marker that getText() prepends for flagged fragments.
 *
 * @return the raw stored text
 */
public String getTextPlain() {
return this.text;
}
public String toString() { return this.text; }
/**
 * Returns the text of this fragment, prefixed with {@code escapeNewLine}
 * when the fragment carries the new-line flag.
 *
 * @return the stored text, with the marker in front if flagged
 */
public String getText() {
    return this.nl ? escapeNewLine + this.text : this.text;
}
/**
 * Returns the raw stored text (same value as getTextPlain()).
 */
@Override
public String toString() {
return this.text;
}
static Text[] split2Texts(final String text, final String escapeBegin, final String escapeEnd) {
static Text[] split2Texts(final String text, final String escapeBegin, final String escapeEnd) {
if (text == null) return null;
if (text.length() < 2) return new Text[] { new Text(text, false, true) };
if (text == null) return null;
final int startLen = escapeBegin.length();
if (text.length() < 2) return new Text[] {new Text(text, false, true) };
final int startLen = escapeBegin.length();
final int endLen = escapeEnd.length();
final ArrayList<Text> r = new ArrayList<Text>();
boolean escaped = text.startsWith(escapeBegin);
if (escaped) r.add(new Text("", false, true));
int i, j = 0;
while ((i = text.indexOf((escaped) ? escapeEnd : escapeBegin, j)) > -1) {
r.add(resolve2Text(text, escaped, (j > 0) ? j + ((escaped) ? startLen : endLen) : 0, i, escapeEnd));
j = i;
escaped = !escaped;
}
r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? j + endLen : 0, -1, escapeEnd));
return r.toArray(new Text[r.size()]);
}
private static Text resolve2Text(final String text, final boolean escaped, final int from, int to, final String escapeEnd) {
if (to == -1) to = text.length();
return new Text(
text.substring(from, to),
escaped,
from < escapeEnd.length() + 2 || (!escaped && text.charAt(from - escapeEnd.length() - 1) == '\n'));
}
static String mergeTexts(final Text[] texts) {
final StringBuilder sb = new StringBuilder(2000);
for (int n=0; n < texts.length; n++)
sb.append(texts[n].getTextPlain());
return new String(sb);
}
}
final ArrayList<Text> r = new ArrayList<Text>();
boolean escaped = text.startsWith(escapeBegin);
if (escaped) r.add(new Text("", false, true));
int i, j = 0;
while ((i = text.indexOf((escaped) ? escapeEnd : escapeBegin, j)) > -1) {
r.add(resolve2Text(text, escaped, (j > 0) ? j + ((escaped) ? startLen : endLen) : 0, i, escapeEnd));
j = i;
escaped = !escaped;
}
r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? j + endLen : 0, -1, escapeEnd));
return r.toArray(new Text[r.size()]);
}
/**
 * Builds one Text fragment from the region [from, to) of the given string.
 *
 * @param text      complete input string
 * @param escaped   handed on to the Text constructor; an escaped fragment
 *                  never triggers the preceding-line-feed check
 * @param from      start index (inclusive)
 * @param to        end index (exclusive); -1 means "up to the end of text"
 * @param escapeEnd closing escape sequence; only its length is used here
 * @return the new fragment; its new-line flag is set when {@code from} is
 *         smaller than {@code escapeEnd.length() + 2}, or when the fragment
 *         is not escaped and the character at
 *         {@code from - escapeEnd.length() - 1} is a line feed
 */
private static Text resolve2Text(final String text, final boolean escaped, final int from, int to, final String escapeEnd) {
    if (to == -1) {
        to = text.length();
    }
    final String part = text.substring(from, to);
    final int endLen = escapeEnd.length();
    boolean newLineBefore = from < endLen + 2;
    if (!newLineBefore && !escaped) {
        newLineBefore = text.charAt(from - endLen - 1) == '\n';
    }
    return new Text(part, escaped, newLineBefore);
}
/**
 * Concatenates the raw text of all fragments in array order.
 *
 * @param texts fragments to join
 * @return the joined text
 */
static String mergeTexts(final Text[] texts) {
    final StringBuilder merged = new StringBuilder(2000);
    for (final Text part : texts) {
        merged.append(part.getTextPlain());
    }
    return merged.toString();
}
}
}

@ -8,9 +8,9 @@
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -32,20 +32,32 @@ import de.anomic.data.wiki.wikiParserException;
public abstract class AbstractToken implements Token {
protected String text = null;
protected String markup = null;
protected boolean parsed = false;
protected abstract void parse() throws wikiParserException;
public String getMarkup() throws wikiParserException {
if (this.text == null)
throw new IllegalArgumentException();
if (!this.parsed) parse();
return this.markup;
}
public String getText() { return this.text; }
public String toString() { try { return getMarkup(); } catch (final wikiParserException e) { return null; } }
// Source text of the token; getMarkup() refuses to work while this is null.
protected String text = null;
// Markup returned by getMarkup(); expected to be filled in by parse().
protected String markup = null;
// While false, getMarkup() calls parse() before returning the markup.
protected boolean parsed = false;
/**
 * Called by getMarkup() while the parsed flag is false.
 *
 * @throws wikiParserException declared for implementations that cannot produce markup
 */
protected abstract void parse() throws wikiParserException;
/**
 * Returns the markup for the current text, calling parse() first when the
 * parsed flag is not yet set.
 *
 * @return the content of the markup field
 * @throws wikiParserException      if parse() fails
 * @throws IllegalArgumentException if no text has been set
 */
public String getMarkup() throws wikiParserException {
    if (this.text != null) {
        if (!this.parsed) {
            parse();
        }
        return this.markup;
    }
    throw new IllegalArgumentException();
}
/**
 * Returns the unparsed source text of this token.
 *
 * @return the text field, which is null until a text has been set
 */
public String getText() {
return this.text;
}
/**
 * Returns the markup of this token.
 * A wikiParserException raised while producing the markup is not
 * propagated; the method returns null in that case. The
 * IllegalArgumentException thrown by getMarkup() for a missing text is
 * not caught here.
 */
@Override
public String toString() {
    String result;
    try {
        result = getMarkup();
    } catch (final wikiParserException e) {
        result = null;
    }
    return result;
}
}

@ -8,9 +8,9 @@
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -30,39 +30,45 @@ package de.anomic.data.wiki.tokens;
public class DefinitionListToken extends ListToken {
//private static final String[] blockElements = { "dl", "dt", "dd" };
//private static final String[] blockElements = { "dl", "dt", "dd" };
public DefinitionListToken() {
super(';', null, null);
}
/**
 * Creates a list token whose lines start with ';'. Both element names are
 * handed to the super class as null; the dl/dt/dd tags are written
 * directly by this class' own parse method.
 */
public DefinitionListToken() {
super(';', null, null);
}
protected StringBuilder parse(final String[] t, final int depth, final StringBuilder sb) {
sb.append("<dl>\n");
while (super.aktline < t.length && getGrade(t[super.aktline]) >= depth) {
for (int j=0; j<depth + 1; j++) sb.append("\t");
sb.append("<dt>");
@Override
protected StringBuilder parse(final String[] t, final int depth, final StringBuilder sb) {
sb.append("<dl>\n");
while (super.aktline < t.length && getGrade(t[super.aktline]) >= depth) {
for (int j=0; j<depth + 1; j++) sb.append("\t");
sb.append("<dt>");
if (getGrade(t[super.aktline]) > depth) {
parse(t, depth + 1, sb);
} else {
sb.append(t[super.aktline].substring(depth + 1).replaceFirst(":", "</dt><dd>"));
}
if (getGrade(t[super.aktline]) > depth) {
parse(t, depth + 1, sb);
} else {
sb.append(t[super.aktline].substring(depth + 1).replaceFirst(":", "</dt><dd>"));
}
sb.append("</");
if (t[super.aktline].indexOf(':') == -1 || getGrade(t[super.aktline]) > depth)
sb.append("dt");
else
sb.append("dd");
sb.append(">\n");
super.aktline++;
}
for (int j=0; j<depth; j++) sb.append("\t");
sb.append("</dl>");
super.aktline--;
return sb;
}
sb.append("</");
if (t[super.aktline].indexOf(':') == -1 || getGrade(t[super.aktline]) > depth) {
sb.append("dt");
} else {
sb.append("dd");
}
sb.append(">\n");
super.aktline++;
}
for (int j=0; j<depth; j++) {
sb.append("\t");
}
sb.append("</dl>");
super.aktline--;
return sb;
}
@Override
public String[] getBlockElementNames() {
return blockElements;
}
public String[] getBlockElementNames() {
return blockElements;
}
}

@ -8,9 +8,9 @@
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -41,96 +41,102 @@ import de.anomic.search.Switchboard;
public class LinkToken extends AbstractToken {
private static final int IMG = 0;
private static final int IMG = 0;
private static final int BKM = 1;
private static final int INT = 2;
private static final int EXT = 3;
private static final Pattern imgPattern = Pattern.compile(
"\\[\\[" + // begin
"(Image:([^\\]|]|\\][^\\]])*)" + // "Image:" + URL
"(" + // <optional>
"(\\|(bottom|left|center|right|middle|top))?" + // optional align
"(\\|(([^\\]]|\\][^\\]])*))" + // description
")?" + // </optional>
"\\]\\]"); // end
private static final int INT = 2;
private static final int EXT = 3;
private static final Pattern imgPattern = Pattern.compile(
"\\[\\[" + // begin
"(Image:([^\\]|]|\\][^\\]])*)" + // "Image:" + URL
"(" + // <optional>
"(\\|(bottom|left|center|right|middle|top))?" + // optional align
"(\\|(([^\\]]|\\][^\\]])*))" + // description
")?" + // </optional>
"\\]\\]"); // end
private static final Pattern bkmPattern = Pattern.compile(
"\\[\\[" + // begin
"(Bookmark:([^\\]|]|\\][^\\]])*)" + // "Bookmark:" + URL
"(\\|(([^\\]]|\\][^\\]])*?))?" + // optional description
"\\]\\]"); // end
private static final Pattern intPattern = Pattern.compile(
"\\[\\[" + // begin
"(([^\\]|]|\\][^\\]])*?)" + // wiki-page
"(\\|(([^\\]]|\\][^\\]])*?))?" + // optional desciption
"\\]\\]"); // end
private static final Pattern extPattern = Pattern.compile(
"\\[" + // begin
"([^\\] ]*)" + // URL
"( ([^\\]]*))?" + // optional description
"\\]"); // end
private static final Pattern[] patterns = new Pattern[] {
imgPattern, bkmPattern, intPattern, extPattern };
private final String localhost;
private final String wikiPath;
"\\[\\[" + // begin
"(Bookmark:([^\\]|]|\\][^\\]])*)" + // "Bookmark:" + URL
"(\\|(([^\\]]|\\][^\\]])*?))?" + // optional description
"\\]\\]"); // end
// Internal wiki link: [[page]] or [[page|description]].
// parse() reads group 1 as the page name and group 4 as the description.
private static final Pattern intPattern = Pattern.compile(
"\\[\\[" + // begin
"(([^\\]|]|\\][^\\]])*?)" + // wiki-page
"(\\|(([^\\]]|\\][^\\]])*?))?" + // optional description
"\\]\\]"); // end
private static final Pattern extPattern = Pattern.compile(
"\\[" + // begin
"([^\\] ]*)" + // URL
"( ([^\\]]*))?" + // optional description
"\\]"); // end
private static final Pattern[] patterns = new Pattern[] { imgPattern, bkmPattern, intPattern, extPattern };
private final String localhost;
private final String wikiPath;
private final Switchboard sb;
private int patternNr = 0;
private int patternNr = 0;
public LinkToken(final String localhost, final String wikiPath, final Switchboard sb) {
this.localhost = localhost;
this.wikiPath = wikiPath;
public LinkToken(final String localhost, final String wikiPath, final Switchboard sb) {
this.localhost = localhost;
this.wikiPath = wikiPath;
this.sb = sb;
}
}
protected void parse() throws wikiParserException {
final StringBuilder stringBuilder = new StringBuilder();
protected void parse() throws wikiParserException {
final StringBuilder sb = new StringBuilder();
if (this.patternNr < 0 || this.patternNr >= patterns.length)
if (this.patternNr < 0 || this.patternNr >= patterns.length) {
throw new wikiParserException("patternNr was not set correctly: " + this.patternNr);
final Matcher m = patterns[this.patternNr].matcher(this.text);
if (!m.find())
}
final Matcher m = patterns[this.patternNr].matcher(this.text);
if (!m.find()) {
throw new wikiParserException("Didn't find match for: (" + this.patternNr + ") " + this.text);
}
switch (this.patternNr) {
case IMG:
sb.append("<img src=\"").append(formatHref(m.group(1).substring(6))).append("\"");
if (m.group(5) != null) sb.append(" align=\"").append(m.group(5)).append("\"");
sb.append(" alt=\"").append((m.group(7) == null) ? formatHref(m.group(1).substring(6)) : m.group(7)).append("\"");
sb.append(" />");
break;
case IMG:
stringBuilder.append("<img src=\"").append(formatHref(m.group(1).substring(6))).append("\"");
if (m.group(5) != null) {
stringBuilder.append(" align=\"").append(m.group(5)).append("\"");
}
stringBuilder.append(" alt=\"").append((m.group(7) == null) ? formatHref(m.group(1).substring(6)) : m.group(7)).append("\"");
stringBuilder.append(" />");
break;
case BKM:
final Link[] links = getLinksFromBookmarkTag(m.group(2));
if (links == null) {
sb.append("<span class=\"error\">Couldn't find Bookmark-Tag '").append(m.group(2)).append("'.</span>");
stringBuilder.append("<span class=\"error\">Couldn't find Bookmark-Tag '").append(m.group(2)).append("'.</span>");
} else {
appendLinks(links, sb);
appendLinks(links, stringBuilder);
}
break;
case INT:
sb.append(new Link(
"http://" + this.localhost + "/" + this.wikiPath + m.group(1),
m.group(4),
(m.group(4) == null) ? m.group(1) : m.group(4)
).toString());
break;
case EXT:
sb.append(new Link(
m.group(1),
m.group(3),
(m.group(3) == null) ? m.group(1) : m.group(3)
).toString());
break;
}
this.parsed = true;
this.markup = new String(sb);
}
case INT:
stringBuilder.append(new Link(
"http://" + this.localhost + "/" + this.wikiPath + m.group(1),
m.group(4),
(m.group(4) == null) ? m.group(1) : m.group(4)
).toString());
break;
case EXT:
stringBuilder.append(new Link(
m.group(1),
m.group(3),
(m.group(3) == null) ? m.group(1) : m.group(3)
).toString());
break;
}
this.parsed = true;
this.markup = new String(stringBuilder);
}
private String formatHref(final String link) {
if (link.indexOf("://") == -1) { // DATA/HTDOCS-link
@ -171,25 +177,35 @@ public class LinkToken extends AbstractToken {
this.desc = desc;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
sb.append("<a href=\"").append(this.href).append("\"");
if (this.title != null) sb.append(" title=\"").append(this.title).append("\"");
sb.append(">");
if (this.desc == null) sb.append(this.href); else sb.append(this.desc);
sb.append("</a>");
return new String(sb);
final StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("<a href=\"").append(this.href).append("\"");
if (this.title != null) stringBuilder.append(" title=\"").append(this.title).append("\"");
stringBuilder.append(">");
if (this.desc == null) stringBuilder.append(this.href); else stringBuilder.append(this.desc);
stringBuilder.append("</a>");
return new String(stringBuilder);
}
}
public String[] getBlockElementNames() { return null; }
public Pattern[] getRegex() { return patterns; }
/**
 * Link tokens report no block element names.
 *
 * @return always null
 */
public String[] getBlockElementNames() {
return null;
}
/**
 * Returns the shared pattern array in the order image, bookmark, internal
 * link, external link; the array index is the patternNr expected by
 * setText.
 *
 * @return the static pattern array (not copied)
 */
public Pattern[] getRegex() {
return patterns;
}
/**
 * Assigns a new source text together with the index of the pattern that
 * matched it and marks the token as not parsed.
 * A null text additionally clears the markup and sets the pattern index
 * to -1.
 *
 * @param text      matched source text, may be null
 * @param patternNr index into the array returned by getRegex()
 * @return always true
 */
public boolean setText(final String text, final int patternNr) {
    this.parsed = false;
    this.text = text;
    if (text != null) {
        this.patternNr = patternNr;
    } else {
        this.markup = null;
        this.patternNr = -1;
    }
    return true;
}
public boolean setText(final String text, final int patternNr) {
this.text = text;
this.patternNr = patternNr;
this.parsed = false;
if (text == null) { this.markup = null; this.patternNr = -1; }
return true;
}
}

@ -8,9 +8,9 @@
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -33,111 +33,97 @@ import java.util.regex.Pattern;
public class ListToken extends AbstractToken {
protected final String[] blockElements;
protected final char firstChar;
protected final String listBlockElement;
protected final String listElement;
protected final boolean recursion;
protected final Pattern[] pattern;
protected int aktline = 0;
public ListToken(final char firstChar, final String listBlockElement) {
this.firstChar = firstChar;
this.listBlockElement = listBlockElement;
this.listElement = "li";
this.recursion = true;
this.pattern = new Pattern[] { Pattern.compile("^[" + firstChar + "]([^\n]|\n[" + firstChar + "])*", Pattern.MULTILINE) };
final ArrayList<String> r = new ArrayList<String>();
if (this.listBlockElement != null) {
if (this.recursion) r.add(this.listBlockElement);
if (this.listElement != null) r.add(this.listElement);
}
blockElements = r.toArray(new String[r.size()]);
}
public ListToken(final char firstChar, final String listBlockElement, final String listElement) {
this.firstChar = firstChar;
this.listBlockElement = listBlockElement;
this.listElement = listElement;
this.recursion = true;
this.pattern = new Pattern[] { Pattern.compile("^[" + firstChar + "]([^\n]|\n[" + firstChar + "])*", Pattern.MULTILINE) };
final ArrayList<String> r = new ArrayList<String>();
if (this.listBlockElement != null) {
if (this.recursion) r.add(this.listBlockElement);
if (this.listElement != null) r.add(this.listElement);
}
blockElements = r.toArray(new String[r.size()]);
}
public ListToken(final char firstChar, final String listBlockElement, final String listElement, final boolean recursion) {
this.firstChar = firstChar;
this.listBlockElement = listBlockElement;
this.listElement = listElement;
this.recursion = recursion;
this.pattern = new Pattern[] { Pattern.compile("^[" + firstChar + "]([^\n]|\n[" + firstChar + "])*", Pattern.MULTILINE) };
final ArrayList<String> r = new ArrayList<String>();
if (this.listBlockElement != null) {
if (this.recursion) r.add(this.listBlockElement);
if (this.listElement != null) r.add(this.listElement);
}
blockElements = r.toArray(new String[r.size()]);
}
protected void parse() {
final StringBuilder sb = new StringBuilder(this.text.length());
parse(this.text.split("\n"), 0, sb);
this.markup = new String(sb);
this.parsed = true;
}
protected StringBuilder parse(final String[] t, final int depth, final StringBuilder sb) {
if (this.listBlockElement != null) sb.append("<").append(this.listBlockElement).append(">\n");
while (this.aktline < t.length && getGrade(t[this.aktline]) >= depth) {
if (recursion) for (int j=0; j<depth + 1; j++) sb.append("\t");
if (this.listElement != null) sb.append("<").append(this.listElement).append(">");
if (this.recursion && getGrade(t[this.aktline]) > depth) {
parse(t, depth + 1, sb);
} else {
sb.append(t[this.aktline].substring(depth + 1));
}
if (this.listElement != null) sb.append("</").append(this.listElement).append(">");
sb.append("\n");
this.aktline++;
}
if (this.recursion) for (int j=0; j<depth; j++) sb.append("\t");
if (this.listBlockElement != null) sb.append("</").append(this.listBlockElement).append(">");
this.aktline--;
return sb;
}
protected int getGrade(final String t) {
int i = 0;
for (i=0; i<t.length(); i++)
if (t.charAt(i) != this.firstChar) break;
return i - 1;
}
public String[] getBlockElementNames() {
return blockElements;
}
public Pattern[] getRegex() {
return this.pattern;
}
public char getFirstChar() {
return this.firstChar;
}
public boolean setText(final String text, final int patternNr) {
this.text = text;
this.markup = null;
this.parsed = false;
this.aktline = 0;
return true;
}
// Element names returned by getBlockElementNames(); built once in the constructor.
protected final String[] blockElements;
// Character that marks a list line; the number of leading repetitions gives the nesting grade.
protected final char firstChar;
// Tag name wrapped around a whole list, or null to emit no wrapper.
protected final String listBlockElement;
// Tag name wrapped around every list line, or null to emit no wrapper.
protected final String listElement;
// Whether deeper grades are rendered as nested lists (also enables tab indentation).
protected final boolean recursion;
// Single-entry array holding the pattern that matches a run of list lines.
protected final Pattern[] pattern;
// Index of the line currently processed by parse(String[], int, StringBuilder).
protected int aktline = 0;
/**
 * Creates a list token that uses "li" as the per-line element.
 *
 * @param firstChar        character that marks a list line
 * @param listBlockElement tag name wrapped around the whole list
 */
public ListToken(final char firstChar, final String listBlockElement) {
this(firstChar, listBlockElement, "li");
}
/**
 * Creates a list token with recursion enabled.
 *
 * @param firstChar        character that marks a list line
 * @param listBlockElement tag name wrapped around the whole list
 * @param listElement      tag name wrapped around every list line
 */
public ListToken(final char firstChar, final String listBlockElement, final String listElement) {
this(firstChar, listBlockElement, listElement, true);
}
/**
 * Creates a list token.
 * The pattern matches a line starting with {@code firstChar} plus all
 * directly following lines that start with the same character.
 * The block element names are only collected when
 * {@code listBlockElement} is not null: the block element itself is added
 * if recursion is enabled, followed by the line element if it is not null.
 *
 * @param firstChar        character that marks a list line
 * @param listBlockElement tag name wrapped around the whole list, may be null
 * @param listElement      tag name wrapped around every list line, may be null
 * @param recursion        whether deeper grades are rendered as nested lists
 */
public ListToken(final char firstChar, final String listBlockElement, final String listElement, final boolean recursion) {
    this.firstChar = firstChar;
    this.listBlockElement = listBlockElement;
    this.listElement = listElement;
    this.recursion = recursion;

    // Quote the marker instead of placing it raw into a character class, so
    // that '^', ']', '[' and '\\' are matched literally as well.
    final String marker = Pattern.quote(String.valueOf(firstChar));
    this.pattern = new Pattern[] {
        Pattern.compile("^" + marker + "([^\n]|\n" + marker + ")*", Pattern.MULTILINE)
    };

    final ArrayList<String> r = new ArrayList<String>();
    if (this.listBlockElement != null) {
        if (this.recursion) {
            r.add(this.listBlockElement);
        }
        if (this.listElement != null) {
            r.add(this.listElement);
        }
    }
    this.blockElements = r.toArray(new String[r.size()]);
}
/**
 * Splits the current text into lines, renders them starting at depth 0
 * and stores the result as markup.
 */
protected void parse() {
    final StringBuilder out = new StringBuilder(this.text.length());
    final String[] lines = this.text.split("\n");
    parse(lines, 0, out);
    this.markup = out.toString();
    this.parsed = true;
}
/**
 * Renders all consecutive lines, starting at the current line index, whose
 * grade is at least {@code depth}.
 * With recursion enabled, a line of a deeper grade opens a nested list by
 * calling this method again with {@code depth + 1}.
 *
 * @param t     all lines of the list
 * @param depth grade handled by this invocation
 * @param sb    builder receiving the markup
 * @return the builder passed in
 */
protected StringBuilder parse(final String[] t, final int depth, final StringBuilder sb) {
    final boolean hasBlock = this.listBlockElement != null;
    final boolean hasItem = this.listElement != null;

    if (hasBlock) {
        sb.append("<").append(this.listBlockElement).append(">\n");
    }
    while (this.aktline < t.length && getGrade(t[this.aktline]) >= depth) {
        if (this.recursion) {
            for (int tabs = 0; tabs <= depth; tabs++) {
                sb.append("\t");
            }
        }
        if (hasItem) {
            sb.append("<").append(this.listElement).append(">");
        }
        if (this.recursion && getGrade(t[this.aktline]) > depth) {
            // deeper grade: the nested call advances the line index itself
            parse(t, depth + 1, sb);
        } else {
            sb.append(t[this.aktline].substring(depth + 1));
        }
        if (hasItem) {
            sb.append("</").append(this.listElement).append(">");
        }
        sb.append("\n");
        this.aktline++;
    }
    if (this.recursion) {
        for (int tabs = 0; tabs < depth; tabs++) {
            sb.append("\t");
        }
    }
    if (hasBlock) {
        sb.append("</").append(this.listBlockElement).append(">");
    }
    // step back one line because the caller's loop increments the index again
    this.aktline--;
    return sb;
}
/**
 * Determines the nesting grade of a line: the number of leading
 * {@code firstChar} characters minus one.
 *
 * @param t line to inspect
 * @return 0 for a single leading marker, 1 for two, ...; -1 if the line
 *         does not start with the marker
 */
protected int getGrade(final String t) {
    int count = 0;
    while (count < t.length() && t.charAt(count) == this.firstChar) {
        count++;
    }
    return count - 1;
}
/**
 * Returns the element names collected by the constructor.
 *
 * @return the internal array (not copied)
 */
public String[] getBlockElementNames() {
return blockElements;
}
/**
 * Returns the pattern array built by the constructor.
 *
 * @return the internal single-entry array (not copied)
 */
public Pattern[] getRegex() {
return this.pattern;
}
/**
 * Returns the character that marks a list line for this token.
 */
public char getFirstChar() {
return this.firstChar;
}
/**
 * Assigns a new source text and resets markup, parsed flag and line index.
 *
 * @param text      matched list text
 * @param patternNr not used by this token
 * @return always true
 */
public boolean setText(final String text, final int patternNr) {
    // reset everything derived from the previous text
    this.aktline = 0;
    this.parsed = false;
    this.markup = null;
    this.text = text;
    return true;
}
}

@ -8,9 +8,9 @@
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -36,106 +36,117 @@ import de.anomic.data.wiki.wikiParserException;
public class SimpleToken extends AbstractToken {
protected String content = null;
protected int grade = 0;
protected final Pattern[] pattern;
private final String[][] definitionList;
private final String[] blockElements;
public SimpleToken(final char firstChar, final char lastChar, final String[][] definitionList, final boolean isBlockElements) {
this.definitionList = definitionList;
int i;
if (isBlockElements) {
final ArrayList<String> r = new ArrayList<String>();
int j;
for (i = 0; i < definitionList.length; i++)
if (definitionList[i] != null)
for (j = 0; j < definitionList[i].length; j++)
r.add(definitionList[i][j]);
this.blockElements = r.toArray(new String[r.size()]);
} else {
this.blockElements = null;
}
for (i=0; i<definitionList.length; i++)
if (definitionList[i] != null) {
i++;
break;
}
this.pattern = new Pattern[] { Pattern.compile(
"([\\" + firstChar + "]{" + i + "," + definitionList.length + "})" +
protected String content = null;
protected int grade = 0;
protected final Pattern[] pattern;
private final String[][] definitionList;
private final String[] blockElements;
public SimpleToken(final char firstChar, final char lastChar, final String[][] definitionList, final boolean isBlockElements) {
this.definitionList = definitionList;
int i;
if (isBlockElements) {
final ArrayList<String> r = new ArrayList<String>();
int j;
for (i = 0; i < definitionList.length; i++)
if (definitionList[i] != null)
for (j = 0; j < definitionList[i].length; j++)
r.add(definitionList[i][j]);
this.blockElements = r.toArray(new String[r.size()]);
} else {
this.blockElements = null;
}
for (i=0; i<definitionList.length; i++) {
if (definitionList[i] != null) {
i++;
break;
}
}
this.pattern = new Pattern[] {
Pattern.compile(
"([\\" + firstChar + "]{" + i + "," + definitionList.length + "})" +
"(.*?)" +
"([\\" + lastChar + "]{" + i + "," + definitionList.length + "})")};
}
public String getMarkup() throws wikiParserException {
if (this.content == null) {
if (this.text == null) {
throw new IllegalArgumentException();
}
setText(this.text, 0);
}
if (!this.parsed) parse();
return this.markup;
}
protected void parse() throws wikiParserException {
String[] e;
if (this.grade >= this.definitionList.length || (e = this.definitionList[this.grade]) == null)
throw new wikiParserException("Token not defined for grade: " + this.grade);
this.markup = getMarkup(e);
this.parsed = true;
}
/**
 * Wraps the current content in the given tags.
 *
 * @param es tag strings, outermost first
 * @return opening tags, followed by the content, followed by the closing tags
 */
protected String getMarkup(final String[] es) {
    final StringBuilder wrapped = new StringBuilder();
    wrapped.append(getMarkup(es, false));
    wrapped.append(this.content);
    wrapped.append(getMarkup(es, true));
    return wrapped.toString();
}
/**
 * Renders the given tag strings either as opening or as closing tags.
 * Opening tags are emitted in array order and keep the full string (which may
 * carry attributes). Closing tags are emitted in reverse order and only use
 * the part of each string before its first space.
 *
 * @param es      tag strings, outermost first
 * @param closing true to create closing tags, false to create opening tags
 * @return the concatenated tags
 */
protected String getMarkup(final String[] es, final boolean closing) {
    final StringBuilder tags = new StringBuilder();
    if (closing) {
        // innermost element has to be closed first
        for (int k = es.length - 1; k >= 0; k--) {
            tags.append("<");
            tags.append("/");
            final int space = es[k].indexOf(' ');
            tags.append(space > -1 ? es[k].substring(0, space) : es[k]);
            tags.append(">");
        }
    } else {
        for (int k = 0; k < es.length; k++) {
            tags.append("<").append(es[k]).append(">");
        }
    }
    return tags.toString();
}
/**
 * Stores a new text for this token, discards any previously generated markup
 * and tries to split the text into marker runs and content.
 *
 * @param text      the raw text of the token, may be null
 * @param patternNr not used by this implementation
 * @return true if the text matches the token pattern, the opening and closing
 *         marker runs have the same length and a definition exists for that
 *         length; false otherwise (grade and content are then left unchanged)
 */
public boolean setText(final String text, final int patternNr) {
    this.text = text;
    this.markup = null;
    this.parsed = false;

    if (text == null) {
        return false;
    }
    final Matcher m = getRegex()[0].matcher(text);
    if (!m.matches()) {
        return false;
    }
    final int runLength = m.group(1).length();
    if (runLength != m.group(3).length()) {
        return false;
    }
    if (definitionList.length < runLength) {
        return false;
    }
    if (definitionList[runLength - 1] == null) {
        return false;
    }
    this.grade = runLength - 1;
    this.content = m.group(2);
    return true;
}
/**
 * @return the array holding the pattern compiled in the constructor
 */
public Pattern[] getRegex() {
    return this.pattern;
}
/**
 * @return the tag strings collected in the constructor, or null if this
 *         token was not created as a block element
 */
public String[] getBlockElementNames() {
    return this.blockElements;
}
"([\\" + lastChar + "]{" + i + "," + definitionList.length + "})")
};
}
/**
 * Returns the markup generated for this token, doing the work lazily.
 * If no content has been extracted yet, the stored text is passed through
 * setText() first; if the token has not been parsed yet, parse() is called.
 *
 * @return the generated markup
 * @throws IllegalArgumentException if neither content nor text is available
 * @throws wikiParserException if parse() fails
 */
@Override
public String getMarkup() throws wikiParserException {
    if (this.content == null && this.text == null) {
        throw new IllegalArgumentException();
    }
    if (this.content == null) {
        setText(this.text, 0);
    }
    if (!this.parsed) {
        parse();
    }
    return this.markup;
}
/**
 * Builds the markup for the current grade and marks the token as parsed.
 *
 * @throws wikiParserException if the grade lies beyond the definition list
 *         or the definition for the grade is null
 */
protected void parse() throws wikiParserException {
    final String[] tags =
        (this.grade < this.definitionList.length) ? this.definitionList[this.grade] : null;
    if (tags == null) {
        throw new wikiParserException("Token not defined for grade: " + this.grade);
    }
    this.markup = getMarkup(tags);
    this.parsed = true;
}
/**
 * Wraps the current content in the given tags.
 *
 * @param es tag strings, outermost first
 * @return opening tags, followed by the content, followed by the closing tags
 */
protected String getMarkup(final String[] es) {
    final StringBuilder wrapped = new StringBuilder();
    wrapped.append(getMarkup(es, false));
    wrapped.append(this.content);
    wrapped.append(getMarkup(es, true));
    return wrapped.toString();
}
/**
 * Renders the given tag strings either as opening or as closing tags.
 * Opening tags are emitted in array order and keep the full string (which may
 * carry attributes). Closing tags are emitted in reverse order and only use
 * the part of each string before its first space.
 *
 * @param es      tag strings, outermost first
 * @param closing true to create closing tags, false to create opening tags
 * @return the concatenated tags
 */
protected String getMarkup(final String[] es, final boolean closing) {
    final StringBuilder tags = new StringBuilder();
    if (closing) {
        // innermost element has to be closed first
        for (int k = es.length - 1; k >= 0; k--) {
            tags.append("<");
            tags.append("/");
            final int space = es[k].indexOf(' ');
            tags.append(space > -1 ? es[k].substring(0, space) : es[k]);
            tags.append(">");
        }
    } else {
        for (int k = 0; k < es.length; k++) {
            tags.append("<").append(es[k]).append(">");
        }
    }
    return tags.toString();
}
/**
 * Stores a new text for this token, discards any previously generated markup
 * and tries to split the text into marker runs and content.
 *
 * @param text      the raw text of the token, may be null
 * @param patternNr not used by this implementation
 * @return true if the text matches the token pattern, the opening and closing
 *         marker runs have the same length and a definition exists for that
 *         length; false otherwise (grade and content are then left unchanged)
 */
public boolean setText(final String text, final int patternNr) {
    this.text = text;
    this.markup = null;
    this.parsed = false;

    if (text == null) {
        return false;
    }
    final Matcher m = getRegex()[0].matcher(text);
    if (!m.matches()) {
        return false;
    }
    final int runLength = m.group(1).length();
    if (runLength != m.group(3).length()) {
        return false;
    }
    if (definitionList.length < runLength) {
        return false;
    }
    if (definitionList[runLength - 1] == null) {
        return false;
    }
    this.grade = runLength - 1;
    this.content = m.group(2);
    return true;
}
/**
 * @return the array holding the pattern compiled in the constructor
 */
public Pattern[] getRegex() {
return this.pattern;
}
/**
 * @return the tag strings collected in the constructor, or null if this
 *         token was not created as a block element
 */
public String[] getBlockElementNames() {
return this.blockElements;
}
}

@ -8,9 +8,9 @@
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -34,46 +34,49 @@ import java.util.regex.Pattern;
public class TableToken extends AbstractToken {
/**
 * Pattern for a complete table block: it starts with "{|", ends with a line
 * consisting of "|}", and every line break in between has to be followed by
 * "||" or "|-".
 */
private static final Pattern[] pattern = new Pattern[] {
Pattern.compile(
"\\{\\|" + // "{|"
"([^\n]|\n\\|[|-])*\n" + // new line must start with "||" or "|-"
"\\|\\}") // "|}"
};
/** Names returned by getBlockElementNames(). */
private static final String[] blockElementNames = new String[] { "table", "tr", "td" };
/**
 * Pattern for a complete table block: it starts with "{|", ends with a line
 * consisting of "|}", and every line break in between has to be followed by
 * "||" or "|-".
 */
private static final Pattern[] pattern = new Pattern[] {
Pattern.compile(
"\\{\\|" + // "{|"
"([^\n]|\n\\|[|-])*\n" + // new line must start with "||" or "|-"
"\\|\\}") // "|}"
};
/**
 * Converts the stored wiki table text into HTML table markup.
 * The first line may carry table properties after its first two characters.
 * A line starting with "|-" closes an open row and opens a new one (no new
 * row is opened if it is the line directly before the last line). A line
 * starting with "||" is split at "||" into cells; inside a cell everything
 * up to the first '|' is treated as cell properties. Empty cells are skipped.
 * The last line is never interpreted as a row.
 */
protected void parse() {
    final String[] lines = text.split("\n");
    final int lastLine = lines.length - 1;

    final StringBuilder html = new StringBuilder();
    html.append("<table");
    if (lines[0].length() > 2) {
        html.append(parseTableProperties(lines[0].substring(2)));
    }
    html.append(">\n");

    boolean rowOpen = false;
    for (int n = 1; n < lastLine; n++) {
        final String line = lines[n];
        if (line.startsWith("|-")) {
            if (rowOpen) {
                html.append("\t</tr>\n");
            }
            rowOpen = (n < lastLine - 1);
            if (rowOpen) {
                html.append("\t<tr>\n");
            }
            continue;
        }
        if (!line.startsWith("||")) {
            continue;
        }
        for (final String cell : line.split("\\|\\|")) {
            final int bar = cell.indexOf('|');
            if (cell.length() <= bar + 1) {
                continue; // don't print empty td's
            }
            html.append("\t\t<td");
            if (bar > -1) {
                html.append(parseTableProperties(cell.substring(0, bar)));
            }
            html.append(">").append(cell.substring(bar + 1)).append("</td>\n");
        }
    }
    if (rowOpen) {
        html.append("\t</tr>\n");
    }
    html.append("</table>");

    this.markup = html.toString();
    this.parsed = true;
}
/** Names returned by getBlockElementNames(). */
private static final String[] blockElementNames = new String[] { "table", "tr", "td" };
/**
 * Converts the stored wiki table text into HTML table markup.
 * The first line may carry table properties after its first two characters.
 * A line starting with "|-" closes an open row and opens a new one (no new
 * row is opened if it is the line directly before the last line). A line
 * starting with "||" is split at "||" into cells; inside a cell everything
 * up to the first '|' is treated as cell properties. Empty cells are skipped.
 * The last line is never interpreted as a row.
 */
protected void parse() {
    final String[] lines = text.split("\n");
    final int lastLine = lines.length - 1;

    final StringBuilder html = new StringBuilder();
    html.append("<table");
    if (lines[0].length() > 2) {
        html.append(parseTableProperties(lines[0].substring(2)));
    }
    html.append(">\n");

    boolean rowOpen = false;
    for (int n = 1; n < lastLine; n++) {
        final String line = lines[n];
        if (line.startsWith("|-")) {
            if (rowOpen) {
                html.append("\t</tr>\n");
            }
            rowOpen = (n < lastLine - 1);
            if (rowOpen) {
                html.append("\t<tr>\n");
            }
            continue;
        }
        if (!line.startsWith("||")) {
            continue;
        }
        for (final String cell : line.split("\\|\\|")) {
            final int bar = cell.indexOf('|');
            if (cell.length() <= bar + 1) {
                continue; // don't print empty td's
            }
            html.append("\t\t<td");
            if (bar > -1) {
                html.append(parseTableProperties(cell.substring(0, bar)));
            }
            html.append(">").append(cell.substring(bar + 1)).append("</td>\n");
        }
    }
    if (rowOpen) {
        html.append("\t</tr>\n");
    }
    html.append("</table>");

    this.markup = html.toString();
    this.parsed = true;
}
// from de.anomic.data.wikiCode.java.parseTableProperties, modified by [FB]
// Property names that are accepted when their value consists of digits only.
// The array is sorted in the static initializer so it can be binary-searched.
private static final String[] tps = { "rowspan", "colspan", "vspace", "hspace", "cellspacing", "cellpadding", "border" };
private static final String[] tps = { "rowspan", "colspan", "vspace", "hspace", "cellspacing", "cellpadding", "border" };
// Maps a property name to the array of values accepted for it (looked up via binary search).
private static final HashMap<String, String[]> ps = new HashMap<String, String[]>();
static {
Arrays.sort(tps);
String[] array;
@ -87,7 +90,7 @@ public class TableToken extends AbstractToken {
ps.put("align", array);
}
// contributed by [MN]
// contributed by [MN]
/** This method takes possible table properties and tests if they are valid.
* Valid in this case means if they are a property for the table, tr or td
* tag as stated in the HTML Pocket Reference by Jennifer Niederst (1st edition)
@ -102,21 +105,21 @@ public class TableToken extends AbstractToken {
String[] posVals;
final int numberofvalues = values.length;
for (int i=0; i<numberofvalues; i++) {
key = values[i].trim();
key = values[i].trim();
if (key.equals("nowrap")) {
addPair("nowrap", "nowrap", sb);
} else if (i + 1 < numberofvalues) {
value = values[++i].trim();
if (
(key.equals("summary")) ||
(key.equals("bgcolor") && value.matches("#{0,1}[0-9a-fA-F]{1,6}|[a-zA-Z]{3,}")) ||
((key.equals("width") || key.equals("height")) && value.matches("\\d+%{0,1}")) ||
((posVals = ps.get(key)) != null && Arrays.binarySearch(posVals, value) >= 0) ||
(Arrays.binarySearch(tps, key) >= 0 && value.matches("\\d+"))
) {
addPair(key, value, sb);
}
}
value = values[++i].trim();
if (
(key.equals("summary")) ||
(key.equals("bgcolor") && value.matches("#{0,1}[0-9a-fA-F]{1,6}|[a-zA-Z]{3,}")) ||
((key.equals("width") || key.equals("height")) && value.matches("\\d+%{0,1}")) ||
((posVals = ps.get(key)) != null && Arrays.binarySearch(posVals, value) >= 0) ||
(Arrays.binarySearch(tps, key) >= 0 && value.matches("\\d+"))
) {
addPair(key, value, sb);
}
}
}
return sb;
}
@ -125,13 +128,19 @@ public class TableToken extends AbstractToken {
return sb.append(" ").append(key).append("=\"").append(value).append("\"");
}
/**
 * @return the static array holding the table pattern
 */
public Pattern[] getRegex() {
    return pattern;
}
/**
 * @return the static array of block element names ("table", "tr", "td")
 */
public String[] getBlockElementNames() {
    return blockElementNames;
}
/**
 * @return the static array holding the table pattern
 */
public Pattern[] getRegex() {
return pattern;
}
/**
 * @return the static array of block element names ("table", "tr", "td")
 */
public String[] getBlockElementNames() {
return blockElementNames;
}
/**
 * Stores a new text for this token and discards previously generated markup.
 * The text is not validated here.
 *
 * @param text      the raw table text
 * @param patternNr not used by this implementation
 * @return always true
 */
public boolean setText(final String text, final int patternNr) {
    // invalidate the cached result before taking over the new text
    this.markup = null;
    this.parsed = false;
    this.text = text;
    return true;
}
/**
 * Stores a new text for this token and discards previously generated markup.
 * The text is not validated here.
 *
 * @param text      the raw table text
 * @param patternNr not used by this implementation
 * @return always true
 */
public boolean setText(final String text, final int patternNr) {
    // invalidate the cached result before taking over the new text
    this.markup = null;
    this.parsed = false;
    this.text = text;
    return true;
}
}

@ -8,9 +8,9 @@
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -34,9 +34,9 @@ import de.anomic.data.wiki.wikiParserException;
/**
 * A piece of wiki text that can be recognised by regular expressions and
 * converted into markup.
 */
public interface Token {
/** @return the patterns this token is recognised by */
public Pattern[] getRegex();
/**
 * Hands the raw text to the token.
 * @param text the raw text
 * @param patternNr presumably the index of the pattern from getRegex() that matched - TODO confirm against callers
 * @return whether the token accepted the text
 */
public boolean setText(String text, int patternNr);
/** @return the raw text of this token */
public String getText();
/**
 * @return the markup generated from the text
 * @throws wikiParserException if the markup cannot be generated
 */
public String getMarkup() throws wikiParserException;
/** @return names of the block elements this token produces; implementations may return null */
public String[] getBlockElementNames();
public Pattern[] getRegex();
public boolean setText(String text, int patternNr);
public String getText();
public String getMarkup() throws wikiParserException;
public String[] getBlockElementNames();
}

@ -3,18 +3,21 @@
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//last major change: 20.07.2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@ -41,7 +44,7 @@ public class wikiBoard {
public static final int keyLength = 64;
private static final String dateFormat = "yyyyMMddHHmmss";
static SimpleDateFormat SimpleFormatter = new SimpleDateFormat(dateFormat);
private static final SimpleDateFormat SimpleFormatter = new SimpleDateFormat(dateFormat);
static {
SimpleFormatter.setTimeZone(TimeZone.getTimeZone("GMT"));

@ -1,3 +1,28 @@
// wikiParser.java
// ---------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2007
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data.wiki;
import java.io.UnsupportedEncodingException;

@ -1,3 +1,28 @@
// wikiParserException.java
// ---------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2007
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data.wiki;
public class wikiParserException extends Exception {

Loading…
Cancel
Save