- extended experimental wikipedia dump parser

- removed historic, possibly unused code from wiki parser that was in conflict with actual wikipedia wiki code

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5790 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent c3aff2521e
commit d4d87d90c4

@ -64,7 +64,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
private boolean defList = false; //needed for definition lists private boolean defList = false; //needed for definition lists
private boolean escape = false; //needed for escape private boolean escape = false; //needed for escape
private boolean escaped = false; //needed for <pre> not getting in the way private boolean escaped = false; //needed for <pre> not getting in the way
private boolean escapeSpan = false; //needed for escape symbols [= and =] spanning over several lines
private boolean newrowstart=false; //needed for the first row not to be empty private boolean newrowstart=false; //needed for the first row not to be empty
private boolean nolist = false; //needed for handling of [= and <pre> in lists private boolean nolist = false; //needed for handling of [= and <pre> in lists
private boolean preformatted = false; //needed for preformatted text private boolean preformatted = false; //needed for preformatted text
@ -72,7 +71,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
private boolean replacedHTML = false; //indicates if method replaceHTML has been used with line already private boolean replacedHTML = false; //indicates if method replaceHTML has been used with line already
private boolean table = false; //needed for tables, because they reach over several lines private boolean table = false; //needed for tables, because they reach over several lines
private int preindented = 0; //needed for indented <pre>s private int preindented = 0; //needed for indented <pre>s
private int escindented = 0; //needed for indented [=s
private int headlines = 0; //number of headlines in page private int headlines = 0; //number of headlines in page
private final ArrayList<String> dirElements = new ArrayList<String>(); //list of headlines used to create diectory of page private final ArrayList<String> dirElements = new ArrayList<String>(); //list of headlines used to create diectory of page
@ -473,78 +471,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
return result; return result;
} }
/**
 * Handles the escape tags {@code [=} and {@code =]}.
 *
 * Text enclosed by the tags is taken out of the line, the remainder of the line is
 * passed through {@code transformLine} with a placeholder in its place, and the
 * untouched text is put back afterwards. Four situations are distinguished:
 * both tags in one line, an opening tag only (starts a multi-line escape), a closing
 * tag only while a multi-line escape is open, and a closing tag without an open escape
 * (the surplus tag is simply removed).
 *
 * The placeholder {@code !esc!txt!} is protected against clashes with user text by
 * temporarily doubling the trailing '!' of every {@code !esc!} found in the input.
 *
 * All substitutions use {@code String.replace}, i.e. literal replacement. The former
 * {@code replaceAll} calls interpreted '$' and '\' inside the escaped text as regex
 * replacement syntax, which threw exceptions or dropped characters for such input.
 *
 * Reads and writes the instance state {@code escape}, {@code escaped}, {@code escapeSpan},
 * {@code nolist} and {@code escindented}; does nothing but return the line unchanged
 * when {@code preformatted} is set.
 *
 * @param result the line to process
 * @return the processed line
 */
//contributed by [MN]
private String escapeTag(String result){
    int p0 = 0;
    int p1 = 0;
    //both [= and =] in the same line
    if(((p0 = result.indexOf("[="))>=0)&&((p1 = result.indexOf("=]"))>0)&&(!(preformatted))){
        if(p0<p1){
            String escapeText = result.substring(p0+2,p1);
            escapeText = escapeText.replace("!esc!", "!esc!!");
            //transform everything around the escaped text, with a placeholder standing in for it
            result = transformLine(result.substring(0,p0).replace("!esc!", "!esc!!")+"!esc!txt!"+result.substring(p1+2).replace("!esc!", "!esc!!"));
            //literal replacement: escapeText may contain '$' or '\'
            result = result.replace("!esc!txt!", escapeText);
            result = result.replace("!esc!!", "!esc!");
        }
        //handles cases like [=[= =]=] [= =] that would cause an exception otherwise
        else{
            escape = true;
            //NOTE(review): substring(0,p0-1) also drops the character directly in front of "[=" - confirm this is intended
            final String temp1 = transformLine(result.substring(0,p0-1).replace("!tmp!","!tmp!!")+"!tmp!txt!");
            nolist = true;
            final String temp2 = transformLine(result.substring(p0));
            nolist = false;
            //literal replacement: temp2 is transformed user text and may contain '$' or '\'
            result = temp1.replace("!tmp!txt!",temp2);
            result = result.replace("!tmp!!", "!tmp!");
            escape = false;
        }
    }
    //start [=
    else if(((p0 = result.indexOf("[="))>=0)&&(!escapeSpan)&&(!preformatted)){
        escape = true;    //prevent surplus line breaks
        escaped = true;   //prevents <pre> being parsed
        String bq = "";   //gets filled with <blockquote>s as needed
        String escapeText = result.substring(p0+2);
        escapeText = escapeText.replace("!esc!", "!esc!!");
        //taking care of indented lines: every leading ':' becomes one <blockquote>
        while(result.substring(escindented,p0).startsWith(":")){
            escindented++;
            bq = bq + "<blockquote>";
        }
        result = transformLine(result.substring(escindented,p0).replace("!esc!", "!esc!!")+"!esc!txt!");
        result = bq + result.replace("!esc!txt!", escapeText);
        result = result.replace("!esc!!", "!esc!");
        escape = false;
        escapeSpan = true;
    }
    //end =]
    else if(((p0 = result.indexOf("=]"))>=0)&&(escapeSpan)&&(!preformatted)){
        escapeSpan = false;
        String bq = "";   //gets filled with </blockquote>s as needed
        String escapeText = result.substring(0,p0);
        escapeText = escapeText.replace("!esc!", "!esc!!");
        //taking care of indented lines: close every blockquote opened by the start tag
        while(escindented > 0){
            bq = bq + "</blockquote>";
            escindented--;
        }
        result = transformLine("!esc!txt!"+result.substring(p0+2).replace("!esc!", "!esc!!"));
        result = result.replace("!esc!txt!", escapeText) + bq;
        result = result.replace("!esc!!", "!esc!");
        escaped = false;
    }
    //Getting rid of surplus =]
    else if (((p0 = result.indexOf("=]"))>=0)&&(!escapeSpan)&&(!preformatted)){
        //removed one at a time on purpose: a removal may join characters into a new "=]" (e.g. "==]]")
        while((p0 = result.indexOf("=]"))>=0){
            result = result.substring(0,p0)+result.substring(p0+2);
        }
        result = transformLine(result);
    }
    return result;
}
/** This method handles the preformatted tags <pre> </pre> */ /** This method handles the preformatted tags <pre> </pre> */
//contributed by [MN] //contributed by [MN]
private String preformattedTag(String result){ private String preformattedTag(String result){
@ -757,18 +683,10 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
replacedHTML = true; replacedHTML = true;
} }
//check if line contains escape symbols([= =]) or if we are in an escape sequence already.
if ((result.indexOf("[=")>=0)||(result.indexOf("=]")>=0)||(escapeSpan)){
result = escapeTag(result);
}
//check if line contains preformatted symbols or if we are in a preformatted sequence already. //check if line contains preformatted symbols or if we are in a preformatted sequence already.
else if ((result.indexOf("&lt;pre&gt;")>=0)||(result.indexOf("&lt;/pre&gt;")>=0)||(preformattedSpan)){ if ((result.indexOf("&lt;pre&gt;")>=0)||(result.indexOf("&lt;/pre&gt;")>=0)||(preformattedSpan)){
result = preformattedTag(result); result = preformattedTag(result);
} } else {
//transform page as usual
else {
//tables first -> wiki-tags in cells can be treated after that //tables first -> wiki-tags in cells can be treated after that
result = processTable(result); result = processTable(result);

@ -50,6 +50,8 @@ import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.kelondro.util.ByteBuffer; import de.anomic.kelondro.util.ByteBuffer;
/* /*
@ -59,6 +61,8 @@ import de.anomic.kelondro.util.ByteBuffer;
public class mediawikiIndex { public class mediawikiIndex {
private static final String textstart = "<text";
private static final String textend = "</text>";
private static final String pagestart = "<page>"; private static final String pagestart = "<page>";
private static final String pageend = "</page>"; private static final String pageend = "</page>";
private static final byte[] pagestartb = pagestart.getBytes(); private static final byte[] pagestartb = pagestart.getBytes();
@ -388,28 +392,38 @@ public class mediawikiIndex {
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is)); BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is));
String t; String t;
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
boolean read = false; boolean page = false, text = false;
String title = null; String title = null;
wikiParser wparser = new wikiCode("de.wikipedia.org");
//plasmaParser hparser = new plasmaParser();
while ((t = r.readLine()) != null) { while ((t = r.readLine()) != null) {
if (t.indexOf(pagestart) >= 0) { if (t.indexOf(pagestart) >= 0) {
read = true; page = true;
continue; continue;
} }
if (t.indexOf(pageend) >= 0) { if (t.indexOf(textstart) >= 0) {
read = false; text = page;
continue;
}
if (t.indexOf(textend) >= 0) {
text = false;
System.out.println("Title: " + title); System.out.println("Title: " + title);
System.out.println(sb); System.out.println(wparser.transform(sb.toString()));
System.out.println(); System.out.println();
sb.setLength(0); sb.setLength(0);
continue; continue;
} }
if (t.indexOf(pageend) >= 0) {
page = false;
continue;
}
if (t.indexOf("<title>") >= 0) { if (t.indexOf("<title>") >= 0) {
title = t.substring(t.indexOf("<title>") + 7); title = t.substring(t.indexOf("<title>") + 7);
int p = title.indexOf("</title>"); int p = title.indexOf("</title>");
if (p >= 0) title = title.substring(0, p); if (p >= 0) title = title.substring(0, p);
continue; continue;
} }
if (read) { if (text) {
sb.append(t); sb.append(t);
sb.append('\n'); sb.append('\n');
} }

Loading…
Cancel
Save