- extended experimental wikipedia dump parser

- removed historic, possibly unused code from wiki parser that was in conflict with actual wikipedia wiki code git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5790 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · d4d87d90c4
parent c3aff2521e
commit d4d87d90c4
2 changed files with 22 additions and 90 deletions
--- a/source/de/anomic/data/wiki/wikiCode.java
+++ b/source/de/anomic/data/wiki/wikiCode.java
@ -64,7 +64,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
    private boolean defList = false;            //needed for definition lists
    private boolean escape = false;             //needed for escape
    private boolean escaped = false;            //needed for <pre> not getting in the way
    private boolean escapeSpan = false;         //needed for escape symbols [= and =] spanning over several lines
    private boolean newrowstart=false;          //needed for the first row not to be empty
    private boolean nolist = false;             //needed for handling of [= and <pre> in lists
    private boolean preformatted = false;       //needed for preformatted text
@ -72,7 +71,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
    private boolean replacedHTML = false;       //indicates if method replaceHTML has been used with line already
    private boolean table = false;              //needed for tables, because they reach over several lines
    private int preindented = 0;                //needed for indented <pre>s
    private int escindented = 0;                //needed for indented [=s
    private int headlines = 0;                  //number of headlines in page
    private final ArrayList<String> dirElements = new ArrayList<String>();    //list of headlines used to create directory of page
@ -473,78 +471,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
        return result;
    }
    /**
     * Handles the escape tags <code>[=</code> and <code>=]</code> in a single line.
     * <p>
     * Text between the tags is taken out of the line, the remainder is passed
     * through <code>transformLine</code>, and the untouched escaped text is put
     * back afterwards using a temporary marker. Nothing is done while
     * <code>preformatted</code> is set. Four situations are distinguished:
     * <ol>
     * <li>both tags occur in the line,</li>
     * <li>only <code>[=</code> occurs and no escape span is open: a span is opened,</li>
     * <li><code>=]</code> occurs while a span is open: the span is closed,</li>
     * <li><code>=]</code> occurs without an open span: every surplus <code>=]</code> is removed.</li>
     * </ol>
     * All substitutions are literal (<code>String.replace</code>), so escaped text
     * containing <code>$</code> or <code>\</code> is inserted verbatim instead of being
     * interpreted as a regex replacement pattern.
     *
     * @param result the line to process
     * @return the processed line, or the unchanged line if no case applies
     */
    //contributed by [MN]
    private String escapeTag(String result){
        // every case below requires that we are not inside preformatted text
        if (preformatted) {
            return result;
        }

        final int open = result.indexOf("[=");
        final int close = result.indexOf("=]");

        //both [= and =] in the same line
        if (open >= 0 && close > 0) {
            if (open < close) {
                // protect literal occurrences of the marker prefix in the line itself
                final String escapeText = result.substring(open + 2, close).replace("!esc!", "!esc!!");
                final String before = result.substring(0, open).replace("!esc!", "!esc!!");
                final String after = result.substring(close + 2).replace("!esc!", "!esc!!");
                result = transformLine(before + "!esc!txt!" + after);
                // literal replacement: escapeText is user text and may contain '$' or '\'
                result = result.replace("!esc!txt!", escapeText);
                result = result.replace("!esc!!", "!esc!");
            }
            //handles cases like [=[= =]=] [= =] that would cause an exception otherwise
            else {
                escape = true;
                // NOTE(review): the prefix ends at open-1, so the character directly
                // before "[=" is dropped -- confirm this is intended.
                final String temp1 = transformLine(result.substring(0, open - 1).replace("!tmp!", "!tmp!!") + "!tmp!txt!");
                nolist = true;
                final String temp2 = transformLine(result.substring(open));
                nolist = false;
                // literal replacement: temp2 is transformed user text
                result = temp1.replace("!tmp!txt!", temp2);
                result = result.replace("!tmp!!", "!tmp!");
                escape = false;
            }
        }
        //start [=
        else if (open >= 0 && !escapeSpan) {
            escape = true;    //prevent surplus line breaks
            escaped = true;   //prevents <pre> being parsed
            final StringBuilder bq = new StringBuilder();   //gets filled with <blockquote>s as needed
            final String escapeText = result.substring(open + 2).replace("!esc!", "!esc!!");
            //taking care of indented lines: one <blockquote> per leading ':'
            while (result.substring(escindented, open).startsWith(":")) {
                escindented++;
                bq.append("<blockquote>");
            }
            result = transformLine(result.substring(escindented, open).replace("!esc!", "!esc!!") + "!esc!txt!");
            result = bq.toString() + result.replace("!esc!txt!", escapeText);
            result = result.replace("!esc!!", "!esc!");
            escape = false;
            escapeSpan = true;
        }
        //end =]
        else if (close >= 0 && escapeSpan) {
            escapeSpan = false;
            final StringBuilder bq = new StringBuilder(); //gets filled with </blockquote>s as needed
            final String escapeText = result.substring(0, close).replace("!esc!", "!esc!!");
            //taking care of indented lines: close every blockquote opened at span start
            while (escindented > 0) {
                bq.append("</blockquote>");
                escindented--;
            }
            result = transformLine("!esc!txt!" + result.substring(close + 2).replace("!esc!", "!esc!!"));
            result = result.replace("!esc!txt!", escapeText) + bq.toString();
            result = result.replace("!esc!!", "!esc!");
            escaped = false;
        }
        //Getting rid of surplus =]
        else if (close >= 0) {
            // removed one at a time and re-searched, so a "=]" formed by a removal is removed as well
            int p = close;
            while (p >= 0) {
                result = result.substring(0, p) + result.substring(p + 2);
                p = result.indexOf("=]");
            }
            result = transformLine(result);
        }
        return result;
    }
    /** This method handles the preformatted tags <pre> </pre> */
    //contributed by [MN]
    private String preformattedTag(String result){
@ -757,18 +683,10 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
            replacedHTML = true;
        }
        //check if line contains escape symbols([= =]) or if we are in an escape sequence already.
        if ((result.indexOf("[=")>=0)||(result.indexOf("=]")>=0)||(escapeSpan)){
            result = escapeTag(result);
        }
        //check if line contains preformatted symbols or if we are in a preformatted sequence already.
-        else if ((result.indexOf("&lt;pre&gt;")>=0)||(result.indexOf("&lt;/pre&gt;")>=0)||(preformattedSpan)){
+        if ((result.indexOf("&lt;pre&gt;")>=0)||(result.indexOf("&lt;/pre&gt;")>=0)||(preformattedSpan)){
            result = preformattedTag(result);
-        }
+        } else {
        //transform page as usual
        else {
            //tables first -> wiki-tags in cells can be treated after that
            result = processTable(result);
--- a/source/de/anomic/tools/mediawikiIndex.java
+++ b/source/de/anomic/tools/mediawikiIndex.java
@ -50,6 +50,8 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import de.anomic.data.wiki.wikiCode;
 import de.anomic.data.wiki.wikiParser;
 import de.anomic.kelondro.util.ByteBuffer;
 /*
@ -59,6 +61,8 @@ import de.anomic.kelondro.util.ByteBuffer;
 public class mediawikiIndex {
    private static final String textstart = "<text";
    private static final String textend = "</text>";
    private static final String pagestart = "<page>";
    private static final String pageend = "</page>";
    private static final byte[] pagestartb = pagestart.getBytes();
@ -388,28 +392,38 @@ public class mediawikiIndex {
                BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is));
                String t;
                StringBuffer sb = new StringBuffer();
-                boolean read = false;
+                boolean page = false, text = false;
                String title = null;
                wikiParser wparser = new wikiCode("de.wikipedia.org");
                //plasmaParser hparser = new plasmaParser();
                while ((t = r.readLine()) != null) {
                    if (t.indexOf(pagestart) >= 0) {
-                        read = true;
+                        page = true;
                        continue;
                    }
-                    if (t.indexOf(pageend) >= 0) {
+                    if (t.indexOf(textstart) >= 0) {
-                        read = false;
+                        text = page;
                        continue;
                    }
                    if (t.indexOf(textend) >= 0) {
                        text = false;
                        System.out.println("Title: " + title);
-                        System.out.println(sb);
+                        System.out.println(wparser.transform(sb.toString()));
                        System.out.println();
                        sb.setLength(0);
                        continue;
                    }
                    if (t.indexOf(pageend) >= 0) {
                        page = false;
                        continue;
                    }
                    if (t.indexOf("<title>") >= 0) {
                        title = t.substring(t.indexOf("<title>") + 7);
                        int p = title.indexOf("</title>");
                        if (p >= 0) title = title.substring(0, p);
                        continue;
                    }
-                    if (read) {
+                    if (text) {
                        sb.append(t);
                        sb.append('\n');
                    }