fix for mediawiki importer and wikicode parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7651 6c8d7289-2bf4-0310-a012-ef5d649a1542
14 years ago · 01690eab86
parent c5352e6872
commit 01690eab86
2 changed files with 30 additions and 25 deletions
--- a/source/de/anomic/data/wiki/WikiCode.java
+++ b/source/de/anomic/data/wiki/WikiCode.java
@@ -31,6 +31,7 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+ import java.util.regex.Pattern;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -45,7 +46,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
    private static final String EMPTY = "";
    private static final String PIPE_ESCAPED = "&#124;";
-    private static final String REGEX_NOT_CHAR_NUM_OR_UNDERSCORE = "[^a-zA-Z0-9_]";
+    private static final Pattern REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN = Pattern.compile("[^a-zA-Z0-9_]");
+    private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
    private static enum Tags {
        HEADLINE_1("=", "<h1>", "</h1>"),
@@ -131,8 +133,6 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
    private static final int LEN_WIKI_HR_LINE = WIKI_HR_LINE.length();
    private static final int LEN_PIPE_ESCAPED = PIPE_ESCAPED.length();
-    private final TableOfContent tableOfContent = new TableOfContent();
    /** List of properties which can be used in tables. */
    private final static String[] TABLE_PROPERTIES = {"rowspan", "colspan", "vspace", "hspace", "cellspacing", "cellpadding", "border"};
@@ -150,21 +150,6 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
    private final static char[] HEADLINE_LEVEL = new char[]{ONE, TWO, THREE, FOUR, FIVE, SIX};
-    private String orderedListLevel = EMPTY;
-    private String unorderedListLevel = EMPTY;
-    private String defListLevel = EMPTY;
-    private boolean processingCell = false;             //needed for prevention of double-execution of replaceHTML
-    private boolean processingDefList = false;          //needed for definition lists
-    private boolean escape = false;                     //needed for escape
-    private boolean escaped = false;                    //needed for <pre> not getting in the way
-    private boolean newRowStart = false;                //needed for the first row not to be empty
-    private boolean noList = false;                     //needed for handling of [= and <pre> in lists
-    private boolean processingPreformattedText = false; //needed for preformatted text
-    private boolean preformattedSpanning = false;       //needed for <pre> and </pre> spanning over several lines
-    private boolean replacedHtmlAlready = false;        //indicates if method replaceHTML has been used with line already
-    private boolean processingTable = false;            //needed for tables, because they reach over several lines
-    private int preindented = 0;                        //needed for indented <pre>s
    static {
        /* Arrays must be sorted since Arrays.searchBinary() is used later. For more info go to
         * http://java.sun.com/javase/6/docs/api/java/util/Arrays.html#binarySearch(T[], T, java.util.Comparator)
@@ -187,6 +172,24 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
        ORDERED, UNORDERED;
    }
+    private String orderedListLevel = EMPTY;
+    private String unorderedListLevel = EMPTY;
+    private String defListLevel = EMPTY;
+    private boolean processingCell = false;             //needed for prevention of double-execution of replaceHTML
+    private boolean processingDefList = false;          //needed for definition lists
+    private boolean escape = false;                     //needed for escape
+    private boolean escaped = false;                    //needed for <pre> not getting in the way
+    private boolean newRowStart = false;                //needed for the first row not to be empty
+    private boolean noList = false;                     //needed for handling of [= and <pre> in lists
+    private boolean processingPreformattedText = false; //needed for preformatted text
+    private boolean preformattedSpanning = false;       //needed for <pre> and </pre> spanning over several lines
+    private boolean replacedHtmlAlready = false;        //indicates if method replaceHTML has been used with line already
+    private boolean processingTable = false;            //needed for tables, because they reach over several lines
+    private int preindented = 0;                        //needed for indented <pre>s
+    private final TableOfContent tableOfContent = new TableOfContent();
    /**
     * Constructor
     * @param address
@@ -739,8 +742,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                    if (d == null || d.isEmpty()) {
                        continue;
                    }
-                    final String a = d.substring(1).replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY);
+                    final String a = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(d.substring(1)).replaceAll("_")).replaceAll(EMPTY);
-                    final String b = element.substring(1).replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY);
+                    final String b = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(element.substring(1)).replaceAll("_")).replaceAll(EMPTY);
                    if (a.equals(b)) {
                        doubles++;
                    }
@@ -842,8 +845,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                            throw new IllegalArgumentException("illegal headline level: " + l);
                        }
                    }
-
+                    directory.append(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(temp).replaceAll("_")).replaceAll(EMPTY));
-                    directory.append(temp.replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY));
                    directory.append(anchorext);
                    directory.append("\" class=\"WikiTOC\">");
                    directory.append(element);
@@ -889,7 +891,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                            doubles++;
                        }
                    }
-                    String anchor = direlem.replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY); //replace blanks with underscores and delete everything thats not a regular character, a number or _
+                    String anchor = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(direlem).replaceAll("_")).replaceAll(EMPTY);; //replace blanks with underscores and delete everything thats not a regular character, a number or _
                    //if there are doubles, add underscore and number of doubles plus one
                    if (doubles > 0) {
                        anchor = anchor + "_" + (doubles + 1);
@@ -907,11 +909,15 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                        + input.substring(secondPosition + tags.closeWikiLength);
            }
        }
+        return input;
+        // commented out the following lines because they caused an endless recursion here
+        /*
        //recursion if another pair of the pattern can still be found in the line
        if (((firstPosition = input.indexOf(tags.openWiki)) >= 0) && (input.indexOf(tags.closeWiki, firstPosition + tags.openWikiLength) >= 0)) {
            input = tagReplace(input, tags);
        }
        return input;
+        */
    }
    /** Replaces wiki tags with HTML tags in one line of text.
--- a/source/net/yacy/document/importer/MediawikiImporter.java
+++ b/source/net/yacy/document/importer/MediawikiImporter.java
@@ -80,7 +80,6 @@ public class MediawikiImporter extends Thread implements Importer {
    public static Importer job; // if started from a servlet, this object is used to store the thread
-    protected WikiParser wparser;
    public    File sourcefile;
    public    File targetdir;
    public    int count;
@@ -95,7 +94,6 @@ public class MediawikiImporter extends Thread implements Importer {
    	this.docsize = sourcefile.length();
    	this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
    	this.targetdir = targetdir;
-        this.wparser = new WikiCode();
        this.count = 0;
        this.start = 0;
        this.hostport = null;
@@ -496,6 +494,7 @@ public class MediawikiImporter extends Thread implements Importer {
        }
        public void genHTML() throws IOException {
            try {
+                WikiParser wparser = new WikiCode();
                html = wparser.transform(hostport, source);
            } catch (Exception e) {
                Log.logException(e);