From 01690eab86020a00b825f35ccded132506fea525 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 13 Apr 2011 13:22:27 +0000 Subject: [PATCH] fix for mediawiki importer and wikicode parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7651 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/data/wiki/WikiCode.java | 52 +++++++++++-------- .../document/importer/MediawikiImporter.java | 3 +- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/source/de/anomic/data/wiki/WikiCode.java b/source/de/anomic/data/wiki/WikiCode.java index c352e75d8..194304141 100644 --- a/source/de/anomic/data/wiki/WikiCode.java +++ b/source/de/anomic/data/wiki/WikiCode.java @@ -31,6 +31,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.regex.Pattern; import net.yacy.document.parser.html.CharacterCoding; @@ -45,7 +46,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { private static final String EMPTY = ""; private static final String PIPE_ESCAPED = "|"; - private static final String REGEX_NOT_CHAR_NUM_OR_UNDERSCORE = "[^a-zA-Z0-9_]"; + private static final Pattern REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN = Pattern.compile("[^a-zA-Z0-9_]"); + private static final Pattern SPACE_PATTERN = Pattern.compile(" "); private static enum Tags { HEADLINE_1("=", "

", "

"), @@ -131,8 +133,6 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { private static final int LEN_WIKI_HR_LINE = WIKI_HR_LINE.length(); private static final int LEN_PIPE_ESCAPED = PIPE_ESCAPED.length(); - private final TableOfContent tableOfContent = new TableOfContent(); - /** List of properties which can be used in tables. */ private final static String[] TABLE_PROPERTIES = {"rowspan", "colspan", "vspace", "hspace", "cellspacing", "cellpadding", "border"}; @@ -150,21 +150,6 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { private final static char[] HEADLINE_LEVEL = new char[]{ONE, TWO, THREE, FOUR, FIVE, SIX}; - private String orderedListLevel = EMPTY; - private String unorderedListLevel = EMPTY; - private String defListLevel = EMPTY; - private boolean processingCell = false; //needed for prevention of double-execution of replaceHTML - private boolean processingDefList = false; //needed for definition lists - private boolean escape = false; //needed for escape - private boolean escaped = false; //needed for
 not getting in the way
-    private boolean newRowStart = false;                //needed for the first row not to be empty
-    private boolean noList = false;                     //needed for handling of [= and <pre> in lists
-    private boolean processingPreformattedText = false; //needed for preformatted text
-    private boolean preformattedSpanning = false;       //needed for <pre> and </pre> spanning over several lines
-    private boolean replacedHtmlAlready = false;        //indicates if method replaceHTML has been used with line already
-    private boolean processingTable = false;            //needed for tables, because they reach over several lines
-    private int preindented = 0;                        //needed for indented <pre>s
-
     static {
         /* Arrays must be sorted since Arrays.searchBinary() is used later. For more info go to
          * http://java.sun.com/javase/6/docs/api/java/util/Arrays.html#binarySearch(T[], T, java.util.Comparator)
@@ -187,6 +172,24 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
         ORDERED, UNORDERED;
     }
 
+    
+    private String orderedListLevel = EMPTY;
+    private String unorderedListLevel = EMPTY;
+    private String defListLevel = EMPTY;
+    private boolean processingCell = false;             //needed for prevention of double-execution of replaceHTML
+    private boolean processingDefList = false;          //needed for definition lists
+    private boolean escape = false;                     //needed for escape
+    private boolean escaped = false;                    //needed for <pre> not getting in the way
+    private boolean newRowStart = false;                //needed for the first row not to be empty
+    private boolean noList = false;                     //needed for handling of [= and <pre> in lists
+    private boolean processingPreformattedText = false; //needed for preformatted text
+    private boolean preformattedSpanning = false;       //needed for <pre> and </pre> spanning over several lines
+    private boolean replacedHtmlAlready = false;        //indicates if method replaceHTML has been used with line already
+    private boolean processingTable = false;            //needed for tables, because they reach over several lines
+    private int preindented = 0;                        //needed for indented <pre>s
+
+    private final TableOfContent tableOfContent = new TableOfContent();
+    
     /**
      * Constructor
      * @param address
@@ -739,8 +742,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                     if (d == null || d.isEmpty()) {
                         continue;
                     }
-                    final String a = d.substring(1).replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY);
-                    final String b = element.substring(1).replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY);
+                    final String a = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(d.substring(1)).replaceAll("_")).replaceAll(EMPTY);
+                    final String b = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(element.substring(1)).replaceAll("_")).replaceAll(EMPTY);
                     if (a.equals(b)) {
                         doubles++;
                     }
@@ -842,8 +845,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                             throw new IllegalArgumentException("illegal headline level: " + l);
                         }
                     }
-
-                    directory.append(temp.replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY));
+                    directory.append(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(temp).replaceAll("_")).replaceAll(EMPTY));
                     directory.append(anchorext);
                     directory.append("\" class=\"WikiTOC\">");
                     directory.append(element);
@@ -889,7 +891,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                             doubles++;
                         }
                     }
-                    String anchor = direlem.replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY); //replace blanks with underscores and delete everything thats not a regular character, a number or _
+                    String anchor = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(direlem).replaceAll("_")).replaceAll(EMPTY);; //replace blanks with underscores and delete everything thats not a regular character, a number or _
                     //if there are doubles, add underscore and number of doubles plus one
                     if (doubles > 0) {
                         anchor = anchor + "_" + (doubles + 1);
@@ -907,11 +909,15 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                         + input.substring(secondPosition + tags.closeWikiLength);
             }
         }
+        return input;
+        // commented out the following lines because they caused an endless recursion here
+        /*
         //recursion if another pair of the pattern can still be found in the line
         if (((firstPosition = input.indexOf(tags.openWiki)) >= 0) && (input.indexOf(tags.closeWiki, firstPosition + tags.openWikiLength) >= 0)) {
             input = tagReplace(input, tags);
         }
         return input;
+        */
     }
 
     /** Replaces wiki tags with HTML tags in one line of text.
diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java
index 8fc1719e3..2112733e0 100644
--- a/source/net/yacy/document/importer/MediawikiImporter.java
+++ b/source/net/yacy/document/importer/MediawikiImporter.java
@@ -80,7 +80,6 @@ public class MediawikiImporter extends Thread implements Importer {
     
     public static Importer job; // if started from a servlet, this object is used to store the thread
     
-    protected WikiParser wparser;
     public    File sourcefile;
     public    File targetdir;
     public    int count;
@@ -95,7 +94,6 @@ public class MediawikiImporter extends Thread implements Importer {
     	this.docsize = sourcefile.length();
     	this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
     	this.targetdir = targetdir;
-        this.wparser = new WikiCode();
         this.count = 0;
         this.start = 0;
         this.hostport = null;
@@ -496,6 +494,7 @@ public class MediawikiImporter extends Thread implements Importer {
         }
         public void genHTML() throws IOException {
             try {
+                WikiParser wparser = new WikiCode();
                 html = wparser.transform(hostport, source);
             } catch (Exception e) {
                 Log.logException(e);