diff --git a/source/de/anomic/data/wiki/WikiCode.java b/source/de/anomic/data/wiki/WikiCode.java index c352e75d8..194304141 100644 --- a/source/de/anomic/data/wiki/WikiCode.java +++ b/source/de/anomic/data/wiki/WikiCode.java @@ -31,6 +31,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.regex.Pattern; import net.yacy.document.parser.html.CharacterCoding; @@ -45,7 +46,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { private static final String EMPTY = ""; private static final String PIPE_ESCAPED = "|"; - private static final String REGEX_NOT_CHAR_NUM_OR_UNDERSCORE = "[^a-zA-Z0-9_]"; + private static final Pattern REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN = Pattern.compile("[^a-zA-Z0-9_]"); + private static final Pattern SPACE_PATTERN = Pattern.compile(" "); private static enum Tags { HEADLINE_1("=", "
not getting in the way - private boolean newRowStart = false; //needed for the first row not to be empty - private boolean noList = false; //needed for handling of [= and <pre> in lists - private boolean processingPreformattedText = false; //needed for preformatted text - private boolean preformattedSpanning = false; //needed for <pre> and </pre> spanning over several lines - private boolean replacedHtmlAlready = false; //indicates if method replaceHTML has been used with line already - private boolean processingTable = false; //needed for tables, because they reach over several lines - private int preindented = 0; //needed for indented <pre>s - static { /* Arrays must be sorted since Arrays.searchBinary() is used later. For more info go to * http://java.sun.com/javase/6/docs/api/java/util/Arrays.html#binarySearch(T[], T, java.util.Comparator) @@ -187,6 +172,24 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { ORDERED, UNORDERED; } + + private String orderedListLevel = EMPTY; + private String unorderedListLevel = EMPTY; + private String defListLevel = EMPTY; + private boolean processingCell = false; //needed for prevention of double-execution of replaceHTML + private boolean processingDefList = false; //needed for definition lists + private boolean escape = false; //needed for escape + private boolean escaped = false; //needed for <pre> not getting in the way + private boolean newRowStart = false; //needed for the first row not to be empty + private boolean noList = false; //needed for handling of [= and <pre> in lists + private boolean processingPreformattedText = false; //needed for preformatted text + private boolean preformattedSpanning = false; //needed for <pre> and </pre> spanning over several lines + private boolean replacedHtmlAlready = false; //indicates if method replaceHTML has been used with line already + private boolean processingTable = false; //needed for tables, because they reach over several lines + private int preindented = 0; //needed for indented <pre>s + + private final 
TableOfContent tableOfContent = new TableOfContent(); + /** * Constructor * @param address @@ -739,8 +742,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { if (d == null || d.isEmpty()) { continue; } - final String a = d.substring(1).replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY); - final String b = element.substring(1).replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY); + final String a = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(d.substring(1)).replaceAll("_")).replaceAll(EMPTY); + final String b = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(element.substring(1)).replaceAll("_")).replaceAll(EMPTY); if (a.equals(b)) { doubles++; } @@ -842,8 +845,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { throw new IllegalArgumentException("illegal headline level: " + l); } } - - directory.append(temp.replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY)); + directory.append(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(temp).replaceAll("_")).replaceAll(EMPTY)); directory.append(anchorext); directory.append("\" class=\"WikiTOC\">"); directory.append(element); @@ -889,7 +891,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { doubles++; } } - String anchor = direlem.replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY); //replace blanks with underscores and delete everything thats not a regular character, a number or _ + String anchor = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(direlem).replaceAll("_")).replaceAll(EMPTY);; //replace blanks with underscores and delete everything thats not a regular character, a number or _ //if there are doubles, add underscore and number of doubles plus one if (doubles > 0) { anchor = anchor + "_" + (doubles + 1); @@ -907,11 +909,15 @@ public class WikiCode extends 
AbstractWikiParser implements WikiParser { + input.substring(secondPosition + tags.closeWikiLength); } } + return input; + // commented out the following lines because they caused an endless recursion here + /* //recursion if another pair of the pattern can still be found in the line if (((firstPosition = input.indexOf(tags.openWiki)) >= 0) && (input.indexOf(tags.closeWiki, firstPosition + tags.openWikiLength) >= 0)) { input = tagReplace(input, tags); } return input; + */ } /** Replaces wiki tags with HTML tags in one line of text. diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index 8fc1719e3..2112733e0 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -80,7 +80,6 @@ public class MediawikiImporter extends Thread implements Importer { public static Importer job; // if started from a servlet, this object is used to store the thread - protected WikiParser wparser; public File sourcefile; public File targetdir; public int count; @@ -95,7 +94,6 @@ public class MediawikiImporter extends Thread implements Importer { this.docsize = sourcefile.length(); this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L); this.targetdir = targetdir; - this.wparser = new WikiCode(); this.count = 0; this.start = 0; this.hostport = null; @@ -496,6 +494,7 @@ public class MediawikiImporter extends Thread implements Importer { } public void genHTML() throws IOException { try { + WikiParser wparser = new WikiCode(); html = wparser.transform(hostport, source); } catch (Exception e) { Log.logException(e);