fix for mediawiki importer and wikicode parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7651 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent c5352e6872
commit 01690eab86

@ -31,6 +31,7 @@ import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
@ -45,7 +46,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
private static final String EMPTY = ""; private static final String EMPTY = "";
private static final String PIPE_ESCAPED = "|"; private static final String PIPE_ESCAPED = "|";
private static final String REGEX_NOT_CHAR_NUM_OR_UNDERSCORE = "[^a-zA-Z0-9_]"; private static final Pattern REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN = Pattern.compile("[^a-zA-Z0-9_]");
private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
private static enum Tags { private static enum Tags {
HEADLINE_1("=", "<h1>", "</h1>"), HEADLINE_1("=", "<h1>", "</h1>"),
@ -131,8 +133,6 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
private static final int LEN_WIKI_HR_LINE = WIKI_HR_LINE.length(); private static final int LEN_WIKI_HR_LINE = WIKI_HR_LINE.length();
private static final int LEN_PIPE_ESCAPED = PIPE_ESCAPED.length(); private static final int LEN_PIPE_ESCAPED = PIPE_ESCAPED.length();
private final TableOfContent tableOfContent = new TableOfContent();
/** List of properties which can be used in tables. */ /** List of properties which can be used in tables. */
private final static String[] TABLE_PROPERTIES = {"rowspan", "colspan", "vspace", "hspace", "cellspacing", "cellpadding", "border"}; private final static String[] TABLE_PROPERTIES = {"rowspan", "colspan", "vspace", "hspace", "cellspacing", "cellpadding", "border"};
@ -150,21 +150,6 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
private final static char[] HEADLINE_LEVEL = new char[]{ONE, TWO, THREE, FOUR, FIVE, SIX}; private final static char[] HEADLINE_LEVEL = new char[]{ONE, TWO, THREE, FOUR, FIVE, SIX};
private String orderedListLevel = EMPTY;
private String unorderedListLevel = EMPTY;
private String defListLevel = EMPTY;
private boolean processingCell = false; //needed for prevention of double-execution of replaceHTML
private boolean processingDefList = false; //needed for definition lists
private boolean escape = false; //needed for escape
private boolean escaped = false; //needed for <pre> not getting in the way
private boolean newRowStart = false; //needed for the first row not to be empty
private boolean noList = false; //needed for handling of [= and <pre> in lists
private boolean processingPreformattedText = false; //needed for preformatted text
private boolean preformattedSpanning = false; //needed for <pre> and </pre> spanning over several lines
private boolean replacedHtmlAlready = false; //indicates if method replaceHTML has been used with line already
private boolean processingTable = false; //needed for tables, because they reach over several lines
private int preindented = 0; //needed for indented <pre>s
static { static {
/* Arrays must be sorted since Arrays.binarySearch() is used later. For more info go to /* Arrays must be sorted since Arrays.binarySearch() is used later. For more info go to
* http://java.sun.com/javase/6/docs/api/java/util/Arrays.html#binarySearch(T[], T, java.util.Comparator) * http://java.sun.com/javase/6/docs/api/java/util/Arrays.html#binarySearch(T[], T, java.util.Comparator)
@ -187,6 +172,24 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
ORDERED, UNORDERED; ORDERED, UNORDERED;
} }
private String orderedListLevel = EMPTY;
private String unorderedListLevel = EMPTY;
private String defListLevel = EMPTY;
private boolean processingCell = false; //needed for prevention of double-execution of replaceHTML
private boolean processingDefList = false; //needed for definition lists
private boolean escape = false; //needed for escape
private boolean escaped = false; //needed for <pre> not getting in the way
private boolean newRowStart = false; //needed for the first row not to be empty
private boolean noList = false; //needed for handling of [= and <pre> in lists
private boolean processingPreformattedText = false; //needed for preformatted text
private boolean preformattedSpanning = false; //needed for <pre> and </pre> spanning over several lines
private boolean replacedHtmlAlready = false; //indicates if method replaceHTML has been used with line already
private boolean processingTable = false; //needed for tables, because they reach over several lines
private int preindented = 0; //needed for indented <pre>s
private final TableOfContent tableOfContent = new TableOfContent();
/** /**
* Constructor * Constructor
* @param address * @param address
@ -739,8 +742,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
if (d == null || d.isEmpty()) { if (d == null || d.isEmpty()) {
continue; continue;
} }
final String a = d.substring(1).replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY); final String a = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(d.substring(1)).replaceAll("_")).replaceAll(EMPTY);
final String b = element.substring(1).replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY); final String b = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(element.substring(1)).replaceAll("_")).replaceAll(EMPTY);
if (a.equals(b)) { if (a.equals(b)) {
doubles++; doubles++;
} }
@ -842,8 +845,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
throw new IllegalArgumentException("illegal headline level: " + l); throw new IllegalArgumentException("illegal headline level: " + l);
} }
} }
directory.append(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(temp).replaceAll("_")).replaceAll(EMPTY));
directory.append(temp.replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY));
directory.append(anchorext); directory.append(anchorext);
directory.append("\" class=\"WikiTOC\">"); directory.append("\" class=\"WikiTOC\">");
directory.append(element); directory.append(element);
@ -889,7 +891,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
doubles++; doubles++;
} }
} }
String anchor = direlem.replaceAll(" ", "_").replaceAll(REGEX_NOT_CHAR_NUM_OR_UNDERSCORE, EMPTY); //replace blanks with underscores and delete everything thats not a regular character, a number or _ String anchor = REGEX_NOT_CHAR_NUM_OR_UNDERSCORE_PATTERN.matcher(SPACE_PATTERN.matcher(direlem).replaceAll("_")).replaceAll(EMPTY);; //replace blanks with underscores and delete everything thats not a regular character, a number or _
//if there are doubles, add underscore and number of doubles plus one //if there are doubles, add underscore and number of doubles plus one
if (doubles > 0) { if (doubles > 0) {
anchor = anchor + "_" + (doubles + 1); anchor = anchor + "_" + (doubles + 1);
@ -907,11 +909,15 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
+ input.substring(secondPosition + tags.closeWikiLength); + input.substring(secondPosition + tags.closeWikiLength);
} }
} }
return input;
// commented out the following lines because they caused an endless recursion here
/*
//recursion if another pair of the pattern can still be found in the line //recursion if another pair of the pattern can still be found in the line
if (((firstPosition = input.indexOf(tags.openWiki)) >= 0) && (input.indexOf(tags.closeWiki, firstPosition + tags.openWikiLength) >= 0)) { if (((firstPosition = input.indexOf(tags.openWiki)) >= 0) && (input.indexOf(tags.closeWiki, firstPosition + tags.openWikiLength) >= 0)) {
input = tagReplace(input, tags); input = tagReplace(input, tags);
} }
return input; return input;
*/
} }
/** Replaces wiki tags with HTML tags in one line of text. /** Replaces wiki tags with HTML tags in one line of text.

@ -80,7 +80,6 @@ public class MediawikiImporter extends Thread implements Importer {
public static Importer job; // if started from a servlet, this object is used to store the thread public static Importer job; // if started from a servlet, this object is used to store the thread
protected WikiParser wparser;
public File sourcefile; public File sourcefile;
public File targetdir; public File targetdir;
public int count; public int count;
@ -95,7 +94,6 @@ public class MediawikiImporter extends Thread implements Importer {
this.docsize = sourcefile.length(); this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L); this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir; this.targetdir = targetdir;
this.wparser = new WikiCode();
this.count = 0; this.count = 0;
this.start = 0; this.start = 0;
this.hostport = null; this.hostport = null;
@ -496,6 +494,7 @@ public class MediawikiImporter extends Thread implements Importer {
} }
public void genHTML() throws IOException { public void genHTML() throws IOException {
try { try {
WikiParser wparser = new WikiCode();
html = wparser.transform(hostport, source); html = wparser.transform(hostport, source);
} catch (Exception e) { } catch (Exception e) {
Log.logException(e); Log.logException(e);

Loading…
Cancel
Save