fixes for wiki parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5905 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 3a64c9d02f
commit 9c6ac43f66

@ -283,15 +283,18 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
int i = ListLevel.length();
String tmp = "";
while(! result.startsWith(ListLevel.substring(0,i))){
while (ListLevel.length() >= i && !result.startsWith(ListLevel.substring(0,i))) {
tmp += "</ul>";
i--;
}
ListLevel = ListLevel.substring(0,i);
p0 = ListLevel.length();
if (i < p0) {
ListLevel = ListLevel.substring(0,i);
p0 = ListLevel.length();
}
p1 = result.length();
if(ListLevel.length() > 0){
if (ListLevel.length() > 0) {
result = tmp +
"<li>" +
result.substring(p0, p1) +
@ -551,15 +554,19 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
int level3 = 0;
int doubles = 0;
String anchorext = "";
if((s=dirElements.size())>2){
for(int i=0;i<s;i++){
if ((s = dirElements.size()) > 2) {
for (int i = 0; i < s; i++) {
if (i >= dirElements.size()) break;
element = dirElements.get(i);
//counting double headlines
doubles = 0;
for(int j=0;j<i;j++){
if(dirElements.get(j).substring(1).replaceAll(" ","_").replaceAll("[^a-zA-Z0-9_]","").equals(element.substring(1).replaceAll(" ","_").replaceAll("[^a-zA-Z0-9_]",""))){
doubles++;
}
for (int j = 0; j < i; j++) {
if (j >= dirElements.size()) break;
String d = dirElements.get(j);
if (d == null || d.length() < 1) continue;
String a = d.substring(1).replaceAll(" ","_").replaceAll("[^a-zA-Z0-9_]","");
String b = element.substring(1).replaceAll(" ","_").replaceAll("[^a-zA-Z0-9_]","");
if (a.equals(b)) doubles++;
}
//if there are doubles, create anchorextension
if(doubles>0){
@ -635,6 +642,7 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
//counting double headlines
int doubles = 0;
for(int i=0;i<dirElements.size();i++){
if (dirElements.get(i) == null) continue;
if(dirElements.size() > i && dirElements.get(i).substring(1).equals(direlem)){
doubles++;
}

@ -44,6 +44,7 @@ import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
@ -79,10 +80,11 @@ public class mediawikiIndex {
private wikiParser wparser;
private plasmaParser hparser;
private String urlStub;
public mediawikiIndex(String baseURL) throws MalformedURLException {
yacyURL u = new yacyURL(baseURL, null);
wparser = new wikiCode(u.getHost());
urlStub = baseURL;
wparser = new wikiCode(new URL(baseURL).getHost());
hparser = new plasmaParser();
// must be called before usage:
plasmaParser.initHTMLParsableMimeTypes("text/html");
@ -301,24 +303,32 @@ public class mediawikiIndex {
public class wikiparserrecord {
public String title;
StringBuilder source;
String source;
String html;
yacyURL url;
plasmaParserDocument document;
/**
 * Creates a record for one wiki page from its title and source text.
 *
 * @param title page title, stored unchanged in {@code title}
 * @param sb    buffer holding the page source; the last assignment below accepts {@code null}
 */
public wikiparserrecord(String title, StringBuilder sb) {
this.title = title;
// NOTE(review): the two assignments to source below look like the before/after lines of a
// change shown without +/- markers; presumably only the null-safe toString() form is current -- confirm.
this.source = sb;
this.source = (sb == null) ? null : sb.toString();
}
/**
 * Runs the record's source through {@code wparser.transform} and stores the result in {@code html}.
 *
 * @throws IOException for any exception raised inside the try block; only the message of the
 *                     original exception is passed on
 */
public void genHTML() throws IOException {
try {
// NOTE(review): the next two lines (source.toString() and the hard-coded de.wikipedia.org URL)
// look like pre-change lines from a diff shown without +/- markers -- confirm before relying on them.
html = wparser.transform(source.toString());
url = new yacyURL("http://de.wikipedia.org/wiki/" + title, null);
html = wparser.transform(source);
} catch (Exception e) {
e.printStackTrace();
// the caught exception is not chained as the cause; only its message reaches the caller
throw new IOException(e.getMessage());
}
}
/**
 * Builds {@code url} from {@code urlStub + title} and parses {@code html} as UTF-8 "text/html"
 * via {@code hparser.parseSource}, storing the result in {@code document}.
 *
 * UnsupportedEncodingException and MalformedURLException are caught here, printed, and not rethrown.
 *
 * @throws InterruptedException presumably propagated from parseSource -- confirm
 * @throws ParserException      presumably propagated from parseSource -- confirm
 */
public void genDocument() throws InterruptedException, ParserException {
// NOTE(review): this first parseSource call (getBytes() without a charset argument) looks like the
// pre-change line of a diff shown without +/- markers; the try block below appears to replace it -- confirm.
document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes());
try {
url = new yacyURL(urlStub + title, null);
// bytes are produced with an explicit "UTF-8" charset name, matching the "utf-8" passed to parseSource
document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (MalformedURLException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
}
public void writeXML(OutputStreamWriter os) throws IOException {
document.writeXML(os, new Date());
@ -448,11 +458,12 @@ public class mediawikiIndex {
out.put(record);
} catch (RuntimeException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ParserException e) {
e.printStackTrace();
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
} catch (InterruptedException e) {
e.printStackTrace();
@ -591,6 +602,10 @@ public class mediawikiIndex {
if (t.indexOf(textend) >= 0) {
text = false;
System.out.println("[INJECT] Title: " + title);
if (sb.length() == 0) {
System.out.println("ERROR: " + title + " has empty content");
continue;
}
record = mi.newRecord(title, sb);
try {
in.put(record);

Loading…
Cancel
Save