|
|
@ -85,8 +85,8 @@ public class mediawikiIndex {
|
|
|
|
wparser = new wikiCode(u.getHost());
|
|
|
|
wparser = new wikiCode(u.getHost());
|
|
|
|
hparser = new plasmaParser();
|
|
|
|
hparser = new plasmaParser();
|
|
|
|
// must be called before usage:
|
|
|
|
// must be called before usage:
|
|
|
|
//plasmaParser.initHTMLParsableMimeTypes("text/html");
|
|
|
|
plasmaParser.initHTMLParsableMimeTypes("text/html");
|
|
|
|
//plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
|
|
|
|
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public static void checkIndex(File wikimediaxml) {
|
|
|
|
public static void checkIndex(File wikimediaxml) {
|
|
|
@ -309,9 +309,13 @@ public class mediawikiIndex {
|
|
|
|
this.title = title;
|
|
|
|
this.title = title;
|
|
|
|
this.source = sb;
|
|
|
|
this.source = sb;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
public void genHTML() throws MalformedURLException {
|
|
|
|
public void genHTML() throws IOException {
|
|
|
|
html = wparser.transform(source.toString());
|
|
|
|
try {
|
|
|
|
url = new yacyURL("http://de.wikipedia.org/wiki/" + title, null);
|
|
|
|
html = wparser.transform(source.toString());
|
|
|
|
|
|
|
|
url = new yacyURL("http://de.wikipedia.org/wiki/" + title, null);
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
|
|
|
throw new IOException(e.getMessage());
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
public void genDocument() throws InterruptedException, ParserException {
|
|
|
|
public void genDocument() throws InterruptedException, ParserException {
|
|
|
|
document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes());
|
|
|
|
document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes());
|
|
|
@ -444,7 +448,7 @@ public class mediawikiIndex {
|
|
|
|
out.put(record);
|
|
|
|
out.put(record);
|
|
|
|
} catch (RuntimeException e) {
|
|
|
|
} catch (RuntimeException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
e.printStackTrace();
|
|
|
|
} catch (MalformedURLException e) {
|
|
|
|
} catch (IOException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
e.printStackTrace();
|
|
|
|
} catch (ParserException e) {
|
|
|
|
} catch (ParserException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
e.printStackTrace();
|
|
|
@ -500,7 +504,6 @@ public class mediawikiIndex {
|
|
|
|
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
|
|
|
|
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
|
|
|
|
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
|
|
|
|
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
System.out.println("[CONSUME] Title: " + record.title);
|
|
|
|
System.out.println("[CONSUME] Title: " + record.title);
|
|
|
|
record.document.writeXML(osw, new Date());
|
|
|
|
record.document.writeXML(osw, new Date());
|
|
|
|
rc++;
|
|
|
|
rc++;
|
|
|
@ -562,12 +565,12 @@ public class mediawikiIndex {
|
|
|
|
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
|
|
|
|
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
|
|
|
|
mediawikiIndex mi = new mediawikiIndex(urlStub);
|
|
|
|
mediawikiIndex mi = new mediawikiIndex(urlStub);
|
|
|
|
wikiparserrecord poison = mi.newRecord();
|
|
|
|
wikiparserrecord poison = mi.newRecord();
|
|
|
|
int threads = Math.max(1, Runtime.getRuntime().availableProcessors() - 1);
|
|
|
|
int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
|
|
|
|
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
|
|
|
|
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
|
|
|
|
BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
|
|
|
|
BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
|
|
|
|
ExecutorService service = Executors.newFixedThreadPool(threads + 1);
|
|
|
|
ExecutorService service = Executors.newFixedThreadPool(threads + 1);
|
|
|
|
convertConsumer[] consumers = new convertConsumer[threads];
|
|
|
|
convertConsumer[] consumers = new convertConsumer[threads];
|
|
|
|
Future<Integer>[] consumerResults = new Future[threads];
|
|
|
|
Future<?>[] consumerResults = new Future[threads];
|
|
|
|
for (int i = 0; i < threads; i++) {
|
|
|
|
for (int i = 0; i < threads; i++) {
|
|
|
|
consumers[i] = new convertConsumer(in, out, poison);
|
|
|
|
consumers[i] = new convertConsumer(in, out, poison);
|
|
|
|
consumerResults[i] = service.submit(consumers[i]);
|
|
|
|
consumerResults[i] = service.submit(consumers[i]);
|
|
|
|