Some concurrency for the Wikipedia dump reader

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5855 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent dec495ac78
commit 1b9e532c87

@@ -292,6 +292,9 @@ public class mediawikiIndex {
this.end = end; this.end = end;
} }
} }
public wikiparserrecord newRecord() {
// Zero-argument factory: builds an empty record (no title, no text).
// Used below in convert() as the poison sentinel that shuts down the
// consumer/writer queue stages.
return new wikiparserrecord(null, null);
}
public wikiparserrecord newRecord(String title, StringBuilder sb) { public wikiparserrecord newRecord(String title, StringBuilder sb) {
return new wikiparserrecord(title, sb); return new wikiparserrecord(title, sb);
} }
@@ -415,29 +418,128 @@ public class mediawikiIndex {
return null; return null;
} }
public static void main(String[] s) { private static class convertConsumer implements Callable<Integer> {
if (s.length == 0) {
System.out.println("usage:"); private BlockingQueue<wikiparserrecord> in, out;
System.out.println(" -index <wikipedia-dump>"); private wikiparserrecord poison;
System.out.println(" -read <start> <len> <idx-file>");
System.out.println(" -find <title> <wikipedia-dump>"); public convertConsumer(BlockingQueue<wikiparserrecord> in, BlockingQueue<wikiparserrecord> out, wikiparserrecord poison) {
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>"); this.poison = poison;
System.exit(0); this.in = in;
this.out = out;
} }
// example: public Integer call() {
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/ wikiparserrecord record;
try {
while(true) {
record = in.take();
if (record == poison) {
System.out.println("convertConsumer / got poison");
break;
}
try {
record.genHTML();
record.genDocument();
} catch (RuntimeException e) {
e.printStackTrace();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (ParserException e) {
e.printStackTrace();
}
out.put(record);
}
} catch (InterruptedException e) {
e.printStackTrace();
}
return Integer.valueOf(0);
}
if (s[0].equals("-convert") && s.length > 2 && s[1].endsWith(".xml.bz2") && s[3].startsWith("http://")) { }
File sourcefile = new File(s[1]);
File targetdir = new File(s[2]); private static class convertWriter implements Callable<Integer> {
private BlockingQueue<wikiparserrecord> in;
private wikiparserrecord poison;
private OutputStreamWriter osw;
private String targetstub;
private File targetdir;
private int fc, rc;
private String outputfilename;
public convertWriter(
BlockingQueue<wikiparserrecord> in,
wikiparserrecord poison,
File targetdir,
String targetstub) {
this.poison = poison;
this.in = in;
this.osw = null;
this.targetdir = targetdir;
this.targetstub = targetstub;
this.fc = 0;
this.rc = 0;
this.outputfilename = null;
}
public Integer call() {
wikiparserrecord record;
try {
while(true) {
record = in.take();
if (record == poison) {
System.out.println("convertConsumer / got poison");
break;
}
if (osw == null) {
// start writing a new file
this.outputfilename = targetstub + "." + fc + ".xml.tmp";
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
}
System.out.println("Title: " + record.title);
record.document.writeXML(osw, new Date());
rc++;
if (rc >= 10000) {
osw.write("</surrogates>\n");
osw.close();
String finalfilename = targetstub + "." + fc + ".xml";
new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
rc = 0;
fc++;
outputfilename = targetstub + "." + fc + ".xml.tmp";
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
}
osw.write("</surrogates>\n");
osw.close();
String finalfilename = targetstub + "." + fc + ".xml";
new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
}
} catch (InterruptedException e) {
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return Integer.valueOf(0);
}
}
public static void convert(File sourcefile, File targetdir, String urlStub) throws IOException {
String targetstub = sourcefile.getName(); String targetstub = sourcefile.getName();
targetstub = targetstub.substring(0, targetstub.length() - 8); targetstub = targetstub.substring(0, targetstub.length() - 8);
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
//String language = urlStub.substring(7,9);
try {
InputStream is = new FileInputStream(sourcefile); InputStream is = new FileInputStream(sourcefile);
if (s[1].endsWith(".bz2")) { if (sourcefile.getName().endsWith(".bz2")) {
int b = is.read(); int b = is.read();
if (b != 'B') throw new IOException("Invalid bz2 content."); if (b != 'B') throw new IOException("Invalid bz2 content.");
b = is.read(); b = is.read();
@@ -452,12 +554,16 @@ public class mediawikiIndex {
plasmaParser.initHTMLParsableMimeTypes("text/html"); plasmaParser.initHTMLParsableMimeTypes("text/html");
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html"); plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
mediawikiIndex mi = new mediawikiIndex(urlStub); mediawikiIndex mi = new mediawikiIndex(urlStub);
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(10);
BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(10);
wikiparserrecord poison = mi.newRecord();
ExecutorService service = Executors.newFixedThreadPool(2);
convertConsumer consumer = new convertConsumer(in, out, poison);
Future<Integer> consumerResult = service.submit(consumer);
convertWriter writer = new convertWriter(out, poison, targetdir, targetstub);
Future<Integer> writerResult = service.submit(writer);
wikiparserrecord record; wikiparserrecord record;
int fc = 0;
int rc = 0;
String outputfilename = targetstub + "." + fc + ".xml.tmp";
OutputStreamWriter osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
while ((t = r.readLine()) != null) { while ((t = r.readLine()) != null) {
if (t.indexOf(pagestart) >= 0) { if (t.indexOf(pagestart) >= 0) {
page = true; page = true;
@@ -471,24 +577,10 @@ public class mediawikiIndex {
text = false; text = false;
System.out.println("Title: " + title); System.out.println("Title: " + title);
record = mi.newRecord(title, sb); record = mi.newRecord(title, sb);
record.genHTML();
try { try {
record.genDocument(); in.put(record);
record.document.writeXML(osw, new Date()); } catch (InterruptedException e1) {
rc++; e1.printStackTrace();
if (rc >= 10000) {
osw.write("</surrogates>\n");
osw.close();
String finalfilename = targetstub + "." + fc + ".xml";
new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
rc = 0;
fc++;
outputfilename = targetstub + "." + fc + ".xml.tmp";
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
}
} catch (InterruptedException e) {
} catch (ParserException e) {
} }
sb.setLength(0); sb.setLength(0);
continue; continue;
@@ -508,11 +600,43 @@ public class mediawikiIndex {
sb.append('\n'); sb.append('\n');
} }
} }
osw.write("</surrogates>\n");
osw.close();
String finalfilename = targetstub + "." + fc + ".xml";
new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
r.close(); r.close();
try {
in.put(poison);
consumerResult.get(10000, TimeUnit.MILLISECONDS);
out.put(poison);
writerResult.get(10000, TimeUnit.MILLISECONDS);
} catch (InterruptedException e1) {
e1.printStackTrace();
} catch (ExecutionException e) {
e.printStackTrace();
} catch (TimeoutException e) {
e.printStackTrace();
}
}
public static void main(String[] s) {
if (s.length == 0) {
System.out.println("usage:");
System.out.println(" -index <wikipedia-dump>");
System.out.println(" -read <start> <len> <idx-file>");
System.out.println(" -find <title> <wikipedia-dump>");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>");
System.exit(0);
}
// example:
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert") && s.length > 2 && s[1].endsWith(".xml.bz2") && s[3].startsWith("http://")) {
File sourcefile = new File(s[1]);
File targetdir = new File(s[2]);
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
//String language = urlStub.substring(7,9);
try {
convert(sourcefile, targetdir, urlStub);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }

Loading…
Cancel
Save