From 5fb77116c68386238daf96e06b2bdb3220621a91 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 8 May 2009 07:54:10 +0000 Subject: [PATCH] added a submenu to index administration to import a wikimedia dump (i.e. a dump from wikipedia) into the YaCy index: see http://localhost:8080/IndexImportWikimedia_p.html git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5930 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexImportWikimedia_p.html | 67 +++++ htroot/IndexImportWikimedia_p.java | 78 +++++ .../templates/submenuIndexControl.template | 1 + source/de/anomic/kelondro/text/IndexCell.java | 2 +- .../anomic/plasma/plasmaParserDocument.java | 6 +- source/de/anomic/tools/mediawikiIndex.java | 284 ++++++++++-------- 6 files changed, 318 insertions(+), 120 deletions(-) create mode 100644 htroot/IndexImportWikimedia_p.html create mode 100644 htroot/IndexImportWikimedia_p.java diff --git a/htroot/IndexImportWikimedia_p.html b/htroot/IndexImportWikimedia_p.html new file mode 100644 index 000000000..104fb84f3 --- /dev/null +++ b/htroot/IndexImportWikimedia_p.html @@ -0,0 +1,67 @@ + + + + YaCy '#[clientname]#': Wikimedia Dump Import + #%env/templates/metas.template%# + #(import)#::#(/import)# + + + #%env/templates/header.template%# + #%env/templates/submenuIndexControl.template%# +

Wikimedia Dump Import

+ + #(import)# +

#(status)#No import thread is running; you can start a new thread here::Bad input data: #[message]# #(/status)#

+
+ +
+ Wikimedia Dump File Selection: select a 'bz2' file
+ You can import Wikipedia dumps here. An example is the file
+ http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2.
+ Dumps must be in XML format and must be compressed with bz2. Do not decompress the file after downloading! The checks that are applied to the selected file are sketched below.
+ + +
+
+

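The patch enforces two properties of the selected file: IndexImportWikimedia_p rejects any name that does not end with 'pages-articles.xml.bz2' (it also derives the wiki language for the URL stub from the first two characters of the name, e.g. 'de' from dewiki-...), and the converter reads the first two bytes of the stream and requires the bz2 magic 'B' 'Z' before wrapping the rest in CBZip2InputStream. A minimal, self-contained sketch of both checks; the helper class name and the example path are placeholders, not part of the patch:

    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    // hypothetical helper, not part of the patch
    public class DumpFileCheck {

        // mirrors the name check in IndexImportWikimedia_p and the
        // magic-byte check in mediawikiIndex's convert code
        public static InputStream openDump(File dump) throws IOException {
            if (!dump.getName().endsWith("pages-articles.xml.bz2"))
                throw new IOException("file name must end with 'pages-articles.xml.bz2'");
            InputStream is = new FileInputStream(dump);
            // a bz2 file starts with the magic bytes 'B' and 'Z'; the patch
            // consumes them before constructing CBZip2InputStream
            if (is.read() != 'B' || is.read() != 'Z') {
                is.close();
                throw new IOException("Invalid bz2 content.");
            }
            return is;
        }

        public static void main(String[] args) throws IOException {
            // example path only
            openDump(new File("dewiki-20090311-pages-articles.xml.bz2")).close();
        }
    }
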
+ When the import is started, the following happens (the thread pipeline is sketched after this list): +

+

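When the import starts, run() wires up a pipeline: the reader thread scans the dump line by line and puts one record per article into a bounded 'in' queue, a pool of convertConsumer tasks (Math.max(2, cores - 1) of them) parses the wiki markup into documents on a bounded 'out' queue, and a single convertWriter serializes them. Shutdown is signalled by a shared poison record, one per consumer and one for the writer. A stripped-down sketch of that wiring; the Record class and the queue payloads are simplified stand-ins for the patch's wikiparserrecord, convertConsumer and convertWriter:

    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;

    public class PipelineSketch {

        // simplified stand-in for the patch's wikiparserrecord
        static final class Record {
            final String title, text;
            Record(String title, String text) { this.title = title; this.text = text; }
        }

        static final Record POISON = new Record(null, null);

        public static void main(String[] args) throws Exception {
            final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
            final BlockingQueue<Record> in = new ArrayBlockingQueue<Record>(threads * 10);
            final BlockingQueue<Record> out = new ArrayBlockingQueue<Record>(threads * 10);
            ExecutorService service = Executors.newFixedThreadPool(threads + 1);

            // consumers: stand-ins for convertConsumer (the real one parses wiki markup)
            Future<?>[] consumers = new Future<?>[threads];
            for (int i = 0; i < threads; i++) {
                consumers[i] = service.submit(() -> {
                    Record r;
                    while ((r = in.take()) != POISON) out.put(r);
                    return null;
                });
            }
            // writer: stand-in for convertWriter (the real one writes surrogate XML)
            Future<?> writer = service.submit(() -> {
                Record r;
                while ((r = out.take()) != POISON) System.out.println("wrote: " + r.title);
                return null;
            });

            // producer: in the patch, the XML line scanner feeds this queue
            in.put(new Record("Example", "article text"));

            // shutdown protocol: one poison per consumer, then one for the
            // writer, with bounded waits as in the patch
            for (int i = 0; i < threads; i++) in.put(POISON);
            for (int i = 0; i < threads; i++) consumers[i].get(10000, TimeUnit.MILLISECONDS);
            out.put(POISON);
            writer.get(10000, TimeUnit.MILLISECONDS);
            service.shutdown();
        }
    }

The bounded queues provide backpressure: when the parsers or the writer fall behind, the reader blocks on put(), so memory stays bounded even for multi-gigabyte dumps.
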
+ :: +
Import Process +
+
Thread: #[thread]#
+
Processed Wiki Entries: #[count]#
+
+
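The progress view above is driven by a deliberately simple pattern in the patch: mediawikiIndex holds a single static job reference to the running import thread, and the servlet switches between the form and this progress page by testing job != null && job.isAlive(), reading the thread's public count field for the entry counter. A condensed sketch of the pattern; the class name is a placeholder and run() is stubbed out:

    // condensed from mediawikiIndex / IndexImportWikimedia_p in this patch
    public class ImportJob extends Thread {

        // only one import may run at a time
        public static ImportJob job = null;

        public int count = 0; // processed wiki entries, read by the servlet

        @Override
        public void run() {
            // placeholder for the real dump-reading loop
            for (int i = 0; i < 1000; i++) count++;
        }

        // the servlet-side check, condensed
        public static boolean isRunning() {
            return job != null && job.isAlive();
        }

        public static void main(String[] args) throws InterruptedException {
            job = new ImportJob();
            job.start();
            System.out.println(isRunning() ? "running, count=" + job.count : "idle");
            job.join();
        }
    }

Since the servlet reads count from another thread without synchronization, declaring the field volatile would make its visibility explicit; for a monotone progress counter, stale reads are harmless.
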
+ #(/import)# + + #%env/templates/footer.template%# + + \ No newline at end of file diff --git a/htroot/IndexImportWikimedia_p.java b/htroot/IndexImportWikimedia_p.java new file mode 100644 index 000000000..3da6cdeed --- /dev/null +++ b/htroot/IndexImportWikimedia_p.java @@ -0,0 +1,78 @@ +// IndexImportWikimedia.java +// ------------------------- +// (C) 2009 by Michael Peter Christen; mc@yacy.net +// first published 04.05.2009 on http://yacy.net +// Frankfurt, Germany +// +// $LastChangedDate: 2009-04-16 17:29:00 +0200 (Do, 16 Apr 2009) $ +// $LastChangedRevision: 5812 $ +// $LastChangedBy: orbiter $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import java.io.File; +import java.net.MalformedURLException; + +import de.anomic.http.httpRequestHeader; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; +import de.anomic.tools.mediawikiIndex; + +public class IndexImportWikimedia_p { + + public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch env) { + final serverObjects prop = new serverObjects(); + final plasmaSwitchboard sb = (plasmaSwitchboard) env; + + if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) { + // one import is running, no option to insert anything + prop.put("import", 1); + prop.put("import_thread", "running"); + prop.put("import_count", mediawikiIndex.job.count); + } else { + prop.put("import", 0); + if (post == null) { + prop.put("import_status", 0); + } else { + if (post.containsKey("file")) { + File sourcefile = new File(post.get("file")); + String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2 + if (!name.endsWith("pages-articles.xml.bz2")) { + prop.put("import", 0); + prop.put("import_status", 1); + prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'"); + return prop; + } + String lang = name.substring(0, 2); + try { + mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/"); + mediawikiIndex.job.start(); + prop.put("import", 1); + prop.put("import_thread", "started"); + prop.put("import_count", 0); + } catch (MalformedURLException e) { + e.printStackTrace(); + prop.put("import", 0); + prop.put("import_status", 1); + prop.put("import_status_message", e.getMessage()); + } + } + return prop; + } + } + return prop; + } +} diff --git a/htroot/env/templates/submenuIndexControl.template b/htroot/env/templates/submenuIndexControl.template index 599f91846..7b0759dad 100644 --- a/htroot/env/templates/submenuIndexControl.template +++ b/htroot/env/templates/submenuIndexControl.template @@ -6,5 +6,6 @@
  • Queue Import
  • Index Transfer
  • Index Cleaner
  • +
  • Wikimedia Dump Import
  • diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index 11a697419..cbd8b4652 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -168,7 +168,7 @@ public final class IndexCell extends AbstractBu if (c0 == null) return c1; return c1.merge(c0); } - + /** * remove url references from a selected word hash. this deletes also in the BLOB * files, which means that there exists new gap entries after the deletion diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 187d5cff5..774920dc0 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -55,7 +55,7 @@ public class plasmaParserDocument { private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document private final List keywords; // most resources provide a keyword field - private final StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result + private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result private final StringBuilder creator; // author or copyright private final List sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description @@ -173,6 +173,10 @@ dc_rights return title.toString(); } + public void setTitle(String title) { + this.title = new StringBuilder(title); + } + public String dc_creator() { if (creator == null) return ""; diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java index 99288e5a7..5722fbbec 100644 --- a/source/de/anomic/tools/mediawikiIndex.java +++ b/source/de/anomic/tools/mediawikiIndex.java @@ -59,6 +59,7 @@ import java.util.concurrent.TimeoutException; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; import de.anomic.kelondro.util.ByteBuffer; +import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.ParserException; @@ -69,7 +70,7 @@ import de.anomic.yacy.yacyURL; * as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/" */ -public class mediawikiIndex { +public class mediawikiIndex extends Thread { private static final String textstart = " in = new ArrayBlockingQueue(threads * 10); + BlockingQueue out = new ArrayBlockingQueue(threads * 10); + ExecutorService service = Executors.newFixedThreadPool(threads + 1); + convertConsumer[] consumers = new convertConsumer[threads]; + Future[] consumerResults = new Future[threads]; + for (int i = 0; i < threads; i++) { + consumers[i] = new convertConsumer(in, out, poison); + consumerResults[i] = service.submit(consumers[i]); + } + convertWriter writer = new convertWriter(out, poison, targetdir, targetstub); + Future writerResult = service.submit(writer); + + wikiparserrecord record; + int p; + while ((t = r.readLine()) != null) { + if (t.indexOf(pagestart) >= 0) { + page = true; + continue; + } + if ((p = t.indexOf(textstart)) >= 0) { + text = page; + int q = t.indexOf('>', p + textstart.length()); + if (q > 0) { + int u = t.indexOf(textend, q + 1); + if (u > q) { + sb.append(t.substring(q + 1, u)); + Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title); + 
if (sb.length() == 0) { + Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content"); + continue; + } + record = newRecord(title, sb); + try { + in.put(record); + this.count++; + } catch (InterruptedException e1) { + e1.printStackTrace(); + } + sb = new StringBuilder(200); + continue; + } else { + sb.append(t.substring(q + 1)); + } + } + continue; + } + if (t.indexOf(textend) >= 0) { + text = false; + Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title); + if (sb.length() == 0) { + Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content"); + continue; + } + record = newRecord(title, sb); + try { + in.put(record); + this.count++; + } catch (InterruptedException e1) { + e1.printStackTrace(); + } + sb = new StringBuilder(200); + continue; + } + if (t.indexOf(pageend) >= 0) { + page = false; + continue; + } + if ((p = t.indexOf("")) >= 0) { + title = t.substring(p + 7); + int q = title.indexOf(""); + if (q >= 0) title = title.substring(0, q); + continue; + } + if (text) { + sb.append(t); + sb.append('\n'); + } + } + r.close(); + + try { + for (int i = 0; i < threads; i++) { + in.put(poison); + } + for (int i = 0; i < threads; i++) { + consumerResults[i].get(10000, TimeUnit.MILLISECONDS); + } + out.put(poison); + writerResult.get(10000, TimeUnit.MILLISECONDS); + } catch (InterruptedException e1) { + e1.printStackTrace(); + } catch (ExecutionException e) { + e.printStackTrace(); + } catch (TimeoutException e) { + e.printStackTrace(); + } catch (Exception e) { + e.printStackTrace(); + } + } catch (IOException e) { + e.printStackTrace(); + } catch (Exception e) { + e.printStackTrace(); + } + } + public static void checkIndex(File wikimediaxml) { File idx = idxFromWikimediaXML(wikimediaxml); if (idx.exists()) return; @@ -188,13 +324,13 @@ public class mediawikiIndex { while(true) { r = entries.take(); if (r == poison) { - System.out.println("producer / got poison"); + Log.logInfo("WIKITRANSLATION", "producer / got poison"); break; } out.println(" "); out.println(" " + r.title + ""); out.println(" "); - System.out.println("producer / record start: " + r.start + ", title : " + r.title); + Log.logInfo("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title); count++; } } catch (InterruptedException e) { @@ -236,13 +372,13 @@ public class mediawikiIndex { while(true) { c = entries.take(); if (c == poison) { - System.out.println("consumer / got poison"); + Log.logInfo("WIKITRANSLATION", "consumer / got poison"); break; } try { r = new wikisourcerecord(c.b, c.start, c.end); producer.consume(r); - System.out.println("consumer / record start: " + r.start + ", title : " + r.title); + Log.logInfo("WIKITRANSLATION", "consumer / record start: " + r.start + ", title : " + r.title); count++; } catch (RuntimeException e) {} } @@ -325,6 +461,8 @@ public class mediawikiIndex { try { url = new yacyURL(urlStub + title, null); document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8")); + // the wiki parser is not able to find the proper title in the source text, so it must be set here + document.setTitle(title); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (MalformedURLException e1) { @@ -414,7 +552,7 @@ public class mediawikiIndex { in.resetBuffer(); if (s.indexOf(m) >= 0) { // we found the record - //System.out.println("s = " + s); + //Log.logInfo("WIKITRANSLATION", "s = " + s); int p = s.indexOf("start=\""); if (p < 0) return null; p += 7; @@ -427,7 +565,7 @@ public class mediawikiIndex { q = 
s.indexOf('"', p + 1); if (q < 0) return null; int length = Integer.parseInt(s.substring(p, q)); - //System.out.println("start = " + start + ", length = " + length); + //Log.logInfo("WIKITRANSLATION", "start = " + start + ", length = " + length); return new wikisourcerecord(title, start, start + length); } } @@ -451,7 +589,7 @@ public class mediawikiIndex { while(true) { record = in.take(); if (record == poison) { - System.out.println("convertConsumer / got poison"); + Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison"); break; } try { @@ -470,7 +608,7 @@ public class mediawikiIndex { } catch (InterruptedException e) { e.printStackTrace(); } - System.out.println("*** convertConsumer has terminated"); + Log.logInfo("WIKITRANSLATION", "*** convertConsumer has terminated"); return Integer.valueOf(0); } @@ -507,7 +645,7 @@ public class mediawikiIndex { while(true) { record = in.take(); if (record == poison) { - System.out.println("convertConsumer / got poison"); + Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison"); break; } @@ -517,7 +655,7 @@ public class mediawikiIndex { this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8"); osw.write("\n\n"); } - System.out.println("[CONSUME] Title: " + record.title); + Log.logInfo("WIKITRANSLATION", "[CONSUME] Title: " + record.title); record.document.writeXML(osw, new Date()); rc++; if (rc >= 10000) { @@ -552,114 +690,19 @@ public class mediawikiIndex { e.printStackTrace(); } } - System.out.println("*** convertWriter has terminated"); + Log.logInfo("WIKITRANSLATION", "*** convertWriter has terminated"); return Integer.valueOf(0); } } - public static void convert(File sourcefile, File targetdir, String urlStub) throws IOException { - String targetstub = sourcefile.getName(); - targetstub = targetstub.substring(0, targetstub.length() - 8); - InputStream is = new FileInputStream(sourcefile); - if (sourcefile.getName().endsWith(".bz2")) { - int b = is.read(); - if (b != 'B') throw new IOException("Invalid bz2 content."); - b = is.read(); - if (b != 'Z') throw new IOException("Invalid bz2 content."); - is = new CBZip2InputStream(is); - } - BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024); - String t; - StringBuilder sb = new StringBuilder(); - boolean page = false, text = false; - String title = null; - plasmaParser.initHTMLParsableMimeTypes("text/html"); - plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html"); - mediawikiIndex mi = new mediawikiIndex(urlStub); - wikiparserrecord poison = mi.newRecord(); - int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); - BlockingQueue in = new ArrayBlockingQueue(threads * 10); - BlockingQueue out = new ArrayBlockingQueue(threads * 10); - ExecutorService service = Executors.newFixedThreadPool(threads + 1); - convertConsumer[] consumers = new convertConsumer[threads]; - Future[] consumerResults = new Future[threads]; - for (int i = 0; i < threads; i++) { - consumers[i] = new convertConsumer(in, out, poison); - consumerResults[i] = service.submit(consumers[i]); - } - convertWriter writer = new convertWriter(out, poison, targetdir, targetstub); - Future writerResult = service.submit(writer); - - wikiparserrecord record; - while ((t = r.readLine()) != null) { - if (t.indexOf(pagestart) >= 0) { - page = true; - continue; - } - if (t.indexOf(textstart) >= 0) { - text = page; - continue; - } - if (t.indexOf(textend) >= 0) { - text = 
false; - System.out.println("[INJECT] Title: " + title); - if (sb.length() == 0) { - System.out.println("ERROR: " + title + " has empty content"); - continue; - } - record = mi.newRecord(title, sb); - try { - in.put(record); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } - sb.setLength(0); - continue; - } - if (t.indexOf(pageend) >= 0) { - page = false; - continue; - } - if (t.indexOf("") >= 0) { - title = t.substring(t.indexOf("<title>") + 7); - int p = title.indexOf(""); - if (p >= 0) title = title.substring(0, p); - continue; - } - if (text) { - sb.append(t); - sb.append('\n'); - } - } - r.close(); - - try { - for (int i = 0; i < threads; i++) { - in.put(poison); - } - for (int i = 0; i < threads; i++) { - consumerResults[i].get(10000, TimeUnit.MILLISECONDS); - } - out.put(poison); - writerResult.get(10000, TimeUnit.MILLISECONDS); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } catch (ExecutionException e) { - e.printStackTrace(); - } catch (TimeoutException e) { - e.printStackTrace(); - } - - } - public static void main(String[] s) { if (s.length == 0) { - System.out.println("usage:"); - System.out.println(" -index "); - System.out.println(" -read "); - System.out.println(" -find <wikipedia-dump>"); - System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>"); + Log.logInfo("WIKITRANSLATION", "usage:"); + Log.logInfo("WIKITRANSLATION", " -index <wikipedia-dump>"); + Log.logInfo("WIKITRANSLATION", " -read <start> <len> <idx-file>"); + Log.logInfo("WIKITRANSLATION", " -find <title> <wikipedia-dump>"); + Log.logInfo("WIKITRANSLATION", " -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>"); System.exit(0); } @@ -672,7 +715,11 @@ public class mediawikiIndex { String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/ //String language = urlStub.substring(7,9); try { - convert(sourcefile, targetdir, urlStub); + mediawikiIndex mi = new mediawikiIndex(sourcefile, targetdir, urlStub); + mi.start(); + mi.join(); + } catch (InterruptedException e) { + e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } @@ -700,7 +747,7 @@ public class mediawikiIndex { try { wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml")); if (w == null) { - System.out.println("not found"); + Log.logInfo("WIKITRANSLATION", "not found"); } else { System.out.println(new String(read(new File(s[2]), w.start, (int) (w.end - w.start)), "UTF-8")); } @@ -709,6 +756,7 @@ public class mediawikiIndex { } } + System.exit(0); } }
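
A closing note on the core loop moved into run() above: it is a line-oriented scanner, not a full XML parser. It flips a page flag on <page>/</page>, starts collecting on <text ...>, flushes the finished article on </text>, and cuts the name out of the <title> line. A reduced sketch of that state machine, reading from an inline string instead of the bz2 stream:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.StringReader;

    public class ScannerSketch {

        public static void main(String[] args) throws IOException {
            // inline sample standing in for the decompressed dump stream
            String dump =
                  "<page>\n"
                + "  <title>Example</title>\n"
                + "  <text xml:space=\"preserve\">\n"
                + "first line of the article\n"
                + "second line\n"
                + "  </text>\n"
                + "</page>\n";
            BufferedReader r = new BufferedReader(new StringReader(dump));
            StringBuilder sb = new StringBuilder(200);
            boolean page = false, text = false;
            String title = null, t;
            int p;
            while ((t = r.readLine()) != null) {
                if (t.indexOf("<page>") >= 0) { page = true; continue; }
                if ((p = t.indexOf("<text")) >= 0) {
                    text = page; // only collect text while inside a page
                    int q = t.indexOf('>', p);
                    if (q > 0) sb.append(t.substring(q + 1));
                    continue;
                }
                if (t.indexOf("</text>") >= 0) {
                    text = false;
                    System.out.println("[INJECT] Title: " + title
                            + " (" + sb.length() + " characters)");
                    sb = new StringBuilder(200); // start the next article
                    continue;
                }
                if (t.indexOf("</page>") >= 0) { page = false; continue; }
                if ((p = t.indexOf("<title>")) >= 0) {
                    title = t.substring(p + 7);
                    int q = title.indexOf("</title>");
                    if (q >= 0) title = title.substring(0, q);
                    continue;
                }
                if (text) { sb.append(t); sb.append('\n'); }
            }
            r.close();
        }
    }

The patch additionally handles <text> elements that open and close on a single line, and logs articles with empty content as errors before skipping them.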