yacy_search_server/source/de/anomic/tools/mediawikiIndex.java

// mediawikiIndex.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.11.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package de.anomic.tools;

import org.apache.tools.bzip2.CBZip2InputStream;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.kelondro.util.ByteBuffer;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;

/*
 * this class provides data structures to read a mediawiki dump file in xml format
 * as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/"
 */

public class mediawikiIndex extends Thread {

    // tag markers that run() searches for in each line of the dump
    private static final String textstart = "<text";
    private static final String textend = "</text>";
    private static final String pagestart = "<page>";
    private static final String pageend = "</page>";
    // byte forms of the page markers, used with PositionAwareReader.seek();
    // note: getBytes() without a charset uses the platform default encoding
    private static final byte[] pagestartb = pagestart.getBytes();
    private static final byte[] pageendb = pageend.getBytes();

    // converts wiki markup to html (see wikiparserrecord.genHTML)
    protected wikiParser wparser;
    // prefix that is put in front of a page title to form the document url
    protected String urlStub;
    // the dump file that is read by run()
    public    File sourcefile;
    // directory where the convertWriter creates the surrogate files
    public    File targetdir;
    // number of records that run() has handed to the conversion queue so far
    public    int count;
    // System.currentTimeMillis() at the moment run() was entered; 0 before that
    private   long start;
    // length of the source file in bytes
    private   long docsize;
    // estimated number of documents in the dump, derived from docsize
    private   int approxdocs;

    private static final int docspermbinxmlbz2 = 800;  // documents per megabyte in a xml.bz2 wikimedia dump

    public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread

    public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
    	this.sourcefile = sourcefile;
    	this.docsize = sourcefile.length();
    	this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
    	this.targetdir = targetdir;
        this.urlStub = baseURL;
        this.wparser = new wikiCode(new URL(baseURL).getHost());
        this.count = 0;
        this.start = 0;
    }

    /**
     * Returns the average number of articles processed per time unit of
     * {@link #runningTime()} (nominally one second).
     *
     * @return 0 while no article has been counted yet, otherwise count divided
     *         by the running time
     */
    public int speed() {
        if (count == 0) return 0;
        // runningTime() is 0 during the first second of a run; clamp the
        // divisor so that early callers do not hit an ArithmeticException
        return (int) ((long) count / Math.max(1L, runningTime()));
    }

    /**
     * Estimates the time that is still needed to process the remaining
     * documents, computed as (approxdocs - count) / speed(). The result is in
     * the same time unit that {@link #speed()} is based on.
     *
     * @return the estimated remaining time, never negative; while speed() is
     *         still 0 the divisor is clamped to 1, so the number of remaining
     *         documents is returned
     */
    public long remainingTime() {
        // speed() is 0 before the first record was counted; dividing by it
        // would throw an ArithmeticException
        return Math.max(0, this.approxdocs - count) / Math.max(1, speed());
    }

    /**
     * Returns the time since run() was entered.
     *
     * @return elapsed time in whole seconds; before run() was called, start
     *         is 0 and the value is the time since the epoch
     */
    public long runningTime() {
        // a second has 1000 milliseconds (the divisor used to be 1024)
        return (System.currentTimeMillis() - start) / 1000L;
    }

    /**
     * Reads the dump file line by line, cuts out the content of every
     * &lt;text&gt; element together with the preceding &lt;title&gt; and hands
     * each pair to a pool of convertConsumer workers. Their results are
     * collected by a single convertWriter which writes them into the target
     * directory.
     *
     * If the file name ends with ".bz2", the two magic bytes "BZ" are consumed
     * and the remaining stream is decompressed with CBZip2InputStream.
     *
     * After the whole file was read, one poison record per consumer is queued,
     * the consumers are awaited (10 seconds each), then the writer is poisoned
     * and awaited. If anything fails before that point, the workers are
     * interrupted instead, so no pool thread is left waiting forever. All
     * exceptions are printed, none is propagated.
     */
    public void run() {
        this.start = System.currentTimeMillis();
        InputStream is = null;
        BufferedReader r = null;
        try {
            // strip ".xml.bz2" (or just one of the two suffixes) from the file
            // name; a blind "length() - 8" fails on short or differently named files
            String targetstub = sourcefile.getName();
            if (targetstub.endsWith(".bz2")) targetstub = targetstub.substring(0, targetstub.length() - 4);
            if (targetstub.endsWith(".xml")) targetstub = targetstub.substring(0, targetstub.length() - 4);

            is = new BufferedInputStream(new FileInputStream(sourcefile), 1 * 1024 * 1024);
            if (sourcefile.getName().endsWith(".bz2")) {
                int b = is.read();
                if (b != 'B') throw new IOException("Invalid bz2 content.");
                b = is.read();
                if (b != 'Z') throw new IOException("Invalid bz2 content.");
                is = new CBZip2InputStream(is);
            }
            r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);

            // set up the worker pipeline: reader -> in -> consumers -> out -> writer
            final wikiparserrecord poison = newRecord();
            final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
            final BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
            final BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
            final ExecutorService service = Executors.newFixedThreadPool(threads + 1);
            final Future<?>[] consumerResults = new Future[threads];
            for (int i = 0; i < threads; i++) {
                consumerResults[i] = service.submit(new convertConsumer(in, out, poison));
            }
            final Future<Integer> writerResult = service.submit(new convertWriter(out, poison, targetdir, targetstub));

            boolean finished = false;
            try {
                String t;
                StringBuilder sb = new StringBuilder();
                boolean page = false, text = false;
                String title = null;
                int p;
                while ((t = r.readLine()) != null) {
                    if (t.indexOf(pagestart) >= 0) {
                        page = true;
                        continue;
                    }
                    if ((p = t.indexOf(textstart)) >= 0) {
                        text = page;
                        int q = t.indexOf('>', p + textstart.length());
                        if (q > 0) {
                            int u = t.indexOf(textend, q + 1);
                            if (u > q) {
                                // the complete text is on this single line
                                sb.append(t.substring(q + 1, u));
                                text = false; // otherwise the following metadata lines leak into the next record
                                sb = inject(in, title, sb);
                            } else if (t.charAt(q - 1) == '/') {
                                // self-closing element: there is no content and no end tag will follow
                                text = false;
                                sb = inject(in, title, sb);
                            } else {
                                // first line of a multi-line text
                                sb.append(t.substring(q + 1));
                                sb.append('\n');
                            }
                        }
                        continue;
                    }
                    if ((p = t.indexOf(textend)) >= 0) {
                        // the last line may carry content in front of the end tag
                        if (text) sb.append(t.substring(0, p));
                        text = false;
                        sb = inject(in, title, sb);
                        continue;
                    }
                    if (t.indexOf(pageend) >= 0) {
                        page = false;
                        continue;
                    }
                    if ((p = t.indexOf("<title>")) >= 0) {
                        title = t.substring(p + 7);
                        int q = title.indexOf("</title>");
                        if (q >= 0) title = title.substring(0, q);
                        continue;
                    }
                    if (text) {
                        sb.append(t);
                        sb.append('\n');
                    }
                }

                // regular end of input: drain the pipeline stage by stage
                for (int i = 0; i < threads; i++) {
                    in.put(poison);
                }
                for (int i = 0; i < threads; i++) {
                    consumerResults[i].get(10000, TimeUnit.MILLISECONDS);
                }
                out.put(poison);
                writerResult.get(10000, TimeUnit.MILLISECONDS);
                finished = true;
            } finally {
                // the pool threads are not daemon threads: they must always be released.
                // After a failure the workers are interrupted, which makes the consumers
                // stop and the writer finish its current file.
                if (finished) service.shutdown(); else service.shutdownNow();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // keep the interrupt visible to the owner of this thread
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            // includes ExecutionException and TimeoutException from the shutdown sequence
            e.printStackTrace();
        } finally {
            try {
                if (r != null) r.close(); else if (is != null) is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Queues the collected text of one page for conversion.
     *
     * @param in    the queue that feeds the convertConsumer workers
     * @param title the title of the page, as read from the dump
     * @param sb    the collected wiki text
     * @return the buffer to use for the next page: a fresh one if a record was
     *         queued, or sb itself if it was empty and nothing was queued
     * @throws InterruptedException if the thread is interrupted while the queue is full
     */
    private StringBuilder inject(BlockingQueue<wikiparserrecord> in, String title, StringBuilder sb) throws InterruptedException {
        Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title);
        if (sb.length() == 0) {
            Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
            return sb;
        }
        in.put(newRecord(title, sb));
        this.count++;
        return new StringBuilder(200);
    }

    /**
     * Makes sure that an index file exists for the given dump: if the file
     * named by idxFromWikimediaXML is missing, an indexMaker thread is started
     * to create it. The method does not wait for that thread.
     *
     * @param wikimediaxml the dump file
     */
    public static void checkIndex(File wikimediaxml) {
        final File indexFile = idxFromWikimediaXML(wikimediaxml);
        if (!indexFile.exists()) {
            final indexMaker maker = new indexMaker(wikimediaxml);
            maker.start();
        }
    }

    /**
     * Thread that runs createIndex for one dump file.
     */
    public static class indexMaker extends Thread {

        // the dump file to be indexed
        File wikimediaxml;

        public indexMaker(File wikimediaxml) {
            this.wikimediaxml = wikimediaxml;
        }

        /**
         * Creates the index; failures are printed and end the thread.
         */
        public void run() {
            try {
                createIndex(this.wikimediaxml);
            } catch (final Exception e) {
                // IOExceptions used to be discarded silently, which made a
                // missing or incomplete index impossible to diagnose
                e.printStackTrace();
            }
        }
    }

    /**
     * Computes the location of the index file that belongs to a dump file.
     *
     * @param wikimediaxml the dump file
     * @return a file whose path is the absolute path of the dump with ".idx.xml" appended
     */
    public static File idxFromWikimediaXML(File wikimediaxml) {
        final String dumpPath = wikimediaxml.getAbsolutePath();
        final String indexPath = dumpPath + ".idx.xml";
        return new File(indexPath);
    }

    /**
     * Scans a dump file for &lt;page&gt; ... &lt;/page&gt; sections and writes
     * the byte position, length and title of each section to the index file
     * named by idxFromWikimediaXML.
     *
     * The scan feeds a wikiConsumer (extracts the title from the raw bytes),
     * which in turn feeds an indexProducer (writes the index file); both run
     * in their own thread. The stages are shut down in pipeline order: the
     * producer is poisoned only after the consumer has terminated, so no
     * record can arrive behind the poison.
     *
     * @param dumpFile the dump file to index
     * @throws IOException if the dump cannot be read or the index file cannot be created
     */
    public static void createIndex(File dumpFile) throws IOException {
        // calculate md5
        //String md5 = serverCodings.encodeMD5Hex(dumpFile);

        final PositionAwareReader in = new PositionAwareReader(dumpFile);
        try {
            // init producer and consumer
            final indexProducer producer = new indexProducer(100, idxFromWikimediaXML(dumpFile));
            final wikiConsumer consumer = new wikiConsumer(100, producer);
            final ExecutorService service = Executors.newFixedThreadPool(2);
            final Future<Integer> consumerResult = service.submit(consumer);
            final Future<Integer> producerResult = service.submit(producer);
            service.shutdown();

            // read the wiki dump; a read error is remembered so that the
            // workers can still be stopped before it is reported
            IOException readError = null;
            try {
                long start, stop;
                while (in.seek(pagestartb)) {
                    start = in.pos() - pagestartb.length;
                    in.resetBuffer();
                    if (!in.seek(pageendb)) break;
                    stop = in.pos();
                    consumer.consume(new wikiraw(in.bytes(), start, stop));
                    in.resetBuffer();
                }
            } catch (IOException e) {
                readError = e;
            }

            // shut down the services
            try {
                consumer.consume(wikiConsumer.poison);
                consumerResult.get();
                producer.consume(indexProducer.poison);
                producerResult.get();
            } catch (InterruptedException e) {
                e.printStackTrace();
                service.shutdownNow(); // do not leave the workers blocked on their queues
                Thread.currentThread().interrupt();
            } catch (ExecutionException e) {
                e.printStackTrace();
                service.shutdownNow();
            }
            if (readError != null) throw readError;
        } finally {
            in.close();
        }
    }

    private static class indexProducer implements Callable<Integer> {

        private BlockingQueue<wikisourcerecord> entries;
        PrintWriter out;
        protected static wikisourcerecord poison = new wikisourcerecord("", 0, 0);
        int count;

        public indexProducer(int bufferCount, File indexFile) throws IOException {
            entries = new ArrayBlockingQueue<wikisourcerecord>(bufferCount);
            out = new PrintWriter(new BufferedWriter(new FileWriter(indexFile)));
            count = 0;
            out.println("<index>");

        }

        public void consume(wikisourcerecord b) {
            try {
                entries.put(b);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }

        public Integer call() {
            wikisourcerecord r;
            try {
                while(true) {
                    r = entries.take();
                    if (r == poison) {
                        Log.logInfo("WIKITRANSLATION", "producer / got poison");
                        break;
                    }
                    out.println("  <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
                    out.println("    <title>" + r.title + "</title>");
                    out.println("  </page>");
                    Log.logInfo("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title);
                    count++;
                }
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            entries.clear();
            out.println("</index>");
            out.close();
            return Integer.valueOf(count);
        }

    }

    /**
     * Turns raw page chunks into wikisourcerecord objects (which extracts the
     * title) and forwards them to an indexProducer. Chunks are handed over
     * with consume(); call() is meant to run in its own thread until the
     * poison chunk is received.
     */
    private static class wikiConsumer implements Callable<Integer> {

        private   BlockingQueue<wikiraw> entries;
        // marker chunk; it is recognized by identity, not by content
        protected static wikiraw poison = new wikiraw(new byte[0], 0, 0);
        private   indexProducer producer;
        private   int count;

        /**
         * @param bufferCount capacity of the chunk queue
         * @param producer    receiver of the created records
         */
        public wikiConsumer(int bufferCount, indexProducer producer) {
            entries = new ArrayBlockingQueue<wikiraw>(bufferCount);
            this.producer = producer;
            count = 0;
        }

        /**
         * Queues a chunk for processing; blocks while the queue is full.
         * An interruption is printed and the chunk is dropped.
         */
        public void consume(wikiraw b) {
            try {
                entries.put(b);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }

        /**
         * Processes queued chunks until the poison chunk arrives or the thread
         * is interrupted. Chunks that cannot be turned into a record (the
         * wikisourcerecord constructor throws a RuntimeException, e.g. when
         * no title is found) are logged and skipped.
         *
         * @return the number of records forwarded to the producer
         */
        public Integer call() {
            wikiraw c;
            try {
                while(true) {
                    c = entries.take();
                    if (c == poison) {
                        Log.logInfo("WIKITRANSLATION", "consumer / got poison");
                        break;
                    }
                    try {
                        final wikisourcerecord r = new wikisourcerecord(c.b, c.start, c.end);
                        producer.consume(r);
                        Log.logInfo("WIKITRANSLATION", "consumer / record start: " + r.start + ", title : " + r.title);
                        count++;
                    } catch (RuntimeException e) {
                        // a malformed page must not stop the indexing, but it
                        // should not disappear without a trace either
                        Log.logInfo("WIKITRANSLATION", "consumer / skipped record start: " + c.start + ", reason: " + e.getMessage());
                    }
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                e.printStackTrace();
            }
            entries.clear();
            return Integer.valueOf(count);
        }

    }

    /**
     * Raw bytes of one page section together with the start and end position
     * that the creator assigned to it. The array is stored as given, it is
     * not copied.
     */
    private static class wikiraw {
        public long start;
        public long end;
        public byte[] b;

        public wikiraw(byte[] b, long start, long end) {
            this.start = start;
            this.end = end;
            this.b = b;
        }
    }

    /**
     * Position and title of one page inside a dump file.
     */
    public static class wikisourcerecord {
        public long start, end;
        public String title;

        /**
         * Creates a record from known values.
         */
        public wikisourcerecord(String title, long start, long end) {
            this.title = title;
            this.start = start;
            this.end = end;
        }

        /**
         * Creates a record from the raw bytes of a page. The bytes are decoded
         * as UTF-8 and the text between the first &lt;title&gt; and the next
         * &lt;/title&gt; becomes the title.
         *
         * @param chunk the raw bytes of the page
         * @param start start position of the page
         * @param end   end position of the page
         * @throws RuntimeException if the chunk has no title start or no title end
         */
        public wikisourcerecord(byte[] chunk, long start, long end) {
            String s;
            try {
                s = new String(chunk, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                // keep the original exception as cause instead of only its message
                throw new RuntimeException(e.getMessage(), e);
            }
            final String opentag = "<title>";
            int t0 = s.indexOf(opentag);
            if (t0 < 0) {
                throw new RuntimeException("no title start in record");
            }
            int t1 = s.indexOf("</title>", t0);
            if (t1 < 0) {
                throw new RuntimeException("no title end in record");
            }
            this.title = s.substring(t0 + opentag.length(), t1);

            this.start = start;
            this.end = end;
        }
    }
    /**
     * Creates a record without title and without source text. run() uses such
     * a record as the poison marker for the worker queues.
     */
    public wikiparserrecord newRecord() {
        final wikiparserrecord empty = new wikiparserrecord(null, null);
        return empty;
    }

    /**
     * Creates a record for one page.
     *
     * @param title the page title
     * @param sb    the wiki source of the page; its current content is copied into the record
     */
    public wikiparserrecord newRecord(String title, StringBuilder sb) {
        final wikiparserrecord filled = new wikiparserrecord(title, sb);
        return filled;
    }

    /**
     * One page on its way through the conversion: the wiki source, the html
     * generated from it and the parsed document. It is an inner class because
     * it uses the wiki parser and the url stub of the enclosing importer.
     */
    public class wikiparserrecord {
        public String title;
        String source;
        String html;
        yacyURL url;
        Document document;

        /**
         * @param title the page title
         * @param sb    the wiki source, may be null; the content is copied
         */
        public wikiparserrecord(String title, StringBuilder sb) {
            this.title = title;
            this.source = (sb == null) ? null : sb.toString();
        }

        /**
         * Transforms the wiki source to html and stores it in this record.
         *
         * @throws IOException if the wiki parser fails; the parser exception is attached as cause
         */
        public void genHTML() throws IOException {
            try {
                html = wparser.transform(source);
            } catch (Exception e) {
                // attach the cause instead of printing it here: the caller then
                // reports the failure once, with the complete stack trace
                final IOException wrapped = new IOException(e.getMessage());
                wrapped.initCause(e);
                throw wrapped;
            }
        }

        /**
         * Parses the generated html into a document; genHTML must have been
         * called before. If the url cannot be built or the encoding is not
         * supported, the exception is printed and the document field is not set.
         */
        public void genDocument() throws InterruptedException, ParserException {
            try {
                url = new yacyURL(urlStub + title, null);
                document = Parser.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8"));
                // the wiki parser is not able to find the proper title in the source text, so it must be set here
                document.setTitle(title);
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch (MalformedURLException e1) {
                e1.printStackTrace();
            }
        }

        /**
         * Writes the document as xml to the given writer, using the current
         * time as date.
         */
        public void writeXML(OutputStreamWriter os) throws IOException {
            document.writeXML(os, new Date());
        }
    }

    /**
     * Sequential reader over a file that counts the bytes consumed so far and
     * collects them in a buffer, so that the caller can search for a byte
     * pattern and afterwards fetch everything that was read on the way.
     */
    private static class PositionAwareReader {

        private InputStream is;
        private long seekpos;
        private ByteBuffer bb;

        public PositionAwareReader(File dumpFile) throws FileNotFoundException {
            this.is = new BufferedInputStream(new FileInputStream(dumpFile), 64 *1024);
            this.seekpos = 0;
            this.bb = new ByteBuffer();
        }

        /**
         * Discards the collected bytes. A buffer larger than 10 kB is replaced
         * by a new one instead of being cleared.
         */
        public void resetBuffer() {
            if (bb.length() > 10 * 1024) bb = new ByteBuffer(); else bb.clear();
        }

        /**
         * Reads forward until the pattern was read completely. All bytes read,
         * including the pattern, are appended to the buffer and counted.
         *
         * @param pattern the bytes to search for
         * @return true if the pattern was found, false if the end of the stream
         *         was reached first; an empty pattern is found immediately
         */
        public boolean seek(byte[] pattern) throws IOException {
            if (pattern.length == 0) return true;

            // fallback[i]: length of the longest proper prefix of pattern[0..i]
            // that is also a suffix of it. After a mismatch the match continues
            // there instead of at 0; restarting at 0 misses a pattern that
            // begins inside a partial match, such as "<page>" in "<<page>"
            final int[] fallback = new int[pattern.length];
            for (int i = 1, k = 0; i < pattern.length; i++) {
                while (k > 0 && pattern[i] != pattern[k]) k = fallback[k - 1];
                if (pattern[i] == pattern[k]) k++;
                fallback[i] = k;
            }

            int pp = 0;
            int c;
            while ((c = is.read()) >= 0) {
                seekpos++;
                bb.append(c);
                // read() delivers 0..255 while bytes are signed: compare unsigned
                while (pp > 0 && (pattern[pp] & 0xff) != c) pp = fallback[pp - 1];
                if ((pattern[pp] & 0xff) == c) pp++;
                if (pp == pattern.length) return true;
            }
            return false;
        }

        /**
         * @return the number of bytes read from the file so far
         */
        public long pos() {
            return seekpos;
        }

        /**
         * @return the bytes collected since the last resetBuffer()
         */
        public byte[] bytes() {
            return bb.getBytes();
        }

        /**
         * Closes the underlying stream; a failure is printed, not thrown.
         */
        public void close() {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads a section of a file.
     *
     * @param f     the file to read from
     * @param start position of the first byte to read
     * @param len   number of bytes to read
     * @return an array of length len; if the file ends before len bytes were
     *         read, the remaining bytes are 0. Returns null if the file cannot
     *         be opened or read (the exception is printed).
     */
    public static byte[] read(File f, long start, int len) {
        final byte[] b = new byte[len];
        RandomAccessFile raf = null;
        try {
            raf = new RandomAccessFile(f, "r");
            raf.seek(start);
            // a single read() may deliver fewer bytes than requested, so
            // repeat until the array is filled or the file ends
            int filled = 0;
            while (filled < len) {
                final int n = raf.read(b, filled, len - filled);
                if (n < 0) break;
                filled += n;
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // closing the file also closes its channel
            if (raf != null) try {
                raf.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return b;
    }

    /**
     * Looks up a page in an index file as it is written by createIndex.
     *
     * The file is scanned for sections that start with "&lt;page " and end
     * with "&lt;/page&gt;". The first section that contains
     * "&lt;title&gt;" + title + "&lt;/title&gt;" is evaluated: its start and
     * length attributes give the position of the page in the dump.
     *
     * @param title the title to search for, in the form it has inside the index file
     * @param f     the index file
     * @return the position of the page, or null if the title is not found or
     *         the matching entry lacks a start or length attribute
     * @throws IOException if the index file cannot be read
     * @throws NumberFormatException if start or length of the entry is not a number
     */
    public static wikisourcerecord find(String title, File f) throws IOException {
        final PositionAwareReader in = new PositionAwareReader(f);
        try {
            final String m = "<title>" + title + "</title>";
            String s;
            while (in.seek("<page ".getBytes())) {
                in.resetBuffer();
                if (!in.seek(pageendb)) break;
                s = new String(in.bytes(), "UTF-8");
                in.resetBuffer();
                if (s.indexOf(m) >= 0) {
                    // we found the record
                    //Log.logInfo("WIKITRANSLATION", "s = " + s);
                    int p = s.indexOf("start=\"");
                    if (p < 0) return null;
                    p += 7;
                    int q = s.indexOf('"', p + 1);
                    if (q < 0) return null;
                    final long start = Long.parseLong(s.substring(p, q));
                    p = s.indexOf("length=\"", q);
                    if (p < 0) return null;
                    p += 8;
                    q = s.indexOf('"', p + 1);
                    if (q < 0) return null;
                    final int length = Integer.parseInt(s.substring(p, q));
                    //Log.logInfo("WIKITRANSLATION", "start = " + start + ", length = " + length);
                    return new wikisourcerecord(title, start, start + length);
                }
            }
            return null;
        } finally {
            // the reader was never closed before, which leaked a file handle per lookup
            in.close();
        }
    }

    private static class convertConsumer implements Callable<Integer> {

        private BlockingQueue<wikiparserrecord> in, out;
        private wikiparserrecord poison;

        public convertConsumer(BlockingQueue<wikiparserrecord> in, BlockingQueue<wikiparserrecord> out, wikiparserrecord poison) {
            this.poison = poison;
            this.in = in;
            this.out = out;
        }

        public Integer call() {
            wikiparserrecord record;
            try {
                while(true) {
                    record = in.take();
                    if (record == poison) {
                        Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
                        break;
                    }
                    try {
                        record.genHTML();
                        record.genDocument();
                        out.put(record);
                    } catch (RuntimeException e) {
                        e.printStackTrace();
                    } catch (ParserException e) {
                        e.printStackTrace();
                    } catch (IOException e) {
						// TODO Auto-generated catch block
						e.printStackTrace();
					}
                }
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            Log.logInfo("WIKITRANSLATION", "*** convertConsumer has terminated");
            return Integer.valueOf(0);
        }

    }

    private static class convertWriter implements Callable<Integer> {

        private BlockingQueue<wikiparserrecord> in;
        private wikiparserrecord poison;
        private OutputStreamWriter osw;
        private String targetstub;
        private File targetdir;
        private int fc, rc;
        private String outputfilename;

        public convertWriter(
                BlockingQueue<wikiparserrecord> in,
                wikiparserrecord poison,
                File targetdir,
                String targetstub) {
            this.poison = poison;
            this.in = in;
            this.osw = null;
            this.targetdir = targetdir;
            this.targetstub = targetstub;
            this.fc = 0;
            this.rc = 0;
            this.outputfilename = null;
        }

        public Integer call() {
            wikiparserrecord record;
            try {
                while(true) {
                    record = in.take();
                    if (record == poison) {
                        Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
                        break;
                    }

                    if (osw == null) {
                        // start writing a new file
                        this.outputfilename = targetstub + "." + fc + ".xml.prt";
                        this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
                        osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
                    }
                    Log.logInfo("WIKITRANSLATION", "[CONSUME] Title: " + record.title);
                    record.document.writeXML(osw, new Date());
                    rc++;
                    if (rc >= 10000) {
                        osw.write("</surrogates>\n");
                        osw.close();
                        String finalfilename = targetstub + "." + fc + ".xml";
                        new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
                        rc = 0;
                        fc++;
                        outputfilename = targetstub + "." + fc + ".xml.prt";
                        osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
                        osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
                    }


                }
            } catch (InterruptedException e) {
                e.printStackTrace();
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
	            try {
					osw.write("</surrogates>\n");
		            osw.close();
		            String finalfilename = targetstub + "." + fc + ".xml";
		            new File(targetdir, outputfilename).renameTo(new File(targetdir, finalfilename));
				} catch (IOException e) {
					e.printStackTrace();
				}
            }
            Log.logInfo("WIKITRANSLATION", "*** convertWriter has terminated");
            return Integer.valueOf(0);
        }

    }

    /**
     * Command line entry point. The first argument selects the mode:
     * -convert, -index, -read or -find; see printUsage for the arguments of
     * each mode. If the arguments of the selected mode are incomplete, the
     * usage is printed. The VM is terminated with exit code 0 at the end.
     *
     * @param s the command line arguments
     */
    public static void main(String[] s) {
        if (s.length == 0) {
            printUsage();
            System.exit(0);
        }

        // example:
        // java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/

        // s[3] is used, so four arguments are required (the check used to accept three)
        if (s[0].equals("-convert") && s.length > 3 && s[1].endsWith(".xml.bz2") && s[3].startsWith("http://")) {
            File sourcefile = new File(s[1]);
            File targetdir = new File(s[2]);
            String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
            //String language = urlStub.substring(7,9);
            try {
                mediawikiIndex mi = new mediawikiIndex(sourcefile, targetdir, urlStub);
                mi.start();
                mi.join();
            } catch (InterruptedException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        if (s[0].equals("-index")) {
            if (s.length < 2) {
                printUsage();
            } else {
                try {
                    createIndex(new File(s[1]));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        if (s[0].equals("-read")) {
            if (s.length < 4) {
                printUsage();
            } else {
                // positions in a dump can exceed the int range
                long start = Long.parseLong(s[1]);
                int  len   = Integer.parseInt(s[2]);
                try {
                    byte[] b = read(new File(s[3]), start, len);
                    // read() has already printed the reason if it returns null
                    if (b != null) System.out.println(new String(b, "UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
            }
        }

        if (s[0].equals("-find")) {
            if (s.length < 3) {
                printUsage();
            } else {
                try {
                    wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
                    if (w == null) {
                        Log.logInfo("WIKITRANSLATION", "not found");
                    } else {
                        byte[] b = read(new File(s[2]), w.start, (int) (w.end - w.start));
                        if (b != null) System.out.println(new String(b, "UTF-8"));
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.exit(0);
    }

    /**
     * Logs the command line syntax of all modes.
     */
    private static void printUsage() {
        Log.logInfo("WIKITRANSLATION", "usage:");
        Log.logInfo("WIKITRANSLATION", " -index <wikipedia-dump>");
        Log.logInfo("WIKITRANSLATION", " -read  <start> <len> <idx-file>");
        Log.logInfo("WIKITRANSLATION", " -find  <title> <wikipedia-dump>");
        Log.logInfo("WIKITRANSLATION", " -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>");
    }

}