enhanced the wikimedia dump import process

enhanced the wiki parser and condenser speed git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5931 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 89aeb318d3
parent 5fb77116c6
commit 89aeb318d3
6 changed files with 70 additions and 20 deletions
--- a/htroot/IndexImportWikimedia_p.html
+++ b/htroot/IndexImportWikimedia_p.html
@ -12,23 +12,23 @@
    #(import)#
    <p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
-    <form action="IndexImportWikimedia_p.html" method="get" id="importwiki" accept-charset="UTF-8">
+    <form action="IndexImportWikimedia_p.html" method="get">
        <!-- no post method here, we don't want to transmit the whole file, only the path-->
        <fieldset>
          <legend>Wikimedia Dump File Selection: select a 'bz2' file</legend>
          You can import Wikipedia dumps here. An example is the file
          <a href="http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2">
          http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2</a>.
-          <br>
+          <br />
          Dumps must be in XML format and must be encoded in bz2. Do not decompress the file after downloading!
-          <br>
+          <br />
-          <input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" accept="application/x-bzip2">
+          <input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" />
          <input name="submit" type="submit" value="Import Wikimedia Dump" />
        </fieldset>
    </form>
    <p>
    When the import is started, the following happens:
-    <ul>
+    </p><ul>
    <li>The dump is extracted on the fly and wiki entries are translated into Dublin Core data format. The output looks like this:
    <pre>
    &lt;?xml version="1.0" encoding="utf-8"?&gt;
@ -52,14 +52,18 @@
    <li>When a surrogate file is finished with indexing, it is moved to /DATA/SURROGATES/out</li>
    <li>You can recycle processed surrogate files by moving them from /DATA/SURROGATES/out to /DATA/SURROGATES/in</li>
    </ul>
-    </p>
+    <p></p>
    ::
-    <fieldset><legend>Import Process</legend>
+    <form><fieldset><legend>Import Process</legend>
      <dl>
-        <dt>Thread: #[thread]#</dt>
+        <dt>Thread:</dt><dd>#[thread]#</dd>
-        <dt>Processed Wiki Entries: #[count]#</dt>
+        <dt>Dump:</dt><dd>#[dump]#</dd>
        <dt>Processed:</dt><dd>#[count]# Wiki Entries</dd>
        <dt>Speed:</dt><dd>#[speed]# articles per second</dd>
        <dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
        <dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
      </dl>    
-    </fieldset>
+    </fieldset></form>
    #(/import)#
    #%env/templates/footer.template%#
--- a/htroot/IndexImportWikimedia_p.java
+++ b/htroot/IndexImportWikimedia_p.java
@ -41,7 +41,13 @@ public class IndexImportWikimedia_p {
            // one import is running, no option to insert anything
            prop.put("import", 1);
            prop.put("import_thread", "running");
            prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
            prop.put("import_count", mediawikiIndex.job.count);
            prop.put("import_speed", mediawikiIndex.job.speed());
            prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
            prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
            prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
            prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
        } else {
            prop.put("import", 0);
            if (post == null) {
@ -62,7 +68,13 @@ public class IndexImportWikimedia_p {
                        mediawikiIndex.job.start();
                        prop.put("import", 1);
                        prop.put("import_thread", "started");
                        prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
                        prop.put("import_count", 0);
                        prop.put("import_speed", 0);
                        prop.put("import_runningHours", 0);
                        prop.put("import_runningMinutes", 0);
                        prop.put("import_remainingHours", 0);
                        prop.put("import_remainingMinutes", 0);
                    } catch (MalformedURLException e) {
                        e.printStackTrace();
                        prop.put("import", 0);
--- a/source/de/anomic/data/wiki/wikiCode.java
+++ b/source/de/anomic/data/wiki/wikiCode.java
@ -548,6 +548,7 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
                    break;
                }
                element = dirElements.get(i);
                if (element == null) continue;
                //counting double headlines
                doubles = 0;
                for (int j = 0; j < i; j++) {
--- a/source/de/anomic/plasma/parser/Condenser.java
+++ b/source/de/anomic/plasma/parser/Condenser.java
@ -600,8 +600,12 @@ public final class Condenser {
    }
    static StringBuilder trim(StringBuilder sb) {
-        while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
+        int i = 0;
-        while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
+        while (i < sb.length() && sb.charAt(i) <= ' ') i++;
        if (i > 0) sb.delete(0, i);
        i = sb.length() - 1;
        while (i >= 0 && i < sb.length() && sb.charAt(i) <= ' ') i--;
        if (i > 0) sb.delete(i + 1, sb.length());
        return sb;
    }
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -508,14 +508,14 @@ public final class plasmaWordIndex {
            } else {
                // here we have three results: we can do a voting
                if (language.equals(bymetadata)) {
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
+                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
                } else if (language.equals(entry.url().language())) {
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
+                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
                } else if (bymetadata.equals(entry.url().language())) {
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
+                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
                    language = bymetadata;
                } else {
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
+                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
                    language = bymetadata;
                }
            }
--- a/source/de/anomic/tools/mediawikiIndex.java
+++ b/source/de/anomic/tools/mediawikiIndex.java
@ -82,30 +82,59 @@ public class mediawikiIndex extends Thread {
    private wikiParser wparser;
    private plasmaParser hparser;
    private String urlStub;
-    private File sourcefile;
+    public File sourcefile;
-    private File targetdir;
+    public File targetdir;
    public int count;
    private long start;
    private long docsize;
    private int approxdocs;
    private static final int docspermbinxmlbz2 = 800;  // documents per megabyte in a xml.bz2 wikimedia dump
    public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
    public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
    	this.sourcefile = sourcefile;
    	this.docsize = sourcefile.length();
    	this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
    	this.targetdir = targetdir;
        this.urlStub = baseURL;
        this.wparser = new wikiCode(new URL(baseURL).getHost());
        this.hparser = new plasmaParser();
        this.count = 0;
        this.start = 0;
        // must be called before usage:
        plasmaParser.initHTMLParsableMimeTypes("text/html");
        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
    }
    /**
     * Returns the average import speed so far.
     *
     * @return the number of articles processed per second of running time,
     *         or 0 if no article has been counted yet
     */
    public int speed() {
        if (count == 0) return 0;
        // runningTime() is a whole number of seconds and is 0 right after the start;
        // clamp it to 1 so that this cannot fail with a division by zero
        return (int) ((long) count / Math.max(1L, runningTime()));
    }
    /**
     * Estimates the time that is still needed to process all records.
     * The estimate is the approximate number of outstanding documents divided
     * by the current speed in articles per second.
     *
     * @return the estimated remaining time in seconds; never negative
     */
    public long remainingTime() {
        // speed() is 0 until the first article has been counted (and whenever fewer
        // than one article per second was processed); clamp it to 1 so that this
        // cannot fail with a division by zero
        return Math.max(0, this.approxdocs - count) / Math.max(1, speed());
    }
    /**
     * Returns the time that has elapsed since the recorded start time.
     * The start time is set when <code>run()</code> begins; before that it is 0.
     *
     * @return the elapsed time in whole seconds
     */
    public long runningTime() {
        // System.currentTimeMillis() counts milliseconds: one second is 1000 of them, not 1024
        return (System.currentTimeMillis() - start) / 1000L;
    }
    public void run() {
        this.start = System.currentTimeMillis();
        try {
            String targetstub = sourcefile.getName();
            targetstub = targetstub.substring(0, targetstub.length() - 8);
-            InputStream is = new FileInputStream(sourcefile);
+            InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1 * 1024 * 1024);
            if (sourcefile.getName().endsWith(".bz2")) {
                int b = is.read();
                if (b != 'B') throw new IOException("Invalid bz2 content.");
@ -113,7 +142,7 @@ public class mediawikiIndex extends Thread {
                if (b != 'Z') throw new IOException("Invalid bz2 content.");
                is = new CBZip2InputStream(is);
            }
-            BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024);
+            BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
            String t;
            StringBuilder sb = new StringBuilder();
            boolean page = false, text = false;