From 89aeb318d3bdf7c61f8d735d8488de1925485b46 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 8 May 2009 10:36:13 +0000
Subject: [PATCH] enhanced the wikimedia dump import process; enhanced the wiki
parser and condenser speed
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5931 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/IndexImportWikimedia_p.html | 24 +++++++-----
htroot/IndexImportWikimedia_p.java | 12 ++++++
source/de/anomic/data/wiki/wikiCode.java | 1 +
source/de/anomic/plasma/parser/Condenser.java | 8 +++-
source/de/anomic/plasma/plasmaWordIndex.java | 8 ++--
source/de/anomic/tools/mediawikiIndex.java | 37 +++++++++++++++++--
6 files changed, 70 insertions(+), 20 deletions(-)
diff --git a/htroot/IndexImportWikimedia_p.html b/htroot/IndexImportWikimedia_p.html
index 104fb84f3..853bfb85c 100644
--- a/htroot/IndexImportWikimedia_p.html
+++ b/htroot/IndexImportWikimedia_p.html
@@ -12,23 +12,23 @@
#(import)#
#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#
-
When the import is started, the following happens:
-
-
+
::
-
#(/import)#
#%env/templates/footer.template%#
diff --git a/htroot/IndexImportWikimedia_p.java b/htroot/IndexImportWikimedia_p.java
index 3da6cdeed..23c9695c1 100644
--- a/htroot/IndexImportWikimedia_p.java
+++ b/htroot/IndexImportWikimedia_p.java
@@ -41,7 +41,13 @@ public class IndexImportWikimedia_p {
// one import is running, no option to insert anything
prop.put("import", 1);
prop.put("import_thread", "running");
+ prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_count", mediawikiIndex.job.count);
+ prop.put("import_speed", mediawikiIndex.job.speed());
+ prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
+ prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
+ prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
+ prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
} else {
prop.put("import", 0);
if (post == null) {
@@ -62,7 +68,13 @@ public class IndexImportWikimedia_p {
mediawikiIndex.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
+ prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_count", 0);
+ prop.put("import_speed", 0);
+ prop.put("import_runningHours", 0);
+ prop.put("import_runningMinutes", 0);
+ prop.put("import_remainingHours", 0);
+ prop.put("import_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import", 0);
diff --git a/source/de/anomic/data/wiki/wikiCode.java b/source/de/anomic/data/wiki/wikiCode.java
index 2e41b574d..71b03daee 100644
--- a/source/de/anomic/data/wiki/wikiCode.java
+++ b/source/de/anomic/data/wiki/wikiCode.java
@@ -548,6 +548,7 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
break;
}
element = dirElements.get(i);
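+ // skip missing directory entries to avoid a NullPointerException in the headline counting below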
+ if (element == null) continue;
//counting double headlines
doubles = 0;
for (int j = 0; j < i; j++) {
diff --git a/source/de/anomic/plasma/parser/Condenser.java b/source/de/anomic/plasma/parser/Condenser.java
index a3aa6a3fc..fa16aff8c 100644
--- a/source/de/anomic/plasma/parser/Condenser.java
+++ b/source/de/anomic/plasma/parser/Condenser.java
@@ -600,8 +600,12 @@ public final class Condenser {
}
static StringBuilder trim(StringBuilder sb) {
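+ // find the whitespace boundaries first and cut each side with a single delete() call,
+ // instead of shifting the buffer once per character with deleteCharAt()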
- while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
- while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
+ int i = 0;
+ while (i < sb.length() && sb.charAt(i) <= ' ') i++;
+ if (i > 0) sb.delete(0, i);
+ i = sb.length() - 1;
+ while (i >= 0 && i < sb.length() && sb.charAt(i) <= ' ') i--;
+ if (i > 0) sb.delete(i + 1, sb.length());
return sb;
}
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index e0e0b8a1b..a3e0f3205 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -508,14 +508,14 @@ public final class plasmaWordIndex {
} else {
// here we have three results: we can do a voting
if (language.equals(bymetadata)) {
- System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
+ //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
} else if (language.equals(entry.url().language())) {
- System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
+ //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
} else if (bymetadata.equals(entry.url().language())) {
- System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
+ //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
language = bymetadata;
} else {
- System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
+ //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
language = bymetadata;
}
}
diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java
index 5722fbbec..19d3eed21 100644
--- a/source/de/anomic/tools/mediawikiIndex.java
+++ b/source/de/anomic/tools/mediawikiIndex.java
@@ -82,30 +82,59 @@ public class mediawikiIndex extends Thread {
private wikiParser wparser;
private plasmaParser hparser;
private String urlStub;
- private File sourcefile;
- private File targetdir;
+ public File sourcefile;
+ public File targetdir;
public int count;
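+ // progress accounting: start time of the import, dump size in bytes and an estimated total article count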
+ private long start;
+ private long docsize;
+ private int approxdocs;
+
+ private static final int docspermbinxmlbz2 = 800; // documents per megabyte in an xml.bz2 wikimedia dump
public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
this.sourcefile = sourcefile;
+ this.docsize = sourcefile.length();
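+ // rough article-count estimate from the dump size: e.g. a 2048 MB xml.bz2 dump yields about 2048 * 800 = 1,638,400 articles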
+ this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
this.urlStub = baseURL;
this.wparser = new wikiCode(new URL(baseURL).getHost());
this.hparser = new plasmaParser();
this.count = 0;
+ this.start = 0;
// must be called before usage:
plasmaParser.initHTMLParsableMimeTypes("text/html");
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
}
+ /**
+ * return the number of articles per second
+ * @return
+ */
+ public int speed() {
+ if (count == 0) return 0;
+ final long t = runningTime();
+ if (t == 0) return count; // less than one second elapsed; report the raw count as the rate
+ return (int) (count / t);
+ }
+
+ /**
+ * return the remaining time for the completion of all records, in seconds
+ * @return
+ */
+ public long remainingTime() {
+ final int s = speed();
+ if (s == 0) return 0;
+ return Math.max(0, this.approxdocs - count) / s;
+ }
+
+ public long runningTime() {
+ // elapsed time since run() started, converted from milliseconds to seconds
+ return (System.currentTimeMillis() - start) / 1000;
+ }
public void run() {
+ this.start = System.currentTimeMillis();
try {
String targetstub = sourcefile.getName();
targetstub = targetstub.substring(0, targetstub.length() - 8);
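+ // read the dump through a 1 MB buffer so the file is pulled from disk in large blocks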
- InputStream is = new FileInputStream(sourcefile);
+ InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1 * 1024 * 1024);
if (sourcefile.getName().endsWith(".bz2")) {
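+ // consume and verify the two-byte 'BZ' magic; the stream handed to CBZip2InputStream then starts at the compressed data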
int b = is.read();
if (b != 'B') throw new IOException("Invalid bz2 content.");
@@ -113,7 +142,7 @@ public class mediawikiIndex extends Thread {
if (b != 'Z') throw new IOException("Invalid bz2 content.");
is = new CBZip2InputStream(is);
}
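+ // the byte stream is already buffered above, so a 4 MB character buffer for the reader is sufficient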
- BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024);
+ BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
String t;
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;