From 89aeb318d3bdf7c61f8d735d8488de1925485b46 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 8 May 2009 10:36:13 +0000
Subject: [PATCH] enhanced the wikimedia dump import process; enhanced the wiki
parser and condenser speed
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5931 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/IndexImportWikimedia_p.html | 24 +++++++-----
htroot/IndexImportWikimedia_p.java | 12 ++++++
source/de/anomic/data/wiki/wikiCode.java | 1 +
source/de/anomic/plasma/parser/Condenser.java | 8 +++-
source/de/anomic/plasma/plasmaWordIndex.java | 8 ++--
source/de/anomic/tools/mediawikiIndex.java | 37 +++++++++++++++++--
6 files changed, 70 insertions(+), 20 deletions(-)
diff --git a/htroot/IndexImportWikimedia_p.html b/htroot/IndexImportWikimedia_p.html
index 104fb84f3..853bfb85c 100644
--- a/htroot/IndexImportWikimedia_p.html
+++ b/htroot/IndexImportWikimedia_p.html
@@ -12,23 +12,23 @@
#(import)#
#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#
-
When the import is started, the following happens:
-
-
+
::
-
#(/import)#
#%env/templates/footer.template%#
diff --git a/htroot/IndexImportWikimedia_p.java b/htroot/IndexImportWikimedia_p.java
index 3da6cdeed..23c9695c1 100644
--- a/htroot/IndexImportWikimedia_p.java
+++ b/htroot/IndexImportWikimedia_p.java
@@ -41,7 +41,13 @@ public class IndexImportWikimedia_p {
// one import is running, no option to insert anything
prop.put("import", 1);
prop.put("import_thread", "running");
+ prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_count", mediawikiIndex.job.count);
+ prop.put("import_speed", mediawikiIndex.job.speed());
+ prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
+ prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
+ prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
+ prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
} else {
prop.put("import", 0);
if (post == null) {
@@ -62,7 +68,13 @@ public class IndexImportWikimedia_p {
mediawikiIndex.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
+ prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_count", 0);
+ prop.put("import_speed", 0);
+ prop.put("import_runningHours", 0);
+ prop.put("import_runningMinutes", 0);
+ prop.put("import_remainingHours", 0);
+ prop.put("import_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import", 0);
diff --git a/source/de/anomic/data/wiki/wikiCode.java b/source/de/anomic/data/wiki/wikiCode.java
index 2e41b574d..71b03daee 100644
--- a/source/de/anomic/data/wiki/wikiCode.java
+++ b/source/de/anomic/data/wiki/wikiCode.java
@@ -548,6 +548,7 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
break;
}
element = dirElements.get(i);
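+ // skip missing directory entries to avoid a NullPointerException in the headline counting below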
+ if (element == null) continue;
//counting double headlines
doubles = 0;
for (int j = 0; j < i; j++) {
diff --git a/source/de/anomic/plasma/parser/Condenser.java b/source/de/anomic/plasma/parser/Condenser.java
index a3aa6a3fc..fa16aff8c 100644
--- a/source/de/anomic/plasma/parser/Condenser.java
+++ b/source/de/anomic/plasma/parser/Condenser.java
@@ -600,8 +600,12 @@ public final class Condenser {
}
static StringBuilder trim(StringBuilder sb) {
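+ // find the whitespace boundaries first and cut each side with a single delete() call,
+ // instead of shifting the buffer once per character with deleteCharAt()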
- while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
- while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
+ int i = 0;
+ while (i < sb.length() && sb.charAt(i) <= ' ') i++;
+ if (i > 0) sb.delete(0, i);
+ i = sb.length() - 1;
+ while (i >= 0 && i < sb.length() && sb.charAt(i) <= ' ') i--;
+ if (i > 0) sb.delete(i + 1, sb.length());
return sb;
}
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index e0e0b8a1b..a3e0f3205 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -508,14 +508,14 @@ public final class plasmaWordIndex {
} else {
// here we have three results: we can do a voting
if (language.equals(bymetadata)) {
- System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
+ //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
} else if (language.equals(entry.url().language())) {
- System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
+ //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
} else if (bymetadata.equals(entry.url().language())) {
- System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
+ //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
language = bymetadata;
} else {
- System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
+ //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
language = bymetadata;
}
}
diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java
index 5722fbbec..19d3eed21 100644
--- a/source/de/anomic/tools/mediawikiIndex.java
+++ b/source/de/anomic/tools/mediawikiIndex.java
@@ -82,30 +82,59 @@ public class mediawikiIndex extends Thread {
private wikiParser wparser;
private plasmaParser hparser;
private String urlStub;
- private File sourcefile;
- private File targetdir;
+ public File sourcefile;
+ public File targetdir;
public int count;
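+ // progress accounting: start time of the import, dump size in bytes and an estimated total article count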
+ private long start;
+ private long docsize;
+ private int approxdocs;
+
+ private static final int docspermbinxmlbz2 = 800; // documents per megabyte in an xml.bz2 wikimedia dump
public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
this.sourcefile = sourcefile;
+ this.docsize = sourcefile.length();
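+ // rough article-count estimate from the dump size: e.g. a 2048 MB xml.bz2 dump yields about 2048 * 800 = 1,638,400 articles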
+ this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
this.urlStub = baseURL;
this.wparser = new wikiCode(new URL(baseURL).getHost());
this.hparser = new plasmaParser();
this.count = 0;
+ this.start = 0;
// must be called before usage:
plasmaParser.initHTMLParsableMimeTypes("text/html");
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
}
+ /**
+ * return the number of articles per second
+ * @return
+ */
+ public int speed() {
+ if (count == 0) return 0;
+ final long t = runningTime();
+ if (t == 0) return count; // less than one second elapsed; report the raw count as the rate
+ return (int) (count / t);
+ }
+
+ /**
+ * return the remaining time for the completion of all records, in seconds
+ * @return
+ */
+ public long remainingTime() {
+ final int s = speed();
+ if (s == 0) return 0;
+ return Math.max(0, this.approxdocs - count) / s;
+ }
+
+ public long runningTime() {
+ // elapsed time since run() started, converted from milliseconds to seconds
+ return (System.currentTimeMillis() - start) / 1000;
+ }
public void run() {
+ this.start = System.currentTimeMillis();
try {
String targetstub = sourcefile.getName();
targetstub = targetstub.substring(0, targetstub.length() - 8);
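+ // read the dump through a 1 MB buffer so the file is pulled from disk in large blocks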
- InputStream is = new FileInputStream(sourcefile);
+ InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1 * 1024 * 1024);
if (sourcefile.getName().endsWith(".bz2")) {
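+ // consume and verify the two-byte 'BZ' magic; the stream handed to CBZip2InputStream then starts at the compressed data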
int b = is.read();
if (b != 'B') throw new IOException("Invalid bz2 content.");
@@ -113,7 +142,7 @@ public class mediawikiIndex extends Thread {
if (b != 'Z') throw new IOException("Invalid bz2 content.");
is = new CBZip2InputStream(is);
}
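+ // the byte stream is already buffered above, so a 4 MB character buffer for the reader is sufficient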
- BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024);
+ BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
String t;
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;