enhanced the wikimedia dump import process

enhanced the wiki parser and condenser speed

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5931 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 5fb77116c6
commit 89aeb318d3

@@ -12,23 +12,23 @@
 #(import)#
 <p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
-<form action="IndexImportWikimedia_p.html" method="get" id="importwiki" accept-charset="UTF-8">
+<form action="IndexImportWikimedia_p.html" method="get">
 <!-- no post method here, we don't want to transmit the whole file, only the path-->
 <fieldset>
 <legend>Wikimedia Dump File Selection: select a 'bz2' file</legend>
 You can import Wikipedia dumps here. An example is the file
 <a href="http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2">
 http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2</a>.
-<br>
+<br />
 Dumps must be in XML format and must be encoded in bz2. Do not decompress the file after downloading!
-<br>
-<input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" accept="application/x-bzip2">
+<br />
+<input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" />
 <input name="submit" type="submit" value="Import Wikimedia Dump" />
 </fieldset>
 </form>
 <p>
 When the import is started, the following happens:
-<ul>
+</p><ul>
 <li>The dump is extracted on the fly and wiki entries are translated into Dublin Core data format. The output looks like this:
 <pre>
 &lt;?xml version="1.0" encoding="utf-8"?&gt;
@@ -52,14 +52,18 @@
 <li>When a surrogate file is finished with indexing, it is moved to /DATA/SURROGATES/out</li>
 <li>You can recycle processed surrogate files by moving them from /DATA/SURROGATES/out to /DATA/SURROGATES/in</li>
 </ul>
-</p>
+<p></p>
 ::
-<fieldset><legend>Import Process</legend>
+<form><fieldset><legend>Import Process</legend>
 <dl>
-<dt>Thread: #[thread]#</dt>
-<dt>Processed Wiki Entries: #[count]#</dt>
+<dt>Thread:</dt><dd>#[thread]#</dd>
+<dt>Dump:</dt><dd>#[dump]#</dd>
+<dt>Processed:</dt><dd>#[count]# Wiki Entries</dd>
+<dt>Speed:</dt><dd>#[speed]# articles per second</dd>
+<dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
+<dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
 </dl>
-</fieldset>
+</fieldset></form>
 #(/import)#
 #%env/templates/footer.template%#
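
The template above documents the surrogate workflow: finished surrogate files land in /DATA/SURROGATES/out and can be re-indexed by moving them back to /DATA/SURROGATES/in. A minimal standalone sketch of that recycling step, assuming the paths shown above; the class name and the move loop are illustrative and not part of this commit:

import java.io.File;

public class RecycleSurrogates {
    public static void main(String[] args) {
        // hypothetical DATA root; adjust to the actual YaCy installation path
        File out = new File("DATA/SURROGATES/out");
        File in  = new File("DATA/SURROGATES/in");
        File[] finished = out.listFiles();
        if (finished == null) return; // directory missing or not readable
        for (File f : finished) {
            // moving a processed surrogate back to /in causes it to be indexed again
            boolean ok = f.renameTo(new File(in, f.getName()));
            if (!ok) System.err.println("could not move " + f.getName());
        }
    }
}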

@@ -41,7 +41,13 @@ public class IndexImportWikimedia_p {
             // one import is running, no option to insert anything
             prop.put("import", 1);
             prop.put("import_thread", "running");
+            prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
             prop.put("import_count", mediawikiIndex.job.count);
+            prop.put("import_speed", mediawikiIndex.job.speed());
+            prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
+            prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
+            prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
+            prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
         } else {
             prop.put("import", 0);
             if (post == null) {
@@ -62,7 +68,13 @@ public class IndexImportWikimedia_p {
                     mediawikiIndex.job.start();
                     prop.put("import", 1);
                     prop.put("import_thread", "started");
+                    prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
                     prop.put("import_count", 0);
+                    prop.put("import_speed", 0);
+                    prop.put("import_runningHours", 0);
+                    prop.put("import_runningMinutes", 0);
+                    prop.put("import_remainingHours", 0);
+                    prop.put("import_remainingMinutes", 0);
                 } catch (MalformedURLException e) {
                     e.printStackTrace();
                     prop.put("import", 0);
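
The servlet fills the new runningHours/runningMinutes and remainingHours/remainingMinutes placeholders by splitting a duration given in seconds with integer division and modulo, as in the hunk above. A self-contained sketch of that conversion, with an illustrative class name:

public class DurationSplit {
    /** split a duration in seconds into whole hours and the remaining minutes */
    static int[] hoursAndMinutes(long seconds) {
        long minutesTotal = seconds / 60;   // drop the seconds part
        long hours = minutesTotal / 60;     // whole hours
        long minutes = minutesTotal % 60;   // minutes left over after the hours
        return new int[] { (int) hours, (int) minutes };
    }

    public static void main(String[] args) {
        int[] hm = hoursAndMinutes(5025);   // 1 hour, 23 minutes (the 45 seconds are dropped)
        System.out.println(hm[0] + " hours, " + hm[1] + " minutes");
    }
}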

@@ -548,6 +548,7 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
                 break;
             }
             element = dirElements.get(i);
+            if (element == null) continue;
             //counting double headlines
             doubles = 0;
             for (int j = 0; j < i; j++) {

@@ -600,8 +600,12 @@ public final class Condenser {
     }
     static StringBuilder trim(StringBuilder sb) {
-        while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
-        while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
+        int i = 0;
+        while (i < sb.length() && sb.charAt(i) <= ' ') i++;
+        if (i > 0) sb.delete(0, i);
+        i = sb.length() - 1;
+        while (i >= 0 && i < sb.length() && sb.charAt(i) <= ' ') i--;
+        if (i > 0) sb.delete(i + 1, sb.length());
         return sb;
     }
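
The rewritten Condenser.trim locates the first and last non-blank characters and then issues at most one delete per end, instead of calling deleteCharAt in a loop, where every call shifts the remaining buffer contents. A small sketch of an equivalent helper and its behaviour; the class name and the slightly simplified bounds checks are mine, not YaCy code:

public class TrimSketch {
    /** same idea as the rewritten Condenser.trim: one delete per end instead of a deleteCharAt loop */
    static StringBuilder trim(StringBuilder sb) {
        int i = 0;
        while (i < sb.length() && sb.charAt(i) <= ' ') i++;   // index of the first non-blank character
        if (i > 0) sb.delete(0, i);                           // cut the whole leading run in one call
        i = sb.length() - 1;
        while (i >= 0 && sb.charAt(i) <= ' ') i--;            // index of the last non-blank character
        sb.delete(i + 1, sb.length());                        // cut the whole trailing run in one call
        return sb;
    }

    public static void main(String[] args) {
        System.out.println("[" + trim(new StringBuilder("  hello world   ")) + "]"); // prints [hello world]
    }
}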

@@ -508,14 +508,14 @@ public final class plasmaWordIndex {
             } else {
                 // here we have three results: we can do a voting
                 if (language.equals(bymetadata)) {
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
+                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
                 } else if (language.equals(entry.url().language())) {
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
+                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
                 } else if (bymetadata.equals(entry.url().language())) {
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
+                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
                     language = bymetadata;
                 } else {
-                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
+                    //System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
                     language = bymetadata;
                 }
             }
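
The debug output silenced above belongs to a three-way language vote: the statistical guess is kept if either the document metadata or the URL's TLD confirms it, and otherwise the metadata language wins. A condensed sketch of that decision rule as a standalone method, with illustrative names rather than the plasmaWordIndex API:

public class LanguageVote {
    /**
     * three-way vote: keep the statistical guess if metadata or TLD confirms it,
     * otherwise fall back to the metadata language
     */
    static String vote(String byStatistics, String byMetadata, String byTLD) {
        if (byStatistics.equals(byMetadata)) return byStatistics; // confirmed by the metadata
        if (byStatistics.equals(byTLD)) return byStatistics;      // confirmed by the TLD
        return byMetadata;                                        // conflicting: trust the metadata
    }

    public static void main(String[] args) {
        System.out.println(vote("de", "en", "en")); // prints "en": metadata and TLD outvote the statistics
    }
}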

@@ -82,30 +82,59 @@ public class mediawikiIndex extends Thread {
     private wikiParser wparser;
     private plasmaParser hparser;
     private String urlStub;
-    private File sourcefile;
-    private File targetdir;
+    public File sourcefile;
+    public File targetdir;
     public int count;
+    private long start;
+    private long docsize;
+    private int approxdocs;
+    private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
     public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
     public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
         this.sourcefile = sourcefile;
+        this.docsize = sourcefile.length();
+        this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
         this.targetdir = targetdir;
         this.urlStub = baseURL;
         this.wparser = new wikiCode(new URL(baseURL).getHost());
         this.hparser = new plasmaParser();
         this.count = 0;
+        this.start = 0;
         // must be called before usage:
         plasmaParser.initHTMLParsableMimeTypes("text/html");
         plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
     }
+    /**
+     * return the number of articles per second
+     * @return
+     */
+    public int speed() {
+        if (count == 0) return 0;
+        return (int) ((long) count / runningTime());
+    }
+    /**
+     * return the remaining seconds for the completion of all records in milliseconds
+     * @return
+     */
+    public long remainingTime() {
+        return Math.max(0, this.approxdocs - count) / speed();
+    }
+    public long runningTime() {
+        return (System.currentTimeMillis() - start) / 1024;
+    }
     public void run() {
+        this.start = System.currentTimeMillis();
         try {
             String targetstub = sourcefile.getName();
             targetstub = targetstub.substring(0, targetstub.length() - 8);
-            InputStream is = new FileInputStream(sourcefile);
+            InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1 * 1024 * 1024);
             if (sourcefile.getName().endsWith(".bz2")) {
                 int b = is.read();
                 if (b != 'B') throw new IOException("Invalid bz2 content.");
@@ -113,7 +142,7 @@ public class mediawikiIndex extends Thread {
                 if (b != 'Z') throw new IOException("Invalid bz2 content.");
                 is = new CBZip2InputStream(is);
             }
-            BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024);
+            BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
             String t;
             StringBuilder sb = new StringBuilder();
             boolean page = false, text = false;
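
The progress figures added in mediawikiIndex rest on a simple heuristic: an xml.bz2 wikimedia dump is assumed to hold roughly 800 articles per megabyte (docspermbinxmlbz2), so the article total can be estimated from the file size, and the remaining time from the measured articles-per-second rate. A standalone sketch of that estimate under the same assumption; the class name is illustrative, and the sketch rounds milliseconds to seconds and guards against a zero rate:

public class ImportEstimate {
    static final int DOCS_PER_MB = 800;        // assumed articles per megabyte of xml.bz2 dump

    final int approxDocs;                      // estimated total number of articles in the dump
    long startMillis = System.currentTimeMillis();
    int count = 0;                             // articles processed so far

    ImportEstimate(long dumpSizeBytes) {
        this.approxDocs = (int) (dumpSizeBytes * DOCS_PER_MB / 1024L / 1024L);
    }

    long runningSeconds() {
        return Math.max(1, (System.currentTimeMillis() - startMillis) / 1000);
    }

    int articlesPerSecond() {
        return (int) (count / runningSeconds());
    }

    /** remaining seconds; returns -1 while no rate is measurable yet */
    long remainingSeconds() {
        int speed = articlesPerSecond();
        if (speed == 0) return -1;
        return Math.max(0, approxDocs - count) / speed;
    }
}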
