enhanced the wikimedia dump import process

enhanced the wiki parser and condenser speed

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5931 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 5fb77116c6
commit 89aeb318d3

@@ -12,23 +12,23 @@
#(import)#
<p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportWikimedia_p.html" method="get" id="importwiki" accept-charset="UTF-8">
<form action="IndexImportWikimedia_p.html" method="get">
<!-- no post method here, we don't want to transmit the whole file, only the path-->
<fieldset>
<legend>Wikimedia Dump File Selection: select a 'bz2' file</legend>
You can import Wikipedia dumps here. An example is the file
<a href="http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2">
http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2</a>.
<br>
<br />
Dumps must be in XML format and compressed with bz2. Do not decompress the file after downloading!
<br>
<input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" accept="application/x-bzip2">
<br />
<input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" />
<input name="submit" type="submit" value="Import Wikimedia Dump" />
</fieldset>
</form>
<p>
When the import is started, the following happens:
<ul>
</p><ul>
<li>The dump is extracted on the fly and wiki entries are translated into Dublin Core data format. The output looks like this:
<pre>
&lt;?xml version="1.0" encoding="utf-8"?&gt;
@@ -52,14 +52,18 @@
<li>When a surrogate file is finished with indexing, it is moved to /DATA/SURROGATES/out</li>
<li>You can recycle processed surrogate files by moving them from /DATA/SURROGATES/out to /DATA/SURROGATES/in</li>
</ul>
</p>
<p></p>
::
<fieldset><legend>Import Process</legend>
<form><fieldset><legend>Import Process</legend>
<dl>
<dt>Thread: #[thread]#</dt>
<dt>Processed Wiki Entries: #[count]#</dt>
<dt>Thread:</dt><dd>#[thread]#</dd>
<dt>Dump:</dt><dd>#[dump]#</dd>
<dt>Processed:</dt><dd>#[count]# Wiki Entries</dd>
<dt>Speed:</dt><dd>#[speed]# articles per second</dd>
<dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
<dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
</dl>
</fieldset>
</fieldset></form>
#(/import)#
#%env/templates/footer.template%#
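
The template above describes the surrogate round trip: finished files land in /DATA/SURROGATES/out and can be re-queued by moving them back to /DATA/SURROGATES/in. A minimal sketch of that recycling step, assuming plain java.io file moves (class and method names are illustrative, not YaCy API):

    import java.io.File;

    // hypothetical helper: re-queue a processed surrogate file by moving it
    // from DATA/SURROGATES/out back to DATA/SURROGATES/in, as the template text describes
    public class SurrogateRecycler {
        public static boolean recycle(File dataRoot, String fileName) {
            File processed = new File(dataRoot, "SURROGATES/out/" + fileName);
            File queued = new File(dataRoot, "SURROGATES/in/" + fileName);
            // renameTo returns false instead of throwing when the move fails,
            // e.g. if the target file already exists
            return processed.renameTo(queued);
        }
    }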

@@ -41,7 +41,13 @@ public class IndexImportWikimedia_p {
// one import is running, no option to insert anything
prop.put("import", 1);
prop.put("import_thread", "running");
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_count", mediawikiIndex.job.count);
prop.put("import_speed", mediawikiIndex.job.speed());
prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
} else {
prop.put("import", 0);
if (post == null) {
@@ -62,7 +68,13 @@ public class IndexImportWikimedia_p {
mediawikiIndex.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
prop.put("import_runningMinutes", 0);
prop.put("import_remainingHours", 0);
prop.put("import_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import", 0);

@@ -548,6 +548,7 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
break;
}
element = dirElements.get(i);
if (element == null) continue;
//counting double headlines
doubles = 0;
for (int j = 0; j < i; j++) {

@@ -600,8 +600,12 @@ public final class Condenser {
}
static StringBuilder trim(StringBuilder sb) {
while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
int i = 0;
while (i < sb.length() && sb.charAt(i) <= ' ') i++;
if (i > 0) sb.delete(0, i);
i = sb.length() - 1;
while (i >= 0 && sb.charAt(i) <= ' ') i--;
// unconditional bulk delete: a no-op when nothing trails, and still trims
// when the last kept character sits at index 0
sb.delete(i + 1, sb.length());
return sb;
}
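
The rewritten trim scans for the first and last non-whitespace positions and issues at most two bulk delete calls, instead of one deleteCharAt per stripped character as before, so the work is linear in the string length rather than quadratic. A small standalone check of the in-place semantics (the demo class is illustrative):

    public class TrimDemo {
        static StringBuilder trim(StringBuilder sb) {
            int i = 0;
            while (i < sb.length() && sb.charAt(i) <= ' ') i++;
            if (i > 0) sb.delete(0, i);
            i = sb.length() - 1;
            while (i >= 0 && sb.charAt(i) <= ' ') i--;
            sb.delete(i + 1, sb.length());
            return sb;
        }
        public static void main(String[] args) {
            System.out.println("[" + trim(new StringBuilder("  hello  ")) + "]"); // [hello]
            System.out.println("[" + trim(new StringBuilder("a ")) + "]");        // [a]
            System.out.println("[" + trim(new StringBuilder("   ")) + "]");       // []
        }
    }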

@@ -508,14 +508,14 @@ public final class plasmaWordIndex {
} else {
// here we have three results: we can do a voting
if (language.equals(bymetadata)) {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
} else if (language.equals(entry.url().language())) {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
} else if (bymetadata.equals(entry.url().language())) {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
language = bymetadata;
} else {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
//System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
language = bymetadata;
}
}
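
The voting above resolves three language signals, the statistics result, the document metadata, and the URL's top-level domain, by a two-out-of-three rule: statistics win when any second source agrees, otherwise metadata is taken, both when metadata and TLD agree and as the final fallback. Restated as a pure function (names are illustrative, not YaCy API):

    static String voteLanguage(String byStatistics, String byMetadata, String byTLD) {
        if (byStatistics.equals(byMetadata)) return byStatistics; // confirmed by metadata
        if (byStatistics.equals(byTLD)) return byStatistics;      // confirmed by TLD
        return byMetadata; // metadata and TLD agree, or all three differ: trust metadata
    }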

@@ -82,30 +82,59 @@ public class mediawikiIndex extends Thread {
private wikiParser wparser;
private plasmaParser hparser;
private String urlStub;
private File sourcefile;
private File targetdir;
public File sourcefile;
public File targetdir;
public int count;
private long start;
private long docsize;
private int approxdocs;
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
this.urlStub = baseURL;
this.wparser = new wikiCode(new URL(baseURL).getHost());
this.hparser = new plasmaParser();
this.count = 0;
this.start = 0;
// must be called before usage:
plasmaParser.initHTMLParsableMimeTypes("text/html");
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
}
/**
* returns the average processing rate since the import started
* @return the number of articles per second
*/
public int speed() {
if (count == 0) return 0;
final long t = runningTime();
if (t == 0) return 0; // avoid division by zero right after start
return (int) (count / t);
}
/**
* estimates how long the import will still take
* @return the remaining time in seconds
*/
public long remainingTime() {
final int perSecond = speed();
if (perSecond == 0) return 0; // no estimate until a rate is known
return Math.max(0, this.approxdocs - count) / perSecond;
}
public long runningTime() {
return (System.currentTimeMillis() - start) / 1000; // milliseconds to seconds
}
public void run() {
this.start = System.currentTimeMillis();
try {
String targetstub = sourcefile.getName();
targetstub = targetstub.substring(0, targetstub.length() - 8);
InputStream is = new FileInputStream(sourcefile);
InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1 * 1024 * 1024);
if (sourcefile.getName().endsWith(".bz2")) {
int b = is.read();
if (b != 'B') throw new IOException("Invalid bz2 content.");
@@ -113,7 +142,7 @@ public class mediawikiIndex extends Thread {
if (b != 'Z') throw new IOException("Invalid bz2 content.");
is = new CBZip2InputStream(is);
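// CBZip2InputStream expects the stream to be positioned after the two
// 'B','Z' magic bytes, hence the explicit check above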
}
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024);
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
String t;
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;
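
The progress figures rest on the docspermbinxmlbz2 = 800 heuristic above: the article total is approximated from the compressed dump size, and the remaining time divides the outstanding articles by the measured rate. A worked sketch of that estimate, with made-up dump size and throughput numbers:

    public class EtaDemo {
        private static final int DOCS_PER_MB = 800; // the commit's articles-per-megabyte guess
        public static void main(String[] args) {
            long dumpBytes = 4L * 1024 * 1024 * 1024; // a 4 GB xml.bz2 dump
            int approxDocs = (int) (dumpBytes * DOCS_PER_MB / 1024L / 1024L); // 4096 MB * 800 = 3276800
            int processed = 500000;
            int perSecond = 400; // assumed articles per second
            long remaining = Math.max(0, approxDocs - processed) / perSecond; // 6942 seconds
            System.out.println((remaining / 60) / 60 + " hours, "
                    + (remaining / 60) % 60 + " minutes left"); // 1 hours, 55 minutes left
        }
    }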
