added a submenu to index administration to import a wikimedia dump (i.e. a dump from wikipedia) into the YaCy index: see
http://localhost:8080/IndexImportWikimedia_p.html

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5930 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent df733af4fa
commit 5fb77116c6
IndexImportWikimedia_p.html
@@ -0,0 +1,67 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>YaCy '#[clientname]#': Wikimedia Dump Import</title>
  #%env/templates/metas.template%#
  #(import)#::<meta http-equiv="REFRESH" content="10" />#(/import)#
</head>
<body id="IndexImportWikimedia">
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<h2>Wikimedia Dump Import</h2>

#(import)#
<p>#(status)#No import thread is running; you can start a new import here::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportWikimedia_p.html" method="get" id="importwiki" accept-charset="UTF-8">
  <!-- no POST method here: we do not want to transmit the whole file, only its path -->
  <fieldset>
    <legend>Wikimedia Dump File Selection: select a 'bz2' file</legend>
    You can import Wikipedia dumps here. An example is the file
    <a href="http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2">
    http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2</a>.
    <br />
    Dumps must be in XML format and compressed with bz2. Do not decompress the file after downloading!
    <br />
    <input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" accept="application/x-bzip2" />
    <input name="submit" type="submit" value="Import Wikimedia Dump" />
  </fieldset>
</form>
<p>When the import is started, the following happens:</p>
<ul>
  <li>The dump is extracted on the fly and the wiki entries are translated into the Dublin Core data format. The output looks like this:
<pre>
<?xml version="1.0" encoding="utf-8"?>
<surrogates xmlns:dc="http://purl.org/dc/elements/1.1/">
  <record>
    <dc:Title><![CDATA[Alan Smithee]]></dc:Title>
    <dc:Identifier>http://de.wikipedia.org/wiki/Alan%20Smithee</dc:Identifier>
    <dc:Description><![CDATA[Der als Filmregisseur oft genannte Alan Smithee ist ein Anagramm]]></dc:Description>
    <dc:Language>de</dc:Language>
    <dc:Date>2009-05-07T06:03:48Z</dc:Date>
  </record>
  <record>
    ...
  </record>
</surrogates>
</pre>
  </li>
  <li>Every 10,000 wiki records are combined into one output file, which is first written as a temporary file to /DATA/SURROGATES/in.</li>
  <li>When a generated output file is complete, it is renamed to a .xml file.</li>
  <li>Each time an .xml surrogate file appears in /DATA/SURROGATES/in, the YaCy indexer fetches the file and indexes its record entries.</li>
  <li>When the indexing of a surrogate file is finished, the file is moved to /DATA/SURROGATES/out.</li>
  <li>You can recycle processed surrogate files by moving them from /DATA/SURROGATES/out back to /DATA/SURROGATES/in.</li>
</ul>
::
<fieldset><legend>Import Process</legend>
  <dl>
    <dt>Thread: #[thread]#</dt>
    <dt>Processed Wiki Entries: #[count]#</dt>
  </dl>
</fieldset>
#(/import)#

#%env/templates/footer.template%#
</body>
</html>
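The help text above describes a write-then-rename convention for the surrogate output: each chunk of converted records is first written to a temporary file in DATA/SURROGATES/in and is only renamed to a .xml file once it is complete, so the indexer never picks up a half-written surrogate. The following is a minimal sketch of that pattern, not the actual mediawikiIndex implementation; the class name, the .prt temporary suffix and the pre-serialized record strings are assumptions made only for illustration.

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.List;

// Illustrative sketch of the chunked surrogate output described in the help text.
// Names, the .prt suffix and the record format are simplified; the real converter
// lives in de.anomic.tools.mediawikiIndex.
public class SurrogateChunkWriterSketch {

    public static void writeChunk(final File surrogatesInPath, final int chunkNumber, final List<String> recordXml)
            throws FileNotFoundException {
        // write to a temporary file first so the indexer ignores the chunk while it is incomplete
        final File tmp = new File(surrogatesInPath, "dewiki-chunk-" + chunkNumber + ".prt");
        final PrintWriter out = new PrintWriter(tmp);
        out.println("<?xml version=\"1.0\" encoding=\"utf-8\"?>");
        out.println("<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">");
        for (final String record : recordXml) out.println(record);
        out.println("</surrogates>");
        out.close();

        // renaming to .xml marks the chunk as finished; the indexer only fetches *.xml files
        final File done = new File(surrogatesInPath, "dewiki-chunk-" + chunkNumber + ".xml");
        tmp.renameTo(done);
    }
}

In the real importer a chunk holds 10,000 converted wiki records, and a chunk that has been indexed is moved on to DATA/SURROGATES/out.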
IndexImportWikimedia_p.java
@@ -0,0 +1,78 @@
// IndexImportWikimedia.java
// -------------------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 04.05.2009 on http://yacy.net
// Frankfurt, Germany
//
// $LastChangedDate: 2009-04-16 17:29:00 +0200 (Do, 16 Apr 2009) $
// $LastChangedRevision: 5812 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import java.io.File;
import java.net.MalformedURLException;

import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.mediawikiIndex;

public class IndexImportWikimedia_p {

    public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
        final serverObjects prop = new serverObjects();
        final plasmaSwitchboard sb = (plasmaSwitchboard) env;

        if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) {
            // an import is already running; only report its status, do not offer the form
            prop.put("import", 1);
            prop.put("import_thread", "running");
            prop.put("import_count", mediawikiIndex.job.count);
        } else {
            prop.put("import", 0);
            if (post == null) {
                // no parameters given: show the file selection form
                prop.put("import_status", 0);
            } else {
                if (post.containsKey("file")) {
                    final File sourcefile = new File(post.get("file"));
                    final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
                    if (!name.endsWith("pages-articles.xml.bz2")) {
                        prop.put("import", 0);
                        prop.put("import_status", 1);
                        prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'");
                        return prop;
                    }
                    // derive the wiki language from the dump file name prefix, e.g. "de" or "en"
                    final String lang = name.substring(0, 2);
                    try {
                        mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
                        mediawikiIndex.job.start();
                        prop.put("import", 1);
                        prop.put("import_thread", "started");
                        prop.put("import_count", 0);
                    } catch (final MalformedURLException e) {
                        e.printStackTrace();
                        prop.put("import", 0);
                        prop.put("import_status", 1);
                        prop.put("import_status_message", e.getMessage());
                    }
                }
                return prop;
            }
        }
        return prop;
    }
}
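Processed surrogate files end up in DATA/SURROGATES/out and, as the help text explains, can be run through the indexer again by moving them back to DATA/SURROGATES/in. Below is a small sketch of that recycling step; the class name and the use of a main method are illustrative only, and the relative DATA paths are assumed to be resolved against the YaCy working directory.

import java.io.File;

// Sketch of the recycling step from the help text: move already processed
// surrogate files from DATA/SURROGATES/out back to DATA/SURROGATES/in so
// the indexer picks them up again. Paths are assumptions for this example.
public class RecycleSurrogatesSketch {

    public static void main(final String[] args) {
        final File out = new File("DATA/SURROGATES/out");
        final File in  = new File("DATA/SURROGATES/in");
        final File[] processed = out.listFiles();
        if (processed == null) return; // directory missing or not readable
        for (final File f : processed) {
            if (!f.getName().endsWith(".xml")) continue;  // only finished surrogate files
            f.renameTo(new File(in, f.getName()));        // the indexer will re-index this file
        }
    }
}

An import can also be started without the form by requesting the servlet directly with the dump path as a GET parameter, for example http://localhost:8080/IndexImportWikimedia_p.html?file=DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2.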