From 5fb77116c68386238daf96e06b2bdb3220621a91 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 8 May 2009 07:54:10 +0000
Subject: [PATCH] added a submenu to index administration to import a wikimedia
dump (i.e. a dump from wikipedia) into the YaCy index: see
http://localhost:8080/IndexImportWikimedia_p.html
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5930 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/IndexImportWikimedia_p.html | 67 +++++
htroot/IndexImportWikimedia_p.java | 78 +++++
.../templates/submenuIndexControl.template | 1 +
source/de/anomic/kelondro/text/IndexCell.java | 2 +-
.../anomic/plasma/plasmaParserDocument.java | 6 +-
source/de/anomic/tools/mediawikiIndex.java | 284 ++++++++++--------
6 files changed, 318 insertions(+), 120 deletions(-)
create mode 100644 htroot/IndexImportWikimedia_p.html
create mode 100644 htroot/IndexImportWikimedia_p.java
diff --git a/htroot/IndexImportWikimedia_p.html b/htroot/IndexImportWikimedia_p.html
new file mode 100644
index 000000000..104fb84f3
--- /dev/null
+++ b/htroot/IndexImportWikimedia_p.html
@@ -0,0 +1,67 @@
+
+
+
+ YaCy '#[clientname]#': Wikimedia Dump Import
+ #%env/templates/metas.template%#
+ #(import)#::#(/import)#
+
+
+ #%env/templates/header.template%#
+ #%env/templates/submenuIndexControl.template%#
+
Wikimedia Dump Import
+
+ #(import)#
+
#(status)#No import thread is running; you can start a new thread here::Bad input data: #[message]# #(/status)#
+
+
+ When the import is started, the following happens:
+
+
The dump is extracted on the fly and wiki entries are translated into Dublin Core data format. The output looks like this:
+
+ <?xml version="1.0" encoding="utf-8"?>
+<surrogates xmlns:dc="http://purl.org/dc/elements/1.1/">
+ <record>
+ <dc:Title><![CDATA[Alan Smithee]]></dc:Title>
+ <dc:Identifier>http://de.wikipedia.org/wiki/Alan%20Smithee</dc:Identifier>
+ <dc:Description><![CDATA[Der als Filmregisseur oft genannte Alan Smithee ist ein Anagramm]]></dc:Description>
+ <dc:Language>de</dc:Language>
+ <dc:Date>2009-05-07T06:03:48Z</dc:Date>
+ </record>
+ <record>
+ ...
+ </record>
+</surrogates>
+
+
+
Every 10000 wiki records are combined into one output file, which is first written to /DATA/SURROGATES/in as a temporary file.
+
When a generated output file is finished, it is renamed to a .xml file.
+
Each time an xml surrogate file appears in /DATA/SURROGATES/in, the YaCy indexer fetches the file and indexes the record entries.
+
When the indexing of a surrogate file is finished, the file is moved to /DATA/SURROGATES/out
+
You can recycle processed surrogate files by moving them from /DATA/SURROGATES/out to /DATA/SURROGATES/in
+
+
+ ::
+
+ #(/import)#
+
+ #%env/templates/footer.template%#
+
+
\ No newline at end of file
diff --git a/htroot/IndexImportWikimedia_p.java b/htroot/IndexImportWikimedia_p.java
new file mode 100644
index 000000000..3da6cdeed
--- /dev/null
+++ b/htroot/IndexImportWikimedia_p.java
@@ -0,0 +1,78 @@
+// IndexImportWikimedia_p.java
+// ---------------------------
+// (C) 2009 by Michael Peter Christen; mc@yacy.net
+// first published 04.05.2009 on http://yacy.net
+// Frankfurt, Germany
+//
+// $LastChangedDate: 2009-04-16 17:29:00 +0200 (Do, 16 Apr 2009) $
+// $LastChangedRevision: 5812 $
+// $LastChangedBy: orbiter $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import java.io.File;
+import java.net.MalformedURLException;
+
+import de.anomic.http.httpRequestHeader;
+import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+import de.anomic.tools.mediawikiIndex;
+
+/** Servlet behind IndexImportWikimedia_p.html: reports on, or starts, a wikimedia dump import. */
+public class IndexImportWikimedia_p {
+
+    /** Fills the page properties: reports a live mediawikiIndex.job, else starts one for a posted "file". */
+    public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
+        final serverObjects prop = new serverObjects();
+        final plasmaSwitchboard sb = (plasmaSwitchboard) env;
+        if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) {
+            // one import is running, no option to insert anything
+            prop.put("import", 1);
+            prop.put("import_thread", "running");
+            prop.put("import_count", mediawikiIndex.job.count);
+            return prop;
+        }
+        prop.put("import", 0);
+        if (post == null) {
+            prop.put("import_status", 0);
+        }
+        if (post == null || !post.containsKey("file")) {
+            return prop;
+        }
+        final File sourcefile = new File(post.get("file"));
+        final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
+        if (!name.endsWith("pages-articles.xml.bz2")) {
+            prop.put("import_status", 1);
+            prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'");
+            return prop;
+        }
+        // language = text before "wiki" (simplewiki -> simple, not "si"); '_' in dump names is '-' in host names
+        final int wikiPos = name.indexOf("wiki");
+        final String lang = (wikiPos > 0 ? name.substring(0, wikiPos) : name.substring(0, 2)).replace('_', '-');
+        try {
+            mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
+            mediawikiIndex.job.start();
+            prop.put("import", 1);
+            prop.put("import_thread", "started");
+            prop.put("import_count", 0);
+        } catch (MalformedURLException e) {
+            e.printStackTrace();
+            prop.put("import_status", 1);
+            prop.put("import_status_message", e.getMessage());
+        }
+        return prop;
+    }
+}
diff --git a/htroot/env/templates/submenuIndexControl.template b/htroot/env/templates/submenuIndexControl.template
index 599f91846..7b0759dad 100644
--- a/htroot/env/templates/submenuIndexControl.template
+++ b/htroot/env/templates/submenuIndexControl.template
@@ -6,5 +6,6 @@