From 5fb77116c68386238daf96e06b2bdb3220621a91 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 8 May 2009 07:54:10 +0000 Subject: [PATCH] added a submenu to index administration to import a wikimedia dump (i.e. a dump from wikipedia) into the YaCy index: see http://localhost:8080/IndexImportWikimedia_p.html git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5930 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexImportWikimedia_p.html | 67 +++++ htroot/IndexImportWikimedia_p.java | 78 +++++ .../templates/submenuIndexControl.template | 1 + source/de/anomic/kelondro/text/IndexCell.java | 2 +- .../anomic/plasma/plasmaParserDocument.java | 6 +- source/de/anomic/tools/mediawikiIndex.java | 284 ++++++++++-------- 6 files changed, 318 insertions(+), 120 deletions(-) create mode 100644 htroot/IndexImportWikimedia_p.html create mode 100644 htroot/IndexImportWikimedia_p.java diff --git a/htroot/IndexImportWikimedia_p.html b/htroot/IndexImportWikimedia_p.html new file mode 100644 index 000000000..104fb84f3 --- /dev/null +++ b/htroot/IndexImportWikimedia_p.html @@ -0,0 +1,67 @@ + + + + YaCy '#[clientname]#': Wikimedia Dump Import + #%env/templates/metas.template%# + #(import)#::#(/import)# + + + #%env/templates/header.template%# + #%env/templates/submenuIndexControl.template%# +

Wikimedia Dump Import

+ + #(import)# +

#(status)#No import thread is running; you can start a new thread here::Bad input data: #[message]# #(/status)#

+
+ +
+ Wikimedia Dump File Selection: select a 'bz2' file
+ You can import Wikipedia dumps here. An example is the file
+ http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2.
+ Dumps must be in XML format and must be compressed with bz2. Do not decompress the file after downloading! The checks that are applied to the selected file are sketched below.
+ + +
+
+

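The patch enforces two properties of the selected file: IndexImportWikimedia_p rejects any name that does not end with 'pages-articles.xml.bz2' (it also derives the wiki language for the URL stub from the first two characters of the name, e.g. 'de' from dewiki-...), and the converter reads the first two bytes of the stream and requires the bz2 magic 'B' 'Z' before wrapping the rest in CBZip2InputStream. A minimal, self-contained sketch of both checks; the helper class name and the example path are placeholders, not part of the patch:

    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    // hypothetical helper, not part of the patch
    public class DumpFileCheck {

        // mirrors the name check in IndexImportWikimedia_p and the
        // magic-byte check in mediawikiIndex's convert code
        public static InputStream openDump(File dump) throws IOException {
            if (!dump.getName().endsWith("pages-articles.xml.bz2"))
                throw new IOException("file name must end with 'pages-articles.xml.bz2'");
            InputStream is = new FileInputStream(dump);
            // a bz2 file starts with the magic bytes 'B' and 'Z'; the patch
            // consumes them before constructing CBZip2InputStream
            if (is.read() != 'B' || is.read() != 'Z') {
                is.close();
                throw new IOException("Invalid bz2 content.");
            }
            return is;
        }

        public static void main(String[] args) throws IOException {
            // example path only
            openDump(new File("dewiki-20090311-pages-articles.xml.bz2")).close();
        }
    }
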
+ When the import is started, the following happens (the thread pipeline is sketched after this list): +

+

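When the import starts, run() wires up a pipeline: the reader thread scans the dump line by line and puts one record per article into a bounded 'in' queue, a pool of convertConsumer tasks (Math.max(2, cores - 1) of them) parses the wiki markup into documents on a bounded 'out' queue, and a single convertWriter serializes them. Shutdown is signalled by a shared poison record, one per consumer and one for the writer. A stripped-down sketch of that wiring; the Record class and the queue payloads are simplified stand-ins for the patch's wikiparserrecord, convertConsumer and convertWriter:

    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;

    public class PipelineSketch {

        // simplified stand-in for the patch's wikiparserrecord
        static final class Record {
            final String title, text;
            Record(String title, String text) { this.title = title; this.text = text; }
        }

        static final Record POISON = new Record(null, null);

        public static void main(String[] args) throws Exception {
            final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
            final BlockingQueue<Record> in = new ArrayBlockingQueue<Record>(threads * 10);
            final BlockingQueue<Record> out = new ArrayBlockingQueue<Record>(threads * 10);
            ExecutorService service = Executors.newFixedThreadPool(threads + 1);

            // consumers: stand-ins for convertConsumer (the real one parses wiki markup)
            Future<?>[] consumers = new Future<?>[threads];
            for (int i = 0; i < threads; i++) {
                consumers[i] = service.submit(() -> {
                    Record r;
                    while ((r = in.take()) != POISON) out.put(r);
                    return null;
                });
            }
            // writer: stand-in for convertWriter (the real one writes surrogate XML)
            Future<?> writer = service.submit(() -> {
                Record r;
                while ((r = out.take()) != POISON) System.out.println("wrote: " + r.title);
                return null;
            });

            // producer: in the patch, the XML line scanner feeds this queue
            in.put(new Record("Example", "article text"));

            // shutdown protocol: one poison per consumer, then one for the
            // writer, with bounded waits as in the patch
            for (int i = 0; i < threads; i++) in.put(POISON);
            for (int i = 0; i < threads; i++) consumers[i].get(10000, TimeUnit.MILLISECONDS);
            out.put(POISON);
            writer.get(10000, TimeUnit.MILLISECONDS);
            service.shutdown();
        }
    }

The bounded queues provide backpressure: when the parsers or the writer fall behind, the reader blocks on put(), so memory stays bounded even for multi-gigabyte dumps.
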
+ :: +
Import Process +
+
Thread: #[thread]#
+
Processed Wiki Entries: #[count]#
+
+
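The progress view above is driven by a deliberately simple pattern in the patch: mediawikiIndex holds a single static job reference to the running import thread, and the servlet switches between the form and this progress page by testing job != null && job.isAlive(), reading the thread's public count field for the entry counter. A condensed sketch of the pattern; the class name is a placeholder and run() is stubbed out:

    // condensed from mediawikiIndex / IndexImportWikimedia_p in this patch
    public class ImportJob extends Thread {

        // only one import may run at a time
        public static ImportJob job = null;

        public int count = 0; // processed wiki entries, read by the servlet

        @Override
        public void run() {
            // placeholder for the real dump-reading loop
            for (int i = 0; i < 1000; i++) count++;
        }

        // the servlet-side check, condensed
        public static boolean isRunning() {
            return job != null && job.isAlive();
        }

        public static void main(String[] args) throws InterruptedException {
            job = new ImportJob();
            job.start();
            System.out.println(isRunning() ? "running, count=" + job.count : "idle");
            job.join();
        }
    }

Since the servlet reads count from another thread without synchronization, declaring the field volatile would make its visibility explicit; for a monotone progress counter, stale reads are harmless.
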
+ #(/import)# + + #%env/templates/footer.template%# + + \ No newline at end of file diff --git a/htroot/IndexImportWikimedia_p.java b/htroot/IndexImportWikimedia_p.java new file mode 100644 index 000000000..3da6cdeed --- /dev/null +++ b/htroot/IndexImportWikimedia_p.java @@ -0,0 +1,78 @@ +// IndexImportWikimedia.java +// ------------------------- +// (C) 2009 by Michael Peter Christen; mc@yacy.net +// first published 04.05.2009 on http://yacy.net +// Frankfurt, Germany +// +// $LastChangedDate: 2009-04-16 17:29:00 +0200 (Do, 16 Apr 2009) $ +// $LastChangedRevision: 5812 $ +// $LastChangedBy: orbiter $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import java.io.File; +import java.net.MalformedURLException; + +import de.anomic.http.httpRequestHeader; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; +import de.anomic.tools.mediawikiIndex; + +public class IndexImportWikimedia_p { + + public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch env) { + final serverObjects prop = new serverObjects(); + final plasmaSwitchboard sb = (plasmaSwitchboard) env; + + if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) { + // one import is running, no option to insert anything + prop.put("import", 1); + prop.put("import_thread", "running"); + prop.put("import_count", mediawikiIndex.job.count); + } else { + prop.put("import", 0); + if (post == null) { + prop.put("import_status", 0); + } else { + if (post.containsKey("file")) { + File sourcefile = new File(post.get("file")); + String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2 + if (!name.endsWith("pages-articles.xml.bz2")) { + prop.put("import", 0); + prop.put("import_status", 1); + prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'"); + return prop; + } + String lang = name.substring(0, 2); + try { + mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/"); + mediawikiIndex.job.start(); + prop.put("import", 1); + prop.put("import_thread", "started"); + prop.put("import_count", 0); + } catch (MalformedURLException e) { + e.printStackTrace(); + prop.put("import", 0); + prop.put("import_status", 1); + prop.put("import_status_message", e.getMessage()); + } + } + return prop; + } + } + return prop; + } +} diff --git a/htroot/env/templates/submenuIndexControl.template b/htroot/env/templates/submenuIndexControl.template index 599f91846..7b0759dad 100644 --- a/htroot/env/templates/submenuIndexControl.template +++ b/htroot/env/templates/submenuIndexControl.template @@ -6,5 +6,6 @@
  • Queue Import
  • Index Transfer
  • Index Cleaner
  • +
  • Wikimedia Dump Import
  • diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index 11a697419..cbd8b4652 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -168,7 +168,7 @@ public final class IndexCell extends AbstractBu if (c0 == null) return c1; return c1.merge(c0); } - + /** * remove url references from a selected word hash. this deletes also in the BLOB * files, which means that there exists new gap entries after the deletion diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 187d5cff5..774920dc0 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -55,7 +55,7 @@ public class plasmaParserDocument { private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document private final List keywords; // most resources provide a keyword field - private final StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result + private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result private final StringBuilder creator; // author or copyright private final List sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description @@ -173,6 +173,10 @@ dc_rights return title.toString(); } + public void setTitle(String title) { + this.title = new StringBuilder(title); + } + public String dc_creator() { if (creator == null) return ""; diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java index 99288e5a7..5722fbbec 100644 --- a/source/de/anomic/tools/mediawikiIndex.java +++ b/source/de/anomic/tools/mediawikiIndex.java @@ -59,6 +59,7 @@ import java.util.concurrent.TimeoutException; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; import de.anomic.kelondro.util.ByteBuffer; +import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.ParserException; @@ -69,7 +70,7 @@ import de.anomic.yacy.yacyURL; * as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/" */ -public class mediawikiIndex { +public class mediawikiIndex extends Thread { private static final String textstart = " in = new ArrayBlockingQueue(threads * 10); + BlockingQueue out = new ArrayBlockingQueue(threads * 10); + ExecutorService service = Executors.newFixedThreadPool(threads + 1); + convertConsumer[] consumers = new convertConsumer[threads]; + Future[] consumerResults = new Future[threads]; + for (int i = 0; i < threads; i++) { + consumers[i] = new convertConsumer(in, out, poison); + consumerResults[i] = service.submit(consumers[i]); + } + convertWriter writer = new convertWriter(out, poison, targetdir, targetstub); + Future writerResult = service.submit(writer); + + wikiparserrecord record; + int p; + while ((t = r.readLine()) != null) { + if (t.indexOf(pagestart) >= 0) { + page = true; + continue; + } + if ((p = t.indexOf(textstart)) >= 0) { + text = page; + int q = t.indexOf('>', p + textstart.length()); + if (q > 0) { + int u = t.indexOf(textend, q + 1); + if (u > q) { + sb.append(t.substring(q + 1, u)); + Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title); + 
if (sb.length() == 0) { + Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content"); + continue; + } + record = newRecord(title, sb); + try { + in.put(record); + this.count++; + } catch (InterruptedException e1) { + e1.printStackTrace(); + } + sb = new StringBuilder(200); + continue; + } else { + sb.append(t.substring(q + 1)); + } + } + continue; + } + if (t.indexOf(textend) >= 0) { + text = false; + Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title); + if (sb.length() == 0) { + Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content"); + continue; + } + record = newRecord(title, sb); + try { + in.put(record); + this.count++; + } catch (InterruptedException e1) { + e1.printStackTrace(); + } + sb = new StringBuilder(200); + continue; + } + if (t.indexOf(pageend) >= 0) { + page = false; + continue; + } + if ((p = t.indexOf("")) >= 0) { + title = t.substring(p + 7); + int q = title.indexOf(""); + if (q >= 0) title = title.substring(0, q); + continue; + } + if (text) { + sb.append(t); + sb.append('\n'); + } + } + r.close(); + + try { + for (int i = 0; i < threads; i++) { + in.put(poison); + } + for (int i = 0; i < threads; i++) { + consumerResults[i].get(10000, TimeUnit.MILLISECONDS); + } + out.put(poison); + writerResult.get(10000, TimeUnit.MILLISECONDS); + } catch (InterruptedException e1) { + e1.printStackTrace(); + } catch (ExecutionException e) { + e.printStackTrace(); + } catch (TimeoutException e) { + e.printStackTrace(); + } catch (Exception e) { + e.printStackTrace(); + } + } catch (IOException e) { + e.printStackTrace(); + } catch (Exception e) { + e.printStackTrace(); + } + } + public static void checkIndex(File wikimediaxml) { File idx = idxFromWikimediaXML(wikimediaxml); if (idx.exists()) return; @@ -188,13 +324,13 @@ public class mediawikiIndex { while(true) { r = entries.take(); if (r == poison) { - System.out.println("producer / got poison"); + Log.logInfo("WIKITRANSLATION", "producer / got poison"); break; } out.println(" "); out.println(" " + r.title + ""); out.println(" "); - System.out.println("producer / record start: " + r.start + ", title : " + r.title); + Log.logInfo("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title); count++; } } catch (InterruptedException e) { @@ -236,13 +372,13 @@ public class mediawikiIndex { while(true) { c = entries.take(); if (c == poison) { - System.out.println("consumer / got poison"); + Log.logInfo("WIKITRANSLATION", "consumer / got poison"); break; } try { r = new wikisourcerecord(c.b, c.start, c.end); producer.consume(r); - System.out.println("consumer / record start: " + r.start + ", title : " + r.title); + Log.logInfo("WIKITRANSLATION", "consumer / record start: " + r.start + ", title : " + r.title); count++; } catch (RuntimeException e) {} } @@ -325,6 +461,8 @@ public class mediawikiIndex { try { url = new yacyURL(urlStub + title, null); document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8")); + // the wiki parser is not able to find the proper title in the source text, so it must be set here + document.setTitle(title); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (MalformedURLException e1) { @@ -414,7 +552,7 @@ public class mediawikiIndex { in.resetBuffer(); if (s.indexOf(m) >= 0) { // we found the record - //System.out.println("s = " + s); + //Log.logInfo("WIKITRANSLATION", "s = " + s); int p = s.indexOf("start=\""); if (p < 0) return null; p += 7; @@ -427,7 +565,7 @@ public class mediawikiIndex { q = 
s.indexOf('"', p + 1); if (q < 0) return null; int length = Integer.parseInt(s.substring(p, q)); - //System.out.println("start = " + start + ", length = " + length); + //Log.logInfo("WIKITRANSLATION", "start = " + start + ", length = " + length); return new wikisourcerecord(title, start, start + length); } } @@ -451,7 +589,7 @@ public class mediawikiIndex { while(true) { record = in.take(); if (record == poison) { - System.out.println("convertConsumer / got poison"); + Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison"); break; } try { @@ -470,7 +608,7 @@ public class mediawikiIndex { } catch (InterruptedException e) { e.printStackTrace(); } - System.out.println("*** convertConsumer has terminated"); + Log.logInfo("WIKITRANSLATION", "*** convertConsumer has terminated"); return Integer.valueOf(0); } @@ -507,7 +645,7 @@ public class mediawikiIndex { while(true) { record = in.take(); if (record == poison) { - System.out.println("convertConsumer / got poison"); + Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison"); break; } @@ -517,7 +655,7 @@ public class mediawikiIndex { this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8"); osw.write("\n\n"); } - System.out.println("[CONSUME] Title: " + record.title); + Log.logInfo("WIKITRANSLATION", "[CONSUME] Title: " + record.title); record.document.writeXML(osw, new Date()); rc++; if (rc >= 10000) { @@ -552,114 +690,19 @@ public class mediawikiIndex { e.printStackTrace(); } } - System.out.println("*** convertWriter has terminated"); + Log.logInfo("WIKITRANSLATION", "*** convertWriter has terminated"); return Integer.valueOf(0); } } - public static void convert(File sourcefile, File targetdir, String urlStub) throws IOException { - String targetstub = sourcefile.getName(); - targetstub = targetstub.substring(0, targetstub.length() - 8); - InputStream is = new FileInputStream(sourcefile); - if (sourcefile.getName().endsWith(".bz2")) { - int b = is.read(); - if (b != 'B') throw new IOException("Invalid bz2 content."); - b = is.read(); - if (b != 'Z') throw new IOException("Invalid bz2 content."); - is = new CBZip2InputStream(is); - } - BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024); - String t; - StringBuilder sb = new StringBuilder(); - boolean page = false, text = false; - String title = null; - plasmaParser.initHTMLParsableMimeTypes("text/html"); - plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html"); - mediawikiIndex mi = new mediawikiIndex(urlStub); - wikiparserrecord poison = mi.newRecord(); - int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); - BlockingQueue in = new ArrayBlockingQueue(threads * 10); - BlockingQueue out = new ArrayBlockingQueue(threads * 10); - ExecutorService service = Executors.newFixedThreadPool(threads + 1); - convertConsumer[] consumers = new convertConsumer[threads]; - Future[] consumerResults = new Future[threads]; - for (int i = 0; i < threads; i++) { - consumers[i] = new convertConsumer(in, out, poison); - consumerResults[i] = service.submit(consumers[i]); - } - convertWriter writer = new convertWriter(out, poison, targetdir, targetstub); - Future writerResult = service.submit(writer); - - wikiparserrecord record; - while ((t = r.readLine()) != null) { - if (t.indexOf(pagestart) >= 0) { - page = true; - continue; - } - if (t.indexOf(textstart) >= 0) { - text = page; - continue; - } - if (t.indexOf(textend) >= 0) { - text = 
false; - System.out.println("[INJECT] Title: " + title); - if (sb.length() == 0) { - System.out.println("ERROR: " + title + " has empty content"); - continue; - } - record = mi.newRecord(title, sb); - try { - in.put(record); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } - sb.setLength(0); - continue; - } - if (t.indexOf(pageend) >= 0) { - page = false; - continue; - } - if (t.indexOf("") >= 0) { - title = t.substring(t.indexOf("<title>") + 7); - int p = title.indexOf(""); - if (p >= 0) title = title.substring(0, p); - continue; - } - if (text) { - sb.append(t); - sb.append('\n'); - } - } - r.close(); - - try { - for (int i = 0; i < threads; i++) { - in.put(poison); - } - for (int i = 0; i < threads; i++) { - consumerResults[i].get(10000, TimeUnit.MILLISECONDS); - } - out.put(poison); - writerResult.get(10000, TimeUnit.MILLISECONDS); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } catch (ExecutionException e) { - e.printStackTrace(); - } catch (TimeoutException e) { - e.printStackTrace(); - } - - } - public static void main(String[] s) { if (s.length == 0) { - System.out.println("usage:"); - System.out.println(" -index "); - System.out.println(" -read "); - System.out.println(" -find <wikipedia-dump>"); - System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>"); + Log.logInfo("WIKITRANSLATION", "usage:"); + Log.logInfo("WIKITRANSLATION", " -index <wikipedia-dump>"); + Log.logInfo("WIKITRANSLATION", " -read <start> <len> <idx-file>"); + Log.logInfo("WIKITRANSLATION", " -find <title> <wikipedia-dump>"); + Log.logInfo("WIKITRANSLATION", " -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>"); System.exit(0); } @@ -672,7 +715,11 @@ public class mediawikiIndex { String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/ //String language = urlStub.substring(7,9); try { - convert(sourcefile, targetdir, urlStub); + mediawikiIndex mi = new mediawikiIndex(sourcefile, targetdir, urlStub); + mi.start(); + mi.join(); + } catch (InterruptedException e) { + e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } @@ -700,7 +747,7 @@ public class mediawikiIndex { try { wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml")); if (w == null) { - System.out.println("not found"); + Log.logInfo("WIKITRANSLATION", "not found"); } else { System.out.println(new String(read(new File(s[2]), w.start, (int) (w.end - w.start)), "UTF-8")); } @@ -709,6 +756,7 @@ public class mediawikiIndex { } } + System.exit(0); } }
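
A closing note on the core loop moved into run() above: it is a line-oriented scanner, not a full XML parser. It flips a page flag on <page>/</page>, starts collecting on <text ...>, flushes the finished article on </text>, and cuts the name out of the <title> line. A reduced sketch of that state machine, reading from an inline string instead of the bz2 stream:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.StringReader;

    public class ScannerSketch {

        public static void main(String[] args) throws IOException {
            // inline sample standing in for the decompressed dump stream
            String dump =
                  "<page>\n"
                + "  <title>Example</title>\n"
                + "  <text xml:space=\"preserve\">\n"
                + "first line of the article\n"
                + "second line\n"
                + "  </text>\n"
                + "</page>\n";
            BufferedReader r = new BufferedReader(new StringReader(dump));
            StringBuilder sb = new StringBuilder(200);
            boolean page = false, text = false;
            String title = null, t;
            int p;
            while ((t = r.readLine()) != null) {
                if (t.indexOf("<page>") >= 0) { page = true; continue; }
                if ((p = t.indexOf("<text")) >= 0) {
                    text = page; // only collect text while inside a page
                    int q = t.indexOf('>', p);
                    if (q > 0) sb.append(t.substring(q + 1));
                    continue;
                }
                if (t.indexOf("</text>") >= 0) {
                    text = false;
                    System.out.println("[INJECT] Title: " + title
                            + " (" + sb.length() + " characters)");
                    sb = new StringBuilder(200); // start the next article
                    continue;
                }
                if (t.indexOf("</page>") >= 0) { page = false; continue; }
                if ((p = t.indexOf("<title>")) >= 0) {
                    title = t.substring(p + 7);
                    int q = title.indexOf("</title>");
                    if (q >= 0) title = title.substring(0, q);
                    continue;
                }
                if (text) { sb.append(t); sb.append('\n'); }
            }
            r.close();
        }
    }

The patch additionally handles <text> elements that open and close on a single line, and logs articles with empty content as errors before skipping them.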