*) Replaced occurrences of "Wikimedia" with "MediaWiki" where applicable. (Thanks to the folks of 0x20.be for pointing this out.)

*) Added description of where to place MediaWiki dump for import.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7905 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
low012 13 years ago
parent d40a177c05
commit 24e76a7b69

@ -1,3 +1,3 @@
#!/bin/bash
cd "`dirname $0`"
./apicall.sh /IndexImportWikimedia_p.html?file=$1 > /dev/null
./apicall.sh /IndexImportMediawiki_p.html?file=$1 > /dev/null

@ -75,7 +75,7 @@
::
<h2>(7) Results from surrogates import</h2>
<p>These records had been imported from surrogate files in DATA/SURROGATES/in</p>
<p><em>Use Case:</em> place files with dublin core metadata content into DATA/SURROGATES/in or use an index import method (i.e. <a href="/IndexImportWikimedia_p.html">wikimedia import</a>, <a href="/IndexImportOAIPMH_p.html">OAI-PMH retrieval</a>)</p>
<p><em>Use Case:</em> place files with dublin core metadata content into DATA/SURROGATES/in or use an index import method (i.e. <a href="/IndexImportMediawiki_p.html">MediaWiki import</a>, <a href="/IndexImportOAIPMH_p.html">OAI-PMH retrieval</a>)</p>
#(/process)#

@ -1,29 +1,29 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Wikimedia Dump Import</title>
<title>YaCy '#[clientname]#': MediaWiki Dump Import</title>
#%env/templates/metas.template%#
#(import)#::<meta http-equiv="REFRESH" content="10" />#(/import)#
</head>
<body id="IndexImportWikimedia">
<body id="IndexImportMediawiki">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Wikimedia Dump Import</h2>
<h2>MediaWiki Dump Import</h2>
#(import)#
<p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportWikimedia_p.html" method="get" accept-charset="UTF-8">
<form action="IndexImportMediawiki_p.html" method="get" accept-charset="UTF-8">
<!-- no post method here, we don't want to transmit the whole file, only the path-->
<fieldset>
<legend>Wikimedia Dump File Selection: select a xml file (which may be bz2- or gz-encoded)</legend>
You can import Wikipedia dumps here. An example is the file
<a href="http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2">
http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2</a>.
<legend>MediaWiki Dump File Selection: select a xml file (which may be bz2- or gz-encoded)</legend>
You can import MediaWiki dumps here. An example is the file
<a href="http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2">
http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2</a>.
<br />
Dumps must be in XML format and may be compressed in gz or bz2. Uncompressed XML is also ok.
Dumps must be in XML format and may be compressed in gz or bz2. Place the file in the YaCy folder or in one of its sub-folders.
<br />
<input name="file" type="text" value="" size="80" />
<input name="submit" type="submit" value="Import Wikimedia Dump" />
<input name="submit" type="submit" value="Import MediaWiki Dump" />
</fieldset>
</form>
<p>

@ -1,4 +1,4 @@
// IndexImportWikimedia.java
// IndexImportMediawiki.java
// -------------------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 04.05.2009 on http://yacy.net
@ -31,7 +31,7 @@ import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class IndexImportWikimedia_p {
public class IndexImportMediawiki_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();

@ -8,7 +8,7 @@
<li><a href="/CrawlStartSite_p.html" class="MenuItemLink lock">Full Site Crawl/<br/>Sitemap Loader</a></li>
<li><a href="/CrawlStartExpert_p.html" class="MenuItemLink lock">Crawl Start<br/>(Expert)</a></li>
<li><a href="/CrawlStartScanner_p.html" class="MenuItemLink lock">Network<br/>Scanner</a></li>
<li><a href="/Load_MediawikiWiki.html" class="MenuItemLink">Crawling of<br/>Media Wikis</a></li>
<li><a href="/Load_MediawikiWiki.html" class="MenuItemLink">Crawling of<br/>MediaWikis</a></li>
<li><a href="/Load_PHPBB3.html" class="MenuItemLink">Crawling of<br/>phpBB3 Forums</a></li>
</ul>
</div>
@ -33,7 +33,7 @@
<h3>Database Reader</h3>
<ul class="SubMenu">
<li><a href="/ContentIntegrationPHPBB3_p.html" class="MenuItemLink lock">Database Reader<br/>for phpBB3 Forums</a></li>
<li><a href="/IndexImportWikimedia_p.html" class="MenuItemLink lock">Dump Reader for <br/>Wikimedia dumps</a></li>
<li><a href="/IndexImportMediawiki_p.html" class="MenuItemLink lock">Dump Reader for <br/>MediaWiki dumps</a></li>
</ul>
</div>
</div>

@ -56,7 +56,7 @@ public class mediawiki_p {
File dumpFile = new File(sb.getDataPath(), "DATA/HTCACHE/mediawiki/" + dump);
if (!dumpFile.exists()) return post;
MediawikiImporter.checkIndex(dumpFile);
MediawikiImporter.wikisourcerecord w = MediawikiImporter.find(title.replaceAll(" ", "_"), MediawikiImporter.idxFromWikimediaXML(dumpFile));
MediawikiImporter.wikisourcerecord w = MediawikiImporter.find(title.replaceAll(" ", "_"), MediawikiImporter.idxFromMediawikiXML(dumpFile));
if (w == null) {
return post;
}

@ -936,7 +936,7 @@ Statistics about \#\[domains\]\# domains in this stack:==Statistiken über #[dom
\(7\) Results from surrogates import==\(7\) Ergebnisse aus dem Surrogat Import
These records had been imported from surrogate files in DATA/SURROGATES/in==Diese Datensätze wurden aus Surrogat Dateien in DATA/SURROGATES/in importiert
<em>Use Case:</em> place files with dublin core metadata content into DATA/SURROGATES/in or use an index import method==<em>Anwendungsfall:</em> Dateien mit Dublin Core Metadaten Inhalt in das DATA/SURROGATES/in kopieren oder eine der Index Import Funktionen nutzen
\(i.e. <a href="/IndexImportWikimedia_p.html">wikimedia import</a>, <a href="/IndexImportOAIPMH_p.html">OAI-PMH retrieval</a>\)==(z.B. <a href="/IndexImportWikimedia_p.html">WikiMedia Dump Import</a>, <a href="/IndexImportOAIPMH_p.html">OAI-PMH Import</a>\)
\(i.e. <a href="/IndexImportMediawiki_p.html">MediaWiki import</a>, <a href="/IndexImportOAIPMH_p.html">OAI-PMH retrieval</a>\)==(z.B. <a href="/IndexImportMediawiki_p.html">MediaWiki Dump Import</a>, <a href="/IndexImportOAIPMH_p.html">OAI-PMH Import</a>\)
#Domain==Domain
#URLs=URLs
"delete all"=="Alle Löschen"
@ -1661,15 +1661,15 @@ The crawling queue==Der Crawler-Puffer
Various stack files that belong to the crawling queue==Verschiedene Stack-Dateien, die zum Crawler-Puffer gehören
#-----------------------------
#File: IndexImportWikimedia_p.html
#File: IndexImportMediawiki_p.html
#---------------------------
#Wikimedia Dump Import==Wikimedia Dump Import
#MediaWiki Dump Import==MediaWiki Dump Import
No import thread is running, you can start a new thread here==Sie können hier einen neuen Thread starten, da aktuell kein Import Thread läuft
Bad input data:==Ungültige Eingabedaten:
Wikimedia Dump File Selection: select a \'bz2\' file==Wikimedia Dump Datei Auswahl: Wähle eine 'bz2' Datei aus
You can import Wikipedia dumps here. An example is the file==Hier können Sie Wikimedia Dumps importieren. Als Beispiel dient die Datei
Dumps must be in XML format and must be encoded in bz2. Do not decompress the file after downloading!==Dumps müssen im XML Format vorliegen und bz2 komprimiert sein. Entpacken Sie die Datei nicht nach dem Herunterladen!
"Import Wikimedia Dump"=="Importiere Wikimedia Dump"
MediaWiki Dump File Selection: select a xml file \(which may be bz2- or gz-encoded\)==MediaWiki Dump Datei Auswahl: Wähle eine XML Datei aus \(die bz2- oder gz-kodiert sein kann\)
You can import MediaWiki dumps here. An example is the file==Hier können Sie MediaWiki Dumps importieren. Als Beispiel dient die Datei
Dumps must be in XML format and may be compressed in gz or bz2. Place the file in the YaCy folder or in one of its sub-folders.==Dumps müssen im XML Format vorliegen und können gz oder bz2 komprimiert sein. Legen Sie die Datei im YaCy-Verzeichnis oder in einem seiner Unterordner ab.
"Import MediaWiki Dump"=="Importiere MediaWiki Dump"
When the import is started, the following happens:==Wenn der Import gestartet wird passiert Folgendes:
The dump is extracted on the fly and wiki entries are translated into Dublin Core data format. The output looks like this:==Der Dump wird zur Laufzeit extrahiert und die Wiki Einträge werden in das Dublin Core Datenformat übersetzt. Die Ausgabe schaut wie folgt aus:
Each 10000 wiki records are combined in one output file which is written to /DATA/SURROGATES/in into a temporary file.==Je 10000 Wiki Einträge werden zusammen in eine Ausgabedatei geschrieben und in /DATA/SURROGATES/in temporär gespeichert.
@ -3381,7 +3381,7 @@ Advanced Properties==Erweiterte Konfiguration
#---------------------------
External Content Integration==Integration von externen Inhalten
Import phpBB3 forum==Importiere phpBB3 Forum
Import Wikimedia dumps==Importiere Wikimedia Dumps
Import Mediawiki dumps==Importiere Mediawiki Dumps
Import OAI-PMH Sources==Importiere OAI-PMH Quellen
#-----------------------------
@ -3451,7 +3451,7 @@ Crawl Start<br/>\(Expert\)==Crawl Start<br/>(Experte)
Network<br/>Scanner==Netzwerk<br/>Scanner
#>Intranet<br/>Scanner<==>Intranet<br/>Scanner<
Crawling of==Crawlen von
#Media Wikis==Media Wikis
#MediaWikis==MediaWikis
>phpBB3 Forums<==>phpBB3 Foren<
Content Import<==Content Importer<
Network Harvesting<==Netzwerk Harvesting<
@ -3460,7 +3460,7 @@ Network Harvesting<==Netzwerk Harvesting<
Database Reader<==Datenbank Leser<
for phpBB3 Forums==für phpBB3 Foren
Dump Reader for==Dump Leser für
#Wikimedia dumps==Wikimedia dumps
#MediaWiki dumps==MediaWiki dumps
#-----------------------------
#File: env/templates/submenuPortalIntegration.template

@ -77,7 +77,7 @@ public class MediawikiImporter extends Thread implements Importer {
private static final String pageend = "</page>";
private static final byte[] pagestartb = UTF8.getBytes(pagestart);
private static final byte[] pageendb = UTF8.getBytes(pageend);
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 mediawiki dump
public static Importer job; // if started from a servlet, this object is used to store the thread
@ -274,22 +274,22 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
public static void checkIndex(File wikimediaxml) {
File idx = idxFromWikimediaXML(wikimediaxml);
public static void checkIndex(File mediawikixml) {
File idx = idxFromMediawikiXML(mediawikixml);
if (idx.exists()) return;
new indexMaker(wikimediaxml).start();
new indexMaker(mediawikixml).start();
}
public static class indexMaker extends Thread {
File wikimediaxml;
public indexMaker(File wikimediaxml) {
this.wikimediaxml = wikimediaxml;
File mediawikixml;
public indexMaker(File mediawikixml) {
this.mediawikixml = mediawikixml;
}
public void run() {
try {
createIndex(this.wikimediaxml);
createIndex(this.mediawikixml);
} catch (final IOException e) {
} catch (final Exception e) {
Log.logException(e);
@ -297,8 +297,8 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
public static File idxFromWikimediaXML(File wikimediaxml) {
return new File(wikimediaxml.getAbsolutePath() + ".idx.xml");
public static File idxFromMediawikiXML(File mediawikixml) {
return new File(mediawikixml.getAbsolutePath() + ".idx.xml");
}
public static void createIndex(File dumpFile) throws IOException {
@ -307,7 +307,7 @@ public class MediawikiImporter extends Thread implements Importer {
// init reader, producer and consumer
PositionAwareReader in = new PositionAwareReader(dumpFile);
indexProducer producer = new indexProducer(100, idxFromWikimediaXML(dumpFile));
indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
wikiConsumer consumer = new wikiConsumer(100, producer);
ExecutorService service = Executors.newFixedThreadPool(2);
Future<Integer> producerResult = service.submit(consumer);

Loading…
Cancel
Save