- fixed missing save operation for peer name change

- fixed import of mediawiki dump files
- added script to import mediawiki dump files

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7609 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 2b5f8585bf
commit a50f28e6e7

@ -0,0 +1,3 @@
#!/bin/bash
cd "`dirname $0`"
./apicall.sh /IndexImportWikimedia_p.html?file=$1 > /dev/null
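
Note: the new three-line script (its file name is not shown in this view) simply changes into its own directory and forwards its first argument as the file parameter of the IndexImportWikimedia_p servlet via apicall.sh, so a dump import can be triggered from the shell by passing the path of a downloaded dump, e.g. a dewiki-*-pages-articles.xml.bz2 stored under DATA/HTCACHE.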

@ -191,7 +191,7 @@ public class Blog {
prop.putHTML("mode_author", UTF8.String(author));
prop.putHTML("mode_subject", post.get("subject",""));
prop.put("mode_date", dateString(new Date()));
prop.putWiki("mode_page", post.get("content", ""));
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", post.get("content", ""));
prop.putHTML("mode_page-code", post.get("content", ""));
}
else {
@ -234,7 +234,7 @@ public class Blog {
else {
//only show 1 entry
prop.put("mode_entries", "1");
putBlogEntry(prop, page, address, 0, hasRights, xml);
putBlogEntry(sb, prop, page, address, 0, hasRights, xml);
}
}
@ -263,6 +263,7 @@ public class Blog {
while (i.hasNext() && (num == 0 || num > count)) {
if(0 < start--) continue;
putBlogEntry(
switchboard,
prop,
switchboard.blogDB.readBlogEntry(i.next()),
address,
@ -293,6 +294,7 @@ public class Blog {
}
private static serverObjects putBlogEntry(
final Switchboard sb,
final serverObjects prop,
final BlogBoard.BlogEntry entry,
final String address,
@ -324,7 +326,7 @@ public class Blog {
prop.put("mode_entries_" + number + "_page", entry.getPage());
prop.put("mode_entries_" + number + "_timestamp", entry.getTimestamp());
} else {
prop.putWiki("mode_entries_" + number + "_page", entry.getPage());
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_entries_" + number + "_page", entry.getPage());
}
if (hasRights) {

@ -175,7 +175,7 @@ public class BlogComments {
prop.putHTML("mode_allow_author", UTF8.String(author));
prop.putHTML("mode_subject", post.get("subject",""));
prop.put("mode_date", dateString(new Date()));
prop.putWiki("mode_page", post.get("content", ""));
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", post.get("content", ""));
prop.put("mode_page-code", post.get("content", ""));
} else {
// show blog-entry/entries
@ -191,7 +191,7 @@ public class BlogComments {
prop.putHTML("mode_allow_author", UTF8.String(author));
prop.put("mode_comments", page.getCommentsSize());
prop.put("mode_date", dateString(page.getDate()));
prop.putWiki("mode_page", page.getPage());
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", page.getPage());
if (hasRights) {
prop.put("mode_admin", "1");
prop.put("mode_admin_pageid", page.getKey());
@ -234,7 +234,7 @@ public class BlogComments {
if (!xml) {
prop.putHTML("mode_entries_"+count+"_subject", UTF8.String(entry.getSubject()));
prop.putHTML("mode_entries_"+count+"_author", UTF8.String(entry.getAuthor()));
prop.putWiki("mode_entries_"+count+"_page", entry.getPage());
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_entries_"+count+"_page", entry.getPage());
} else {
prop.putHTML("mode_entries_"+count+"_subject", UTF8.String(entry.getSubject()));
prop.putHTML("mode_entries_"+count+"_author", UTF8.String(entry.getAuthor()));

@ -103,11 +103,11 @@ public class ConfigBasic {
// check if peer name already exists
final yacySeed oldSeed = sb.peers.lookupByName(peerName);
if (oldSeed == null && !peerName.equals(sb.peers.mySeed().getName())) {
// the name is new
if (Pattern.compile("[A-Za-z0-9\\-_]{3,80}").matcher(peerName).matches()) {
sb.peers.mySeed().setName(peerName);
}
if (oldSeed == null &&
!peerName.equals(sb.peers.mySeed().getName()) &&
Pattern.compile("[A-Za-z0-9\\-_]{3,80}").matcher(peerName).matches()) {
sb.peers.mySeed().setName(peerName);
sb.peers.saveMySeed();
}
// UPnP config
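
Note: this is the missing save operation named in the commit message. The old branch only called setName() on the in-memory seed, so a renamed peer would presumably fall back to its previous name once the seed file was reloaded; the merged condition now validates the new name against the allowed pattern and persists it with saveMySeed(), which is made public in yacySeedDB further down.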

@ -15,14 +15,14 @@
<form action="IndexImportWikimedia_p.html" method="get" accept-charset="UTF-8">
<!-- no post method here, we don't want to transmit the whole file, only the path-->
<fieldset>
<legend>Wikimedia Dump File Selection: select a 'bz2' file</legend>
<legend>Wikimedia Dump File Selection: select a xml file (which may be bz2- or gz-encoded)</legend>
You can import Wikipedia dumps here. An example is the file
<a href="http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2">
http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2</a>.
<br />
Dumps must be in XML format and must be encoded in bz2. Do not decompress the file after downloading!
Dumps must be in XML format and may be compressed in gz or bz2. Uncompressed XML is also ok.
<br />
<input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" />
<input name="file" type="text" value="" size="80" />
<input name="submit" type="submit" value="Import Wikimedia Dump" />
</fieldset>
</form>

@ -57,16 +57,17 @@ public class IndexImportWikimedia_p {
} else {
if (post.containsKey("file")) {
final File sourcefile = new File(post.get("file"));
final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
//final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
/*
if (!name.endsWith("pages-articles.xml.bz2")) {
prop.put("import", 0);
prop.put("import_status", 1);
prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'");
return prop;
}
final String lang = name.substring(0, 2);
*/
try {
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");

@ -107,7 +107,7 @@ public class MessageSend_p {
prop.putXML("mode_permission_message", message);
prop.putHTML("mode_permission_hash", hash);
if (post.containsKey("preview")) {
prop.putWiki("mode_permission_previewmessage", message);
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_permission_previewmessage", message);
}

@ -160,7 +160,7 @@ public class Messages_p {
prop.putXML("mode_subject", message.subject());
String theMessage = null;
theMessage = UTF8.String(message.message());
prop.putWiki("mode_message", theMessage);
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_message", theMessage);
prop.put("mode_hash", message.authorHash());
prop.putXML("mode_key", key);
}

@ -162,7 +162,7 @@ public class ViewProfile {
prop.put("success_" + key, "1");
// only comments get "wikified"
if(key.equals("comment")){
prop.putWiki(
prop.putWiki(sb.peers.mySeed().getClusterAddress(),
"success_" + key + "_value",
entry.getValue().replaceAll("\r", "").replaceAll("\\\\n", "\n"));
prop.put("success_" + key + "_b64value", Base64Order.standardCoder.encodeString(entry.getValue()));

@ -152,7 +152,7 @@ public class Wiki {
prop.put("mode_display", display);
prop.putHTML("mode_author", author);
prop.put("mode_date", dateString(new Date()));
prop.putWiki("mode_page", post.get("content", ""));
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", post.get("content", ""));
prop.putHTML("mode_page-code", post.get("content", ""));
}
//end contrib of [MN]
@ -247,7 +247,7 @@ public class Wiki {
prop.put("mode_versioning_display", display);
prop.putHTML("mode_versioning_author", oentry.author());
prop.put("mode_versioning_date", dateString(oentry.date()));
prop.putWiki("mode_versioning_page", oentry.page());
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_versioning_page", oentry.page());
prop.putHTML("mode_versioning_page-code", UTF8.String(oentry.page()));
}
} catch (final IOException e) {
@ -263,7 +263,7 @@ public class Wiki {
prop.put("mode_display", display);
prop.putHTML("mode_author", page.author());
prop.put("mode_date", dateString(page.date()));
prop.putWiki("mode_page", page.page());
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", page.page());
prop.put("controls", "0");
prop.putHTML("controls_pagename", pagename);

@ -71,7 +71,7 @@ public class mediawiki_p {
page = page.substring(p, q);
prop.putHTML("title", title);
prop.putWiki("page", page);
prop.putWiki(sb.peers.mySeed().getClusterAddress(), "page", page);
return prop;
}

@ -34,17 +34,12 @@ import java.io.UnsupportedEncodingException;
abstract class AbstractWikiParser implements WikiParser {
final String address;
public AbstractWikiParser(final String address) {
this.address = address;
}
protected abstract String transform(BufferedReader reader, int length) throws IOException;
protected abstract String transform(String hostport, BufferedReader reader, int length) throws IOException;
public String transform(final String content) {
public String transform(String hostport, final String content) {
try {
return transform(
hostport,
new BufferedReader(new StringReader(content)),
content.length());
} catch (final IOException e) {
@ -52,9 +47,10 @@ abstract class AbstractWikiParser implements WikiParser {
}
}
public String transform(final String content, final String publicAddress) {
public String transform(String hostport, final String content, final String publicAddress) {
try {
return transform(
hostport,
new BufferedReader(new StringReader(content)),
content.length());
} catch (final IOException e) {
@ -62,14 +58,15 @@ abstract class AbstractWikiParser implements WikiParser {
}
}
public String transform(final byte[] content) throws UnsupportedEncodingException {
return transform(content, "UTF-8");
public String transform(String hostport, final byte[] content) throws UnsupportedEncodingException {
return transform(hostport, content, "UTF-8");
}
public String transform(final byte[] content, final String encoding, final String publicAddress) {
public String transform(String hostport, final byte[] content, final String encoding, final String publicAddress) {
final ByteArrayInputStream bais = new ByteArrayInputStream(content);
try {
return transform(
hostport,
new BufferedReader(new InputStreamReader(bais, encoding)),
content.length);
} catch (final IOException e) {
@ -77,10 +74,11 @@ abstract class AbstractWikiParser implements WikiParser {
}
}
public String transform(final byte[] content, final String encoding) throws UnsupportedEncodingException {
public String transform(String hostport, final byte[] content, final String encoding) throws UnsupportedEncodingException {
final ByteArrayInputStream bais = new ByteArrayInputStream(content);
try {
return transform(
hostport,
new BufferedReader(new InputStreamReader(bais, encoding)),
content.length);
} catch (final IOException e) {

@ -190,8 +190,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
* Constructor
* @param address
*/
public WikiCode(final String address) {
super(address);
public WikiCode() {
super();
}
/**
@ -201,12 +201,12 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
* @return HTML fragment.
* @throws IOException in case input from reader can not be read.
*/
protected String transform(final BufferedReader reader, final int length)
protected String transform(String hostport, final BufferedReader reader, final int length)
throws IOException {
final StringBuilder out = new StringBuilder(length);
String line;
while ((line = reader.readLine()) != null) {
out.append(processLineOfWikiCode(line)).append(serverCore.CRLF_STRING);
out.append(processLineOfWikiCode(hostport, line)).append(serverCore.CRLF_STRING);
}
return out.insert(0, createTableOfContents()).toString();
}
@ -531,7 +531,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
* @param line line of text to be transformed from wiki code to HTML
* @return HTML fragment
*/
private String processLinksAndImages(String line) {
private String processLinksAndImages(String hostport, String line) {
// create links
String kl, kv, alt, align;
@ -586,7 +586,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
// or an image DATA/HTDOCS/grafics/kaskelix.jpg with [[Image:grafics/kaskelix.jpg]]
// you are free to use other sub-paths of DATA/HTDOCS
if (kl.indexOf("://") < 1) {
kl = "http://" + super.address + "/" + kl;
kl = "http://" + hostport + "/" + kl;
}
line = line.substring(0, positionOfOpeningTag) + "<img src=\"" + kl + "\"" + align + alt + ">" + line.substring(positionOfClosingTag + 2);
@ -623,7 +623,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
// or a file DATA/HTDOCS/www/page.html with [www/page.html]
// you are free to use other sub-paths of DATA/HTDOCS
if (kl.indexOf("://") < 1) {
kl = "http://" + super.address + "/" + kl;
kl = "http://" + hostport + "/" + kl;
}
line = line.substring(0, positionOfOpeningTag) + "<a class=\"extern\" href=\"" + kl + "\">" + kv + "</a>" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_EXTERNAL_LINK);
}
@ -635,7 +635,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
* @param line line of text to be transformed from wiki code to HTML
* @return HTML fragment
*/
private String processPreformattedText(String line) {
private String processPreformattedText(String hostport, String line) {
if (!escaped) {
final int positionOfOpeningTag = line.indexOf(WIKI_OPEN_PRE_ESCAPED);
final int positionOfClosingTag = line.indexOf(WIKI_CLOSE_PRE_ESCAPED);
@ -647,15 +647,15 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
preformattedText.append(line.substring(positionOfOpeningTag + LEN_WIKI_OPEN_PRE_ESCAPED, positionOfClosingTag));
preformattedText.append("</pre>");
line = processLineOfWikiCode(line.substring(0, positionOfOpeningTag).replaceAll("!pre!", "!pre!!") + "!pre!txt!" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_PRE_ESCAPED).replaceAll("!pre!", "!pre!!"));
line = processLineOfWikiCode(hostport, line.substring(0, positionOfOpeningTag).replaceAll("!pre!", "!pre!!") + "!pre!txt!" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_PRE_ESCAPED).replaceAll("!pre!", "!pre!!"));
line = line.replaceAll("!pre!txt!", preformattedText.toString().replaceAll("!pre!", "!pre!!"));
line = line.replaceAll("!pre!!", "!pre!");
} //handles cases like <pre><pre> </pre></pre> <pre> </pre> that would cause an exception otherwise
else {
processingPreformattedText = true;
final String temp1 = processLineOfWikiCode(line.substring(0, positionOfOpeningTag - 1).replaceAll("!tmp!", "!tmp!!") + "!tmp!txt!");
final String temp1 = processLineOfWikiCode(hostport, line.substring(0, positionOfOpeningTag - 1).replaceAll("!tmp!", "!tmp!!") + "!tmp!txt!");
noList = true;
final String temp2 = processLineOfWikiCode(line.substring(positionOfOpeningTag));
final String temp2 = processLineOfWikiCode(hostport, line.substring(positionOfOpeningTag));
noList = false;
line = temp1.replaceAll("!tmp!txt!", temp2);
line = line.replaceAll("!tmp!!", "!tmp!");
@ -673,7 +673,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
preindented++;
openBlockQuoteTags.append(HTML_OPEN_BLOCKQUOTE);
}
line = processLineOfWikiCode(line.substring(preindented, positionOfOpeningTag).replaceAll("!pre!", "!pre!!") + "!pre!txt!");
line = processLineOfWikiCode(hostport, line.substring(preindented, positionOfOpeningTag).replaceAll("!pre!", "!pre!!") + "!pre!txt!");
line = openBlockQuoteTags + line.replaceAll("!pre!txt!", preformattedText);
line = line.replaceAll("!pre!!", "!pre!");
preformattedSpanning = true;
@ -688,7 +688,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
endBlockQuoteTags.append(HTML_CLOSE_BLOCKQUOTE);
preindented--;
}
line = processLineOfWikiCode("!pre!txt!" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_PRE_ESCAPED).replaceAll("!pre!", "!pre!!"));
line = processLineOfWikiCode(hostport, "!pre!txt!" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_PRE_ESCAPED).replaceAll("!pre!", "!pre!!"));
line = line.replaceAll("!pre!txt!", preformattedText) + endBlockQuoteTags;
line = line.replaceAll("!pre!!", "!pre!");
processingPreformattedText = false;
@ -698,7 +698,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
while ((posTag = line.indexOf(WIKI_CLOSE_PRE_ESCAPED)) >= 0) {
line = line.substring(0, posTag) + line.substring(posTag + LEN_WIKI_CLOSE_PRE_ESCAPED);
}
line = processLineOfWikiCode(line);
line = processLineOfWikiCode(hostport, line);
}
}
return line;
@ -914,7 +914,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
* @param line line of text to be transformed from wiki code to HTML
* @return HTML fragment
*/
public String processLineOfWikiCode(String line) {
public String processLineOfWikiCode(String hostport, String line) {
//If HTML has not been replaced yet (can happen if method gets called in recursion), replace now!
if ((!replacedHtmlAlready || preformattedSpanning) && line.indexOf(WIKI_CLOSE_PRE_ESCAPED) < 0) {
line = CharacterCoding.unicode2html(line, true);
@ -925,7 +925,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
if ((line.indexOf(WIKI_OPEN_PRE_ESCAPED) >= 0) ||
(line.indexOf(WIKI_CLOSE_PRE_ESCAPED) >= 0) ||
preformattedSpanning) {
line = processPreformattedText(line);
line = processPreformattedText(hostport, line);
} else {
//tables first -> wiki-tags in cells can be treated after that
@ -970,7 +970,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
line = processOrderedList(line);
line = processDefinitionList(line);
line = processLinksAndImages(line);
line = processLinksAndImages(hostport, line);
}

@ -29,8 +29,8 @@ import java.io.UnsupportedEncodingException;
public interface WikiParser {
public String transform(String text);
public String transform(byte[] text) throws UnsupportedEncodingException;
public String transform(byte[] text, String encoding) throws UnsupportedEncodingException;
public String transform(String hostport, String text);
public String transform(String hostport, byte[] text) throws UnsupportedEncodingException;
public String transform(String hostport, byte[] text, String encoding) throws UnsupportedEncodingException;
}
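
Note: the host:port is no longer baked into the parser at construction time but passed with every transform() call, so a single WikiCode instance can follow whatever cluster address the peer currently has. A minimal, self-contained sketch of that calling pattern (not YaCy code; the toy implementation only rewrites relative links):

// Sketch only: mirrors the changed WikiParser contract, where callers
// supply the current host:port on every call instead of at construction.
import java.io.UnsupportedEncodingException;

interface WikiParser {
    String transform(String hostport, String text);
    String transform(String hostport, byte[] text) throws UnsupportedEncodingException;
    String transform(String hostport, byte[] text, String encoding) throws UnsupportedEncodingException;
}

class RelativeLinkParser implements WikiParser {
    // toy transform: prefix a relative link with the per-call host:port
    public String transform(String hostport, String text) {
        String link = text.trim();
        if (!link.contains("://")) link = "http://" + hostport + "/" + link;
        return "<a href=\"" + link + "\">" + text + "</a>";
    }
    public String transform(String hostport, byte[] text) throws UnsupportedEncodingException {
        return transform(hostport, text, "UTF-8");
    }
    public String transform(String hostport, byte[] text, String encoding) throws UnsupportedEncodingException {
        return transform(hostport, new String(text, encoding));
    }
}

class WikiParserDemo {
    public static void main(String[] args) throws UnsupportedEncodingException {
        WikiParser parser = new RelativeLinkParser();
        // callers now pass the current cluster address with each call,
        // e.g. sb.peers.mySeed().getClusterAddress() in the servlets above
        System.out.println(parser.transform("localhost:8080", "share/grafics/kaskelix.jpg"));
    }
}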

@ -543,7 +543,7 @@ public final class Switchboard extends serverSwitch {
log.logConfig("Initializing Snippet Cache");
// init the wiki
wikiParser = new WikiCode(this.peers.mySeed().getClusterAddress());
wikiParser = new WikiCode();
// initializing the resourceObserver
InstantBusyThread.oneTimeJob(ResourceObserver.class, "initThread", ResourceObserver.log, 0);
@ -822,7 +822,8 @@ public final class Switchboard extends serverSwitch {
SearchEventCache.cleanupEvents(true);
// switch the networks
synchronized (this) {
synchronized (this) {
// shut down
this.crawler.close();
this.dhtDispatcher.close();
@ -859,10 +860,8 @@ public final class Switchboard extends serverSwitch {
// relocate
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
final File mySeedFile = new File(this.networkRoot, yacySeedDB.DBFILE_OWN_SEED);
peers.relocate(
this.networkRoot,
mySeedFile,
redundancy,
partitionExponent,
this.useTailCache,

@ -229,13 +229,13 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
}
public String putWiki(final String key, final String wikiCode){
return this.put(key, Switchboard.wikiParser.transform(wikiCode));
public String putWiki(String hostport, final String key, final String wikiCode){
return this.put(key, Switchboard.wikiParser.transform(hostport, wikiCode));
}
public String putWiki(final String key, final byte[] wikiCode) {
public String putWiki(String hostport, final String key, final byte[] wikiCode) {
try {
return this.put(key, Switchboard.wikiParser.transform(wikiCode));
return this.put(key, Switchboard.wikiParser.transform(hostport, wikiCode));
} catch (final UnsupportedEncodingException e) {
return this.put(key, "Internal error pasting wiki-code: " + e.getMessage());
}

@ -145,11 +145,11 @@ public final class yacySeedDB implements AlternativeDomainNames {
public void relocate(
File newNetworkRoot,
final File myOwnSeedFile,
final int redundancy,
final int partitionExponent,
final boolean useTailCache,
final boolean exceed134217727) {
// close old databases
this.seedActiveDB.close();
this.seedPassiveDB.close();
@ -161,8 +161,13 @@ public final class yacySeedDB implements AlternativeDomainNames {
this.seedActiveDBFile = new File(newNetworkRoot, seedActiveDBFile.getName());
this.seedPassiveDBFile = new File(newNetworkRoot, seedPassiveDBFile.getName());
this.seedPotentialDBFile = new File(newNetworkRoot, seedPotentialDBFile.getName());
// read current peer name
String peername = this.myName();
this.mySeed = null; // my own seed
this.myOwnSeedFile = myOwnSeedFile;
this.myOwnSeedFile = new File(newNetworkRoot, yacySeedDB.DBFILE_OWN_SEED);
this.netRedundancy = redundancy;
this.scheme = new VerticalWordPartitionScheme(partitionExponent);
@ -275,7 +280,7 @@ public final class yacySeedDB implements AlternativeDomainNames {
} catch (final IOException e) { Log.logWarning("yacySeedDB", "could not remove hash ("+ e.getClass() +"): "+ e.getMessage()); }
}
protected void saveMySeed() {
public void saveMySeed() {
try {
this.mySeed().save(myOwnSeedFile);
} catch (final IOException e) { Log.logWarning("yacySeedDB", "could not save mySeed '"+ myOwnSeedFile +"': "+ e.getMessage()); }

@ -48,7 +48,6 @@ import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
@ -59,6 +58,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.zip.GZIPInputStream;
import de.anomic.data.wiki.WikiCode;
import de.anomic.data.wiki.WikiParser;
@ -81,24 +81,25 @@ public class MediawikiImporter extends Thread implements Importer {
public static Importer job; // if started from a servlet, this object is used to store the thread
protected WikiParser wparser;
protected String urlStub;
public File sourcefile;
public File targetdir;
public int count;
private long start;
private final long docsize;
private final int approxdocs;
private String hostport, urlStub;
public MediawikiImporter(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
public MediawikiImporter(File sourcefile, File targetdir) throws MalformedURLException {
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
this.urlStub = baseURL;
this.wparser = new WikiCode(new URL(baseURL).getHost());
this.wparser = new WikiCode();
this.count = 0;
this.start = 0;
this.hostport = null;
this.urlStub = null;
}
public int count() {
@ -138,14 +139,17 @@ public class MediawikiImporter extends Thread implements Importer {
this.start = System.currentTimeMillis();
try {
String targetstub = sourcefile.getName();
targetstub = targetstub.substring(0, targetstub.length() - 8);
InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1 * 1024 * 1024);
int p = targetstub.lastIndexOf("\\.");
if (p > 0) targetstub = targetstub.substring(0, p);
InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1024 * 1024);
if (sourcefile.getName().endsWith(".bz2")) {
int b = is.read();
if (b != 'B') throw new IOException("Invalid bz2 content.");
b = is.read();
if (b != 'Z') throw new IOException("Invalid bz2 content.");
is = new CBZip2InputStream(is);
} else if (sourcefile.getName().endsWith(".gz")) {
is = new GZIPInputStream(is);
}
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
String t;
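
Note: compression is now chosen from the file name. For .bz2 input the two leading magic bytes 'B' and 'Z' are read and checked first, apparently because the CBZip2InputStream decoder expects the stream without its two-byte header; .gz input is wrapped in java.util.zip.GZIPInputStream; anything else is read as plain XML.
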
@ -167,15 +171,27 @@ public class MediawikiImporter extends Thread implements Importer {
Future<Integer> writerResult = service.submit(writer);
wikiparserrecord record;
int p;
int q;
while ((t = r.readLine()) != null) {
if ((p = t.indexOf("<base>")) >= 0 && (q = t.indexOf("</base>", p)) > 0) {
//urlStub = "http://" + lang + ".wikipedia.org/wiki/";
urlStub = t.substring(p + 6, q);
if (!urlStub.endsWith("/")) {
q = urlStub.lastIndexOf('/');
if (q > 0) urlStub = urlStub.substring(0, q + 1);
}
DigestURI uri = new DigestURI(urlStub);
hostport = uri.getHost();
if (uri.getPort() != 80) hostport += ":" + uri.getPort();
continue;
}
if (t.indexOf(pagestart) >= 0) {
page = true;
continue;
}
if ((p = t.indexOf(textstart)) >= 0) {
text = page;
int q = t.indexOf('>', p + textstart.length());
q = t.indexOf('>', p + textstart.length());
if (q > 0) {
int u = t.indexOf(textend, q + 1);
if (u > q) {
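
Note: instead of a base URL given on the command line, the importer now takes the <base> element from the dump itself and derives both the urlStub and the hostport that is handed to the wiki parser. A small sketch of that derivation, using java.net.URL in place of YaCy's DigestURI:

// Sketch only: derive urlStub and hostport from a dump's <base> value,
// e.g. <base>http://de.wikipedia.org/wiki/Wikipedia:Hauptseite</base>
import java.net.MalformedURLException;
import java.net.URL;

class BaseUrlDemo {
    public static void main(String[] args) throws MalformedURLException {
        String base = "http://de.wikipedia.org/wiki/Wikipedia:Hauptseite";
        String urlStub = base;
        if (!urlStub.endsWith("/")) {
            int q = urlStub.lastIndexOf('/');
            if (q > 0) urlStub = urlStub.substring(0, q + 1); // "http://de.wikipedia.org/wiki/"
        }
        URL uri = new URL(urlStub);
        String hostport = uri.getHost();
        // java.net.URL returns -1 for an unspecified port; only append explicit, non-default ports
        if (uri.getPort() != -1 && uri.getPort() != 80) hostport += ":" + uri.getPort();
        System.out.println(urlStub + " -> " + hostport); // http://de.wikipedia.org/wiki/ -> de.wikipedia.org
    }
}
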
@ -185,7 +201,7 @@ public class MediawikiImporter extends Thread implements Importer {
Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
continue;
}
record = newRecord(title, sb);
record = newRecord(hostport, urlStub, title, sb);
try {
in.put(record);
this.count++;
@ -207,7 +223,7 @@ public class MediawikiImporter extends Thread implements Importer {
Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
continue;
}
record = newRecord(title, sb);
record = newRecord(hostport, urlStub, title, sb);
try {
in.put(record);
this.count++;
@ -223,7 +239,7 @@ public class MediawikiImporter extends Thread implements Importer {
}
if ((p = t.indexOf("<title>")) >= 0) {
title = t.substring(p + 7);
int q = title.indexOf("</title>");
q = title.indexOf("</title>");
if (q >= 0) title = title.substring(0, q);
continue;
}
@ -461,25 +477,26 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
public wikiparserrecord newRecord() {
return new wikiparserrecord(null, null);
return new wikiparserrecord(null, null, null, null);
}
public wikiparserrecord newRecord(String title, StringBuilder sb) {
return new wikiparserrecord(title, sb);
public wikiparserrecord newRecord(String hostport, String urlStub, String title, StringBuilder sb) {
return new wikiparserrecord(hostport, urlStub, title, sb);
}
public class wikiparserrecord {
public String title;
String source;
String html;
String source, html, hostport, urlStub;
DigestURI url;
Document document;
public wikiparserrecord(String title, StringBuilder sb) {
public wikiparserrecord(String hostport, String urlStub, String title, StringBuilder sb) {
this.title = title;
this.hostport = hostport;
this.urlStub = urlStub;
this.source = (sb == null) ? null : sb.toString();
}
public void genHTML() throws IOException {
try {
html = wparser.transform(source);
html = wparser.transform(hostport, source);
} catch (Exception e) {
Log.logException(e);
throw new IOException(e.getMessage());
@ -734,13 +751,13 @@ public class MediawikiImporter extends Thread implements Importer {
// example:
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert") && s.length > 2 && s[1].endsWith(".xml.bz2") && s[3].startsWith("http://")) {
if (s[0].equals("-convert") && s.length > 2) {
File sourcefile = new File(s[1]);
File targetdir = new File(s[2]);
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
//String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
//String language = urlStub.substring(7,9);
try {
MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir, urlStub);
MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir);
mi.start();
mi.join();
} catch (InterruptedException e) {
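
Note: with the two-argument constructor the -convert mode should no longer need the wiki base URL on the command line; an invocation like the example in the comment above, minus its trailing http://de.wikipedia.org/wiki/ argument, should be sufficient, since the base URL is read from the dump's <base> element instead.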
