fix for mediawiki import

pull/1/head
Michael Peter Christen 10 years ago
parent b07afbc115
commit 3b51636ecb

@@ -20,7 +20,7 @@
<a href="http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2">
http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2</a>.
<br />
Dumps must be in XML format and may be compressed in gz or bz2. Place the file in the YaCy folder or in one of its sub-folders.
Dumps must be stored in the local file system in XML format and may be compressed in gz or bz2.
<br />
<input name="file" type="text" value="" size="80" />
<input name="submit" type="submit" value="Import MediaWiki Dump" />

@@ -53,7 +53,13 @@ public class IndexImportMediawiki_p {
prop.put("import_status", 0);
} else {
if (post.containsKey("file")) {
final File sourcefile = new File(post.get("file"));
String file = post.get("file");
if (file.startsWith("file://")) file = file.substring(7);
if (file.startsWith("http")) {
prop.put("import_dump", "");
prop.put("import_thread", "Error: file argument must be a path to a document in the local file system");
} else {
final File sourcefile = new File(file);
if (sourcefile.exists()) {
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start();
@@ -63,6 +69,7 @@ public class IndexImportMediawiki_p {
prop.put("import_dump", "");
prop.put("import_thread", "Error: file not found ["+sourcefile+"]");
}
}
prop.put("import", 1);
prop.put("import_count", 0);
prop.put("import_speed", 0);

@@ -2840,7 +2840,7 @@ public final class Switchboard extends serverSwitch {
}
// remove stopwords
this.log.info("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
this.log.info("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url.toNormalform(true));
// STORE WORD INDEX
SolrInputDocument newEntry =

@@ -718,7 +718,7 @@ public class Segment {
final long indexingEndTime = System.currentTimeMillis();
if (this.log.isInfo()) {
this.log.info("*Indexed " + condenser.words().size() + " words in URL " + url +
this.log.info("*Indexed " + condenser.words().size() + " words in URL " + url.toNormalform(true) +
" [" + id + "]" +
"\n\tDescription: " + dc_title +
"\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +

Loading…
Cancel
Save