fix for mediawiki import

pull/1/head
Michael Peter Christen 10 years ago
parent b07afbc115
commit 3b51636ecb

@@ -20,7 +20,7 @@
<a href="http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2"> <a href="http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2">
http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2</a>. http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2</a>.
<br /> <br />
Dumps must be in XML format and may be compressed in gz or bz2. Place the file in the YaCy folder or in one of its sub-folders. Dumps must be stored in the local file system in XML format and may be compressed in gz or bz2.
<br /> <br />
<input name="file" type="text" value="" size="80" /> <input name="file" type="text" value="" size="80" />
<input name="submit" type="submit" value="Import MediaWiki Dump" /> <input name="submit" type="submit" value="Import MediaWiki Dump" />

@@ -53,15 +53,22 @@ public class IndexImportMediawiki_p {
prop.put("import_status", 0); prop.put("import_status", 0);
} else { } else {
if (post.containsKey("file")) { if (post.containsKey("file")) {
final File sourcefile = new File(post.get("file")); String file = post.get("file");
if (sourcefile.exists()) { if (file.startsWith("file://")) file = file.substring(7);
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath); if (file.startsWith("http")) {
MediawikiImporter.job.start();
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_thread", "started");
} else {
prop.put("import_dump", ""); prop.put("import_dump", "");
prop.put("import_thread", "Error: file not found ["+sourcefile+"]"); prop.put("import_thread", "Error: file argument must be a path to a document in the local file system");
} else {
final File sourcefile = new File(file);
if (sourcefile.exists()) {
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_thread", "started");
} else {
prop.put("import_dump", "");
prop.put("import_thread", "Error: file not found ["+sourcefile+"]");
}
} }
prop.put("import", 1); prop.put("import", 1);
prop.put("import_count", 0); prop.put("import_count", 0);

@@ -2840,7 +2840,7 @@ public final class Switchboard extends serverSwitch {
} }
// remove stopwords // remove stopwords
this.log.info("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url); this.log.info("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url.toNormalform(true));
// STORE WORD INDEX // STORE WORD INDEX
SolrInputDocument newEntry = SolrInputDocument newEntry =

@@ -718,7 +718,7 @@ public class Segment {
final long indexingEndTime = System.currentTimeMillis(); final long indexingEndTime = System.currentTimeMillis();
if (this.log.isInfo()) { if (this.log.isInfo()) {
this.log.info("*Indexed " + condenser.words().size() + " words in URL " + url + this.log.info("*Indexed " + condenser.words().size() + " words in URL " + url.toNormalform(true) +
" [" + id + "]" + " [" + id + "]" +
"\n\tDescription: " + dc_title + "\n\tDescription: " + dc_title +
"\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " + "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +

Loading…
Cancel
Save