From 3b51636ecbc8580b5ff8a5e3e94ac92d2baad6a9 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 12 Jan 2015 00:35:47 +0100 Subject: [PATCH] fix for mediawiki import --- htroot/IndexImportMediawiki_p.html | 2 +- htroot/IndexImportMediawiki_p.java | 23 +++++++++++++++-------- source/net/yacy/search/Switchboard.java | 2 +- source/net/yacy/search/index/Segment.java | 2 +- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/htroot/IndexImportMediawiki_p.html b/htroot/IndexImportMediawiki_p.html index f106e278d..3bf333b76 100644 --- a/htroot/IndexImportMediawiki_p.html +++ b/htroot/IndexImportMediawiki_p.html @@ -20,7 +20,7 @@ http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2.
- Dumps must be in XML format and may be compressed in gz or bz2. Place the file in the YaCy folder or in one of its sub-folders. + Dumps must be stored in the local file system in XML format and may be compressed in gz or bz2.
diff --git a/htroot/IndexImportMediawiki_p.java b/htroot/IndexImportMediawiki_p.java index f36fd4b57..37a1c6b87 100644 --- a/htroot/IndexImportMediawiki_p.java +++ b/htroot/IndexImportMediawiki_p.java @@ -53,15 +53,22 @@ public class IndexImportMediawiki_p { prop.put("import_status", 0); } else { if (post.containsKey("file")) { - final File sourcefile = new File(post.get("file")); - if (sourcefile.exists()) { - MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath); - MediawikiImporter.job.start(); - prop.put("import_dump", MediawikiImporter.job.source()); - prop.put("import_thread", "started"); - } else { + String file = post.get("file"); + if (file.startsWith("file://")) file = file.substring(7); + if (file.startsWith("http")) { prop.put("import_dump", ""); - prop.put("import_thread", "Error: file not found ["+sourcefile+"]"); + prop.put("import_thread", "Error: file argument must be a path to a document in the local file system"); + } else { + final File sourcefile = new File(file); + if (sourcefile.exists()) { + MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath); + MediawikiImporter.job.start(); + prop.put("import_dump", MediawikiImporter.job.source()); + prop.put("import_thread", "started"); + } else { + prop.put("import_dump", ""); + prop.put("import_thread", "Error: file not found ["+sourcefile+"]"); + } } prop.put("import", 1); prop.put("import_count", 0); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 4b9f29f33..d735dba3a 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2840,7 +2840,7 @@ public final class Switchboard extends serverSwitch { } // remove stopwords - this.log.info("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url); + this.log.info("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url.toNormalform(true)); // STORE WORD INDEX SolrInputDocument newEntry = diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 898d1886b..295ca338d 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -718,7 +718,7 @@ public class Segment { final long indexingEndTime = System.currentTimeMillis(); if (this.log.isInfo()) { - this.log.info("*Indexed " + condenser.words().size() + " words in URL " + url + + this.log.info("*Indexed " + condenser.words().size() + " words in URL " + url.toNormalform(true) + " [" + id + "]" + "\n\tDescription: " + dc_title + "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +