diff --git a/bin/importmediawiki.sh b/bin/importmediawiki.sh
new file mode 100755
index 000000000..ee43d81c4
--- /dev/null
+++ b/bin/importmediawiki.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd "$(dirname "$0")"
+./apicall.sh "/IndexImportWikimedia_p.html?file=$1" > /dev/null
diff --git a/htroot/Blog.java b/htroot/Blog.java
index 5571b6154..1ed3644b6 100644
--- a/htroot/Blog.java
+++ b/htroot/Blog.java
@@ -191,7 +191,7 @@ public class Blog {
             prop.putHTML("mode_author", UTF8.String(author));
             prop.putHTML("mode_subject", post.get("subject",""));
             prop.put("mode_date", dateString(new Date()));
-            prop.putWiki("mode_page", post.get("content", ""));
+            prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", post.get("content", ""));
             prop.putHTML("mode_page-code", post.get("content", ""));
         } else {
@@ -234,7 +234,7 @@ public class Blog {
         else {
             //only show 1 entry
             prop.put("mode_entries", "1");
-            putBlogEntry(prop, page, address, 0, hasRights, xml);
+            putBlogEntry(sb, prop, page, address, 0, hasRights, xml);
         }
     }
@@ -263,6 +263,7 @@ public class Blog {
         while (i.hasNext() && (num == 0 || num > count)) {
             if(0 < start--) continue;
             putBlogEntry(
+                    switchboard,
                     prop,
                     switchboard.blogDB.readBlogEntry(i.next()),
                     address,
@@ -293,6 +294,7 @@ public class Blog {
     }

     private static serverObjects putBlogEntry(
+            final Switchboard sb,
             final serverObjects prop,
             final BlogBoard.BlogEntry entry,
             final String address,
@@ -324,7 +326,7 @@ public class Blog {
             prop.put("mode_entries_" + number + "_page", entry.getPage());
             prop.put("mode_entries_" + number + "_timestamp", entry.getTimestamp());
         } else {
-            prop.putWiki("mode_entries_" + number + "_page", entry.getPage());
+            prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_entries_" + number + "_page", entry.getPage());
         }
         if (hasRights) {
diff --git a/htroot/BlogComments.java b/htroot/BlogComments.java
index 2b4ef5f94..040967b20 100644
--- a/htroot/BlogComments.java
+++ b/htroot/BlogComments.java
@@ -175,7 +175,7 @@ public class BlogComments {
             prop.putHTML("mode_allow_author", UTF8.String(author));
             prop.putHTML("mode_subject", post.get("subject",""));
             prop.put("mode_date", dateString(new Date()));
-            prop.putWiki("mode_page", post.get("content", ""));
+            prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", post.get("content", ""));
             prop.put("mode_page-code", post.get("content", ""));
         } else {
             // show blog-entry/entries
@@ -191,7 +191,7 @@ public class BlogComments {
             prop.putHTML("mode_allow_author", UTF8.String(author));
             prop.put("mode_comments", page.getCommentsSize());
             prop.put("mode_date", dateString(page.getDate()));
-            prop.putWiki("mode_page", page.getPage());
+            prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", page.getPage());
             if (hasRights) {
                 prop.put("mode_admin", "1");
                 prop.put("mode_admin_pageid", page.getKey());
@@ -234,7 +234,7 @@ public class BlogComments {
             if (!xml) {
                 prop.putHTML("mode_entries_"+count+"_subject", UTF8.String(entry.getSubject()));
                 prop.putHTML("mode_entries_"+count+"_author", UTF8.String(entry.getAuthor()));
-                prop.putWiki("mode_entries_"+count+"_page", entry.getPage());
+                prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_entries_"+count+"_page", entry.getPage());
             } else {
                 prop.putHTML("mode_entries_"+count+"_subject", UTF8.String(entry.getSubject()));
                 prop.putHTML("mode_entries_"+count+"_author", UTF8.String(entry.getAuthor()));
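
Review note: the common thread of the servlet changes above and below is that every putWiki caller now passes the peer's current cluster address, so the wiki renderer can absolutize relative links per call instead of caching the address at construction time. A minimal sketch of the new call shape (content value hypothetical):

```java
// hostport, e.g. "localhost:8090", is read from the live seed at render time,
// so it stays correct after a network switch or port change
final String hostport = sb.peers.mySeed().getClusterAddress();
prop.putWiki(hostport, "mode_page", post.get("content", ""));
```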
diff --git a/htroot/ConfigBasic.java b/htroot/ConfigBasic.java
index 431217546..408a2d28d 100644
--- a/htroot/ConfigBasic.java
+++ b/htroot/ConfigBasic.java
@@ -103,11 +103,11 @@ public class ConfigBasic {
         // check if peer name already exists
         final yacySeed oldSeed = sb.peers.lookupByName(peerName);
-        if (oldSeed == null && !peerName.equals(sb.peers.mySeed().getName())) {
-            // the name is new
-            if (Pattern.compile("[A-Za-z0-9\\-_]{3,80}").matcher(peerName).matches()) {
-                sb.peers.mySeed().setName(peerName);
-            }
+        if (oldSeed == null &&
+            !peerName.equals(sb.peers.mySeed().getName()) &&
+            Pattern.compile("[A-Za-z0-9\\-_]{3,80}").matcher(peerName).matches()) {
+            sb.peers.mySeed().setName(peerName);
+            sb.peers.saveMySeed();
         }

         // UPnP config
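
Review note: the three conditions are folded into one guard, and a renamed peer is now persisted immediately through the newly public saveMySeed(). The name rule itself is unchanged; for illustration (java.util.regex.Pattern):

```java
// peer names: 3 to 80 characters from A-Z, a-z, 0-9, '-' and '_'
final Pattern validName = Pattern.compile("[A-Za-z0-9\\-_]{3,80}");
System.out.println(validName.matcher("my-peer_01").matches()); // true
System.out.println(validName.matcher("ab").matches());         // false: shorter than 3 chars
```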
diff --git a/htroot/IndexImportWikimedia_p.html b/htroot/IndexImportWikimedia_p.html
index f3225568a..ebf688816 100644
--- a/htroot/IndexImportWikimedia_p.html
+++ b/htroot/IndexImportWikimedia_p.html
@@ -15,14 +15,14 @@
-      Wikimedia Dump File Selection: select a 'bz2' file
+      Wikimedia Dump File Selection: select an XML file (which may be bz2- or gz-encoded)
       You can import Wikipedia dumps here. An example is the file
       http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2.
-      Dumps must be in XML format and must be encoded in bz2. Do not decompress the file after downloading!
+      Dumps must be in XML format and may be compressed in gz or bz2. Uncompressed XML is also fine.
-
+
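
Review note: what the form now accepts is mirrored by the importer further down in this patch: plain .xml is read directly, .bz2 and .gz are unpacked on the fly (CBZip2InputStream comes from the bundled Apache bzip2 library). Condensed:

```java
InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1024 * 1024);
if (sourcefile.getName().endsWith(".bz2")) {
    // CBZip2InputStream expects the caller to have consumed the two "BZ" magic bytes
    if (is.read() != 'B' || is.read() != 'Z') throw new IOException("Invalid bz2 content.");
    is = new CBZip2InputStream(is);
} else if (sourcefile.getName().endsWith(".gz")) {
    is = new GZIPInputStream(is);
}   // anything else is treated as uncompressed XML
```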
diff --git a/htroot/IndexImportWikimedia_p.java b/htroot/IndexImportWikimedia_p.java
index 336411140..17581de59 100644
--- a/htroot/IndexImportWikimedia_p.java
+++ b/htroot/IndexImportWikimedia_p.java
@@ -57,16 +57,17 @@ public class IndexImportWikimedia_p {
         } else {
             if (post.containsKey("file")) {
                 final File sourcefile = new File(post.get("file"));
-                final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
+                //final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
+                /*
                 if (!name.endsWith("pages-articles.xml.bz2")) {
                     prop.put("import", 0);
                     prop.put("import_status", 1);
                     prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'");
                     return prop;
                 }
-                final String lang = name.substring(0, 2);
+                */
                 try {
-                    MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
+                    MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
                     MediawikiImporter.job.start();
                     prop.put("import", 1);
                     prop.put("import_thread", "started");
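
Review note: with the language no longer guessed from the first two characters of the file name, the servlet boils down to the following (path hypothetical):

```java
final File sourcefile = new File("DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2");
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start(); // runs as a background thread; the static job field lets the page poll progress
```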
diff --git a/htroot/MessageSend_p.java b/htroot/MessageSend_p.java
index 5d819d9a5..8ac9615f6 100644
--- a/htroot/MessageSend_p.java
+++ b/htroot/MessageSend_p.java
@@ -107,7 +107,7 @@ public class MessageSend_p {
             prop.putXML("mode_permission_message", message);
             prop.putHTML("mode_permission_hash", hash);
             if (post.containsKey("preview")) {
-                prop.putWiki("mode_permission_previewmessage", message);
+                prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_permission_previewmessage", message);
             }
diff --git a/htroot/Messages_p.java b/htroot/Messages_p.java
index 135d01b02..61193d2ca 100644
--- a/htroot/Messages_p.java
+++ b/htroot/Messages_p.java
@@ -160,7 +160,7 @@ public class Messages_p {
             prop.putXML("mode_subject", message.subject());
             String theMessage = null;
             theMessage = UTF8.String(message.message());
-            prop.putWiki("mode_message", theMessage);
+            prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_message", theMessage);
             prop.put("mode_hash", message.authorHash());
             prop.putXML("mode_key", key);
         }
diff --git a/htroot/ViewProfile.java b/htroot/ViewProfile.java
index 0b2435acb..022fa40dc 100644
--- a/htroot/ViewProfile.java
+++ b/htroot/ViewProfile.java
@@ -162,7 +162,7 @@ public class ViewProfile {
                 prop.put("success_" + key, "1");
                 // only comments get "wikified"
                 if(key.equals("comment")){
-                    prop.putWiki(
+                    prop.putWiki(sb.peers.mySeed().getClusterAddress(),
                             "success_" + key + "_value",
                             entry.getValue().replaceAll("\r", "").replaceAll("\\\\n", "\n"));
                     prop.put("success_" + key + "_b64value", Base64Order.standardCoder.encodeString(entry.getValue()));
diff --git a/htroot/Wiki.java b/htroot/Wiki.java
index f5575b3f7..c9021abc3 100644
--- a/htroot/Wiki.java
+++ b/htroot/Wiki.java
@@ -152,7 +152,7 @@ public class Wiki {
             prop.put("mode_display", display);
             prop.putHTML("mode_author", author);
             prop.put("mode_date", dateString(new Date()));
-            prop.putWiki("mode_page", post.get("content", ""));
+            prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", post.get("content", ""));
             prop.putHTML("mode_page-code", post.get("content", ""));
         }
         //end contrib of [MN]
@@ -247,7 +247,7 @@ public class Wiki {
                     prop.put("mode_versioning_display", display);
                     prop.putHTML("mode_versioning_author", oentry.author());
                     prop.put("mode_versioning_date", dateString(oentry.date()));
-                    prop.putWiki("mode_versioning_page", oentry.page());
+                    prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_versioning_page", oentry.page());
                     prop.putHTML("mode_versioning_page-code", UTF8.String(oentry.page()));
                 }
             } catch (final IOException e) {
@@ -263,7 +263,7 @@ public class Wiki {
             prop.put("mode_display", display);
             prop.putHTML("mode_author", page.author());
             prop.put("mode_date", dateString(page.date()));
-            prop.putWiki("mode_page", page.page());
+            prop.putWiki(sb.peers.mySeed().getClusterAddress(), "mode_page", page.page());
             prop.put("controls", "0");
             prop.putHTML("controls_pagename", pagename);
diff --git a/htroot/mediawiki_p.java b/htroot/mediawiki_p.java
index 2110b8260..a73a29386 100644
--- a/htroot/mediawiki_p.java
+++ b/htroot/mediawiki_p.java
@@ -71,7 +71,7 @@ public class mediawiki_p {
         page = page.substring(p, q);

         prop.putHTML("title", title);
-        prop.putWiki("page", page);
+        prop.putWiki(sb.peers.mySeed().getClusterAddress(), "page", page);

         return prop;
     }
diff --git a/source/de/anomic/data/wiki/AbstractWikiParser.java b/source/de/anomic/data/wiki/AbstractWikiParser.java
index ef59ee34b..202ccbc71 100644
--- a/source/de/anomic/data/wiki/AbstractWikiParser.java
+++ b/source/de/anomic/data/wiki/AbstractWikiParser.java
@@ -34,17 +34,12 @@ import java.io.UnsupportedEncodingException;

 abstract class AbstractWikiParser implements WikiParser {

-    final String address;
-
-    public AbstractWikiParser(final String address) {
-        this.address = address;
-    }
-
-    protected abstract String transform(BufferedReader reader, int length) throws IOException;
+    protected abstract String transform(String hostport, BufferedReader reader, int length) throws IOException;

-    public String transform(final String content) {
+    public String transform(String hostport, final String content) {
         try {
             return transform(
+                    hostport,
                     new BufferedReader(new StringReader(content)),
                     content.length());
         } catch (final IOException e) {
@@ -52,9 +47,10 @@ abstract class AbstractWikiParser implements WikiParser {
         }
     }

-    public String transform(final String content, final String publicAddress) {
+    public String transform(String hostport, final String content, final String publicAddress) {
         try {
             return transform(
+                    hostport,
                     new BufferedReader(new StringReader(content)),
                     content.length());
         } catch (final IOException e) {
@@ -62,14 +58,15 @@ abstract class AbstractWikiParser implements WikiParser {
         }
     }

-    public String transform(final byte[] content) throws UnsupportedEncodingException {
-        return transform(content, "UTF-8");
+    public String transform(String hostport, final byte[] content) throws UnsupportedEncodingException {
+        return transform(hostport, content, "UTF-8");
     }

-    public String transform(final byte[] content, final String encoding, final String publicAddress) {
+    public String transform(String hostport, final byte[] content, final String encoding, final String publicAddress) {
         final ByteArrayInputStream bais = new ByteArrayInputStream(content);
         try {
             return transform(
+                    hostport,
                     new BufferedReader(new InputStreamReader(bais, encoding)),
                     content.length);
         } catch (final IOException e) {
@@ -77,10 +74,11 @@ abstract class AbstractWikiParser implements WikiParser {
         }
     }

-    public String transform(final byte[] content, final String encoding) throws UnsupportedEncodingException {
+    public String transform(String hostport, final byte[] content, final String encoding) throws UnsupportedEncodingException {
         final ByteArrayInputStream bais = new ByteArrayInputStream(content);
         try {
             return transform(
+                    hostport,
                     new BufferedReader(new InputStreamReader(bais, encoding)),
                     content.length);
         } catch (final IOException e) {
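
Review note: with the address removed from parser state, a single WikiCode instance can serve any network; callers supply host:port per transformation. A usage sketch (hostport value assumed):

```java
final WikiParser parser = new WikiCode();
final String html = parser.transform("localhost:8090", "[[Image:grafics/kaskelix.jpg]]");
// the relative image path is rewritten to http://localhost:8090/grafics/kaskelix.jpg
```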
diff --git a/source/de/anomic/data/wiki/WikiCode.java b/source/de/anomic/data/wiki/WikiCode.java
index c9a06fa6b..54b6b43f9 100644
--- a/source/de/anomic/data/wiki/WikiCode.java
+++ b/source/de/anomic/data/wiki/WikiCode.java
@@ -190,8 +190,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
      * Constructor
      * @param address
      */
-    public WikiCode(final String address) {
-        super(address);
+    public WikiCode() {
+        super();
     }

     /**
@@ -201,12 +201,12 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
      * @return HTML fragment.
      * @throws IOException in case input from reader can not be read.
      */
-    protected String transform(final BufferedReader reader, final int length)
+    protected String transform(String hostport, final BufferedReader reader, final int length)
             throws IOException {
         final StringBuilder out = new StringBuilder(length);
         String line;
         while ((line = reader.readLine()) != null) {
-            out.append(processLineOfWikiCode(line)).append(serverCore.CRLF_STRING);
+            out.append(processLineOfWikiCode(hostport, line)).append(serverCore.CRLF_STRING);
         }
         return out.insert(0, createTableOfContents()).toString();
     }
@@ -531,7 +531,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
      * @param line line of text to be transformed from wiki code to HTML
      * @return HTML fragment
      */
-    private String processLinksAndImages(String line) {
+    private String processLinksAndImages(String hostport, String line) {

         // create links
         String kl, kv, alt, align;
@@ -586,7 +586,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                 // or an image DATA/HTDOCS/grafics/kaskelix.jpg with [[Image:grafics/kaskelix.jpg]]
                 // you are free to use other sub-paths of DATA/HTDOCS
                 if (kl.indexOf("://") < 1) {
-                    kl = "http://" + super.address + "/" + kl;
+                    kl = "http://" + hostport + "/" + kl;
                 }
                 line = line.substring(0, positionOfOpeningTag) + "<img src=\"" + kl + "\"" + align + alt + ">" + line.substring(positionOfClosingTag + 2);
@@ -623,7 +623,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                 // or a file DATA/HTDOCS/www/page.html with [www/page.html]
                 // you are free to use other sub-paths of DATA/HTDOCS
                 if (kl.indexOf("://") < 1) {
-                    kl = "http://" + super.address + "/" + kl;
+                    kl = "http://" + hostport + "/" + kl;
                 }
                 line = line.substring(0, positionOfOpeningTag) + "<a href=\"" + kl + "\">" + kv + "</a>" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_EXTERNAL_LINK);
             }
@@ -635,7 +635,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
      * @param line line of text to be transformed from wiki code to HTML
      * @return HTML fragment
      */
-    private String processPreformattedText(String line) {
+    private String processPreformattedText(String hostport, String line) {
         if (!escaped) {
             final int positionOfOpeningTag = line.indexOf(WIKI_OPEN_PRE_ESCAPED);
             final int positionOfClosingTag = line.indexOf(WIKI_CLOSE_PRE_ESCAPED);
@@ -647,15 +647,15 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                 preformattedText.append(line.substring(positionOfOpeningTag + LEN_WIKI_OPEN_PRE_ESCAPED, positionOfClosingTag));
                 preformattedText.append("</pre>");
-                line = processLineOfWikiCode(line.substring(0, positionOfOpeningTag).replaceAll("!pre!", "!pre!!") + "!pre!txt!" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_PRE_ESCAPED).replaceAll("!pre!", "!pre!!"));
+                line = processLineOfWikiCode(hostport, line.substring(0, positionOfOpeningTag).replaceAll("!pre!", "!pre!!") + "!pre!txt!" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_PRE_ESCAPED).replaceAll("!pre!", "!pre!!"));
                 line = line.replaceAll("!pre!txt!", preformattedText.toString().replaceAll("!pre!", "!pre!!"));
                 line = line.replaceAll("!pre!!", "!pre!");
             }
            //handles cases like <pre> </pre> <pre> </pre> that would cause an exception otherwise
            else {
                 processingPreformattedText = true;
-                final String temp1 = processLineOfWikiCode(line.substring(0, positionOfOpeningTag - 1).replaceAll("!tmp!", "!tmp!!") + "!tmp!txt!");
+                final String temp1 = processLineOfWikiCode(hostport, line.substring(0, positionOfOpeningTag - 1).replaceAll("!tmp!", "!tmp!!") + "!tmp!txt!");
                 noList = true;
-                final String temp2 = processLineOfWikiCode(line.substring(positionOfOpeningTag));
+                final String temp2 = processLineOfWikiCode(hostport, line.substring(positionOfOpeningTag));
                 noList = false;
                 line = temp1.replaceAll("!tmp!txt!", temp2);
                 line = line.replaceAll("!tmp!!", "!tmp!");
@@ -673,7 +673,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                     preindented++;
                     openBlockQuoteTags.append(HTML_OPEN_BLOCKQUOTE);
                 }
-                line = processLineOfWikiCode(line.substring(preindented, positionOfOpeningTag).replaceAll("!pre!", "!pre!!") + "!pre!txt!");
+                line = processLineOfWikiCode(hostport, line.substring(preindented, positionOfOpeningTag).replaceAll("!pre!", "!pre!!") + "!pre!txt!");
                 line = openBlockQuoteTags + line.replaceAll("!pre!txt!", preformattedText);
                 line = line.replaceAll("!pre!!", "!pre!");
                 preformattedSpanning = true;
@@ -688,7 +688,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                     endBlockQuoteTags.append(HTML_CLOSE_BLOCKQUOTE);
                     preindented--;
                 }
-                line = processLineOfWikiCode("!pre!txt!" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_PRE_ESCAPED).replaceAll("!pre!", "!pre!!"));
+                line = processLineOfWikiCode(hostport, "!pre!txt!" + line.substring(positionOfClosingTag + LEN_WIKI_CLOSE_PRE_ESCAPED).replaceAll("!pre!", "!pre!!"));
                 line = line.replaceAll("!pre!txt!", preformattedText) + endBlockQuoteTags;
                 line = line.replaceAll("!pre!!", "!pre!");
                 processingPreformattedText = false;
@@ -698,7 +698,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
                 while ((posTag = line.indexOf(WIKI_CLOSE_PRE_ESCAPED)) >= 0) {
                     line = line.substring(0, posTag) + line.substring(posTag + LEN_WIKI_CLOSE_PRE_ESCAPED);
                 }
-                line = processLineOfWikiCode(line);
+                line = processLineOfWikiCode(hostport, line);
             }
         }
         return line;
@@ -914,7 +914,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
      * @param line line of text to be transformed from wiki code to HTML
      * @return HTML fragment
      */
-    public String processLineOfWikiCode(String line) {
+    public String processLineOfWikiCode(String hostport, String line) {
         //If HTML has not been replaced yet (can happen if method gets called in recursion), replace now!
         if ((!replacedHtmlAlready || preformattedSpanning) && line.indexOf(WIKI_CLOSE_PRE_ESCAPED) < 0) {
             line = CharacterCoding.unicode2html(line, true);
@@ -925,7 +925,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
         if ((line.indexOf(WIKI_OPEN_PRE_ESCAPED) >= 0) ||
                 (line.indexOf(WIKI_CLOSE_PRE_ESCAPED) >= 0) ||
                 preformattedSpanning) {
-            line = processPreformattedText(line);
+            line = processPreformattedText(hostport, line);
         } else {

             //tables first -> wiki-tags in cells can be treated after that
@@ -970,7 +970,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
             line = processOrderedList(line);
             line = processDefinitionList(line);

-            line = processLinksAndImages(line);
+            line = processLinksAndImages(hostport, line);
         }
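
Review note: the rewriting rule that hostport feeds is simple enough to state standalone; a condensed equivalent of the two patched sites in processLinksAndImages:

```java
static String absolutize(final String hostport, final String link) {
    // links that already carry a scheme ("://") pass through untouched;
    // everything else is assumed to live under DATA/HTDOCS on this peer
    return (link.indexOf("://") < 1) ? "http://" + hostport + "/" + link : link;
}
```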
diff --git a/source/de/anomic/data/wiki/WikiParser.java b/source/de/anomic/data/wiki/WikiParser.java
index 32e06b8a7..88b46de0e 100644
--- a/source/de/anomic/data/wiki/WikiParser.java
+++ b/source/de/anomic/data/wiki/WikiParser.java
@@ -29,8 +29,8 @@ import java.io.UnsupportedEncodingException;

 public interface WikiParser {

-    public String transform(String text);
-    public String transform(byte[] text) throws UnsupportedEncodingException;
-    public String transform(byte[] text, String encoding) throws UnsupportedEncodingException;
+    public String transform(String hostport, String text);
+    public String transform(String hostport, byte[] text) throws UnsupportedEncodingException;
+    public String transform(String hostport, byte[] text, String encoding) throws UnsupportedEncodingException;
 }
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index b992937b5..72d4aba7b 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -543,7 +543,7 @@ public final class Switchboard extends serverSwitch {
         log.logConfig("Initializing Snippet Cache");

         // init the wiki
-        wikiParser = new WikiCode(this.peers.mySeed().getClusterAddress());
+        wikiParser = new WikiCode();

         // initializing the resourceObserver
         InstantBusyThread.oneTimeJob(ResourceObserver.class, "initThread", ResourceObserver.log, 0);
@@ -822,7 +822,8 @@ public final class Switchboard extends serverSwitch {
         SearchEventCache.cleanupEvents(true);

         // switch the networks
-        synchronized (this) {
+        synchronized (this) {
+
             // shut down
             this.crawler.close();
             this.dhtDispatcher.close();
@@ -859,10 +860,8 @@ public final class Switchboard extends serverSwitch {

             // relocate
             this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
-            final File mySeedFile = new File(this.networkRoot, yacySeedDB.DBFILE_OWN_SEED);
             peers.relocate(
                     this.networkRoot,
-                    mySeedFile,
                     redundancy,
                     partitionExponent,
                     this.useTailCache,
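
Review note: the caller side of the slimmed relocate() no longer builds the own-seed File, because yacySeedDB derives it from the network root itself (see next diff). Condensed, with the trailing flag per the new five-argument signature:

```java
// all values come from switchboard state; the File argument is gone
peers.relocate(this.networkRoot, redundancy, partitionExponent, this.useTailCache, exceed134217727);
```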
diff --git a/source/de/anomic/server/serverObjects.java b/source/de/anomic/server/serverObjects.java
index e5f97d01d..6d82edc0e 100644
--- a/source/de/anomic/server/serverObjects.java
+++ b/source/de/anomic/server/serverObjects.java
@@ -229,13 +229,13 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
     }

-    public String putWiki(final String key, final String wikiCode){
-        return this.put(key, Switchboard.wikiParser.transform(wikiCode));
+    public String putWiki(String hostport, final String key, final String wikiCode){
+        return this.put(key, Switchboard.wikiParser.transform(hostport, wikiCode));
     }

-    public String putWiki(final String key, final byte[] wikiCode) {
+    public String putWiki(String hostport, final String key, final byte[] wikiCode) {
         try {
-            return this.put(key, Switchboard.wikiParser.transform(wikiCode));
+            return this.put(key, Switchboard.wikiParser.transform(hostport, wikiCode));
         } catch (final UnsupportedEncodingException e) {
             return this.put(key, "Internal error pasting wiki-code: " + e.getMessage());
         }
diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java
index 413e695be..ca3e98ff4 100644
--- a/source/de/anomic/yacy/yacySeedDB.java
+++ b/source/de/anomic/yacy/yacySeedDB.java
@@ -145,11 +145,11 @@ public final class yacySeedDB implements AlternativeDomainNames {

     public void relocate(
             File newNetworkRoot,
-            final File myOwnSeedFile,
             final int redundancy,
             final int partitionExponent,
             final boolean useTailCache,
             final boolean exceed134217727) {
+
         // close old databases
         this.seedActiveDB.close();
         this.seedPassiveDB.close();
@@ -161,8 +161,13 @@ public final class yacySeedDB implements AlternativeDomainNames {
         this.seedActiveDBFile = new File(newNetworkRoot, seedActiveDBFile.getName());
         this.seedPassiveDBFile = new File(newNetworkRoot, seedPassiveDBFile.getName());
         this.seedPotentialDBFile = new File(newNetworkRoot, seedPotentialDBFile.getName());
+
+        // read current peer name
+        String peername = this.myName();
+        this.mySeed = null;

         // my own seed
-        this.myOwnSeedFile = myOwnSeedFile;
+        this.myOwnSeedFile = new File(newNetworkRoot, yacySeedDB.DBFILE_OWN_SEED);
         this.netRedundancy = redundancy;
         this.scheme = new VerticalWordPartitionScheme(partitionExponent);
@@ -275,7 +280,7 @@ public final class yacySeedDB implements AlternativeDomainNames {
         } catch (final IOException e) { Log.logWarning("yacySeedDB", "could not remove hash ("+ e.getClass() +"): "+ e.getMessage()); }
     }

-    protected void saveMySeed() {
+    public void saveMySeed() {
         try {
             this.mySeed().save(myOwnSeedFile);
         } catch (final IOException e) { Log.logWarning("yacySeedDB", "could not save mySeed '"+ myOwnSeedFile +"': "+ e.getMessage()); }
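
Review note: inside relocate() the peer name is captured before the seed is reset, so switching networks keeps the name the operator chose. A condensed view of the sequence:

```java
String peername = this.myName();                                 // remember the current name
this.mySeed = null;                                              // force re-initialization under the new network
this.myOwnSeedFile = new File(newNetworkRoot, DBFILE_OWN_SEED);  // derived, no longer passed in by the caller
```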
diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java
index 312501803..570c0268f 100644
--- a/source/net/yacy/document/importer/MediawikiImporter.java
+++ b/source/net/yacy/document/importer/MediawikiImporter.java
@@ -48,7 +48,6 @@ import java.io.PrintWriter;
 import java.io.RandomAccessFile;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
-import java.net.URL;
 import java.util.Date;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
@@ -59,6 +58,7 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
+import java.util.zip.GZIPInputStream;

 import de.anomic.data.wiki.WikiCode;
 import de.anomic.data.wiki.WikiParser;
@@ -81,24 +81,25 @@ public class MediawikiImporter extends Thread implements Importer {
     public static Importer job; // if started from a servlet, this object is used to store the thread

     protected WikiParser wparser;
-    protected String urlStub;
     public File sourcefile;
     public File targetdir;
     public int count;
     private long start;
     private final long docsize;
     private final int approxdocs;
+    private String hostport, urlStub;

-    public MediawikiImporter(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
+    public MediawikiImporter(File sourcefile, File targetdir) throws MalformedURLException {
         this.sourcefile = sourcefile;
         this.docsize = sourcefile.length();
         this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
         this.targetdir = targetdir;
-        this.urlStub = baseURL;
-        this.wparser = new WikiCode(new URL(baseURL).getHost());
+        this.wparser = new WikiCode();
         this.count = 0;
         this.start = 0;
+        this.hostport = null;
+        this.urlStub = null;
     }

     public int count() {
@@ -138,14 +139,17 @@ public class MediawikiImporter extends Thread implements Importer {
         this.start = System.currentTimeMillis();
         try {
             String targetstub = sourcefile.getName();
-            targetstub = targetstub.substring(0, targetstub.length() - 8);
-            InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1 * 1024 * 1024);
+            int p = targetstub.lastIndexOf('.');
+            if (p > 0) targetstub = targetstub.substring(0, p);
+            InputStream is = new BufferedInputStream(new FileInputStream(sourcefile), 1024 * 1024);
             if (sourcefile.getName().endsWith(".bz2")) {
                 int b = is.read();
                 if (b != 'B') throw new IOException("Invalid bz2 content.");
                 b = is.read();
                 if (b != 'Z') throw new IOException("Invalid bz2 content.");
                 is = new CBZip2InputStream(is);
+            } else if (sourcefile.getName().endsWith(".gz")) {
+                is = new GZIPInputStream(is);
             }
             BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 4 * 1024 * 1024);
             String t;
@@ -167,15 +171,27 @@ public class MediawikiImporter extends Thread implements Importer {
             Future<Integer> writerResult = service.submit(writer);

             wikiparserrecord record;
-            int p;
+            int q;
             while ((t = r.readLine()) != null) {
+                if ((p = t.indexOf("<base>")) >= 0 && (q = t.indexOf("</base>", p)) > 0) {
+                    //urlStub = "http://" + lang + ".wikipedia.org/wiki/";
+                    urlStub = t.substring(p + 6, q);
+                    if (!urlStub.endsWith("/")) {
+                        q = urlStub.lastIndexOf('/');
+                        if (q > 0) urlStub = urlStub.substring(0, q + 1);
+                    }
+                    DigestURI uri = new DigestURI(urlStub);
+                    hostport = uri.getHost();
+                    if (uri.getPort() != 80) hostport += ":" + uri.getPort();
+                    continue;
+                }
                 if (t.indexOf(pagestart) >= 0) {
                     page = true;
                     continue;
                 }
                 if ((p = t.indexOf(textstart)) >= 0) {
                     text = page;
-                    int q = t.indexOf('>', p + textstart.length());
+                    q = t.indexOf('>', p + textstart.length());
                     if (q > 0) {
                         int u = t.indexOf(textend, q + 1);
                         if (u > q) {
@@ -185,7 +201,7 @@ public class MediawikiImporter extends Thread implements Importer {
                             Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
                             continue;
                         }
-                        record = newRecord(title, sb);
+                        record = newRecord(hostport, urlStub, title, sb);
                         try {
                             in.put(record);
                             this.count++;
@@ -207,7 +223,7 @@ public class MediawikiImporter extends Thread implements Importer {
                         Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
                         continue;
                     }
-                    record = newRecord(title, sb);
+                    record = newRecord(hostport, urlStub, title, sb);
                     try {
                         in.put(record);
                         this.count++;
@@ -223,7 +239,7 @@ public class MediawikiImporter extends Thread implements Importer {
                 }
                 if ((p = t.indexOf("<title>")) >= 0) {
                     title = t.substring(p + 7);
-                    int q = title.indexOf("</title>");
+                    q = title.indexOf("</title>");
                     if (q >= 0) title = title.substring(0, q);
                     continue;
                 }
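
Review note: this is how the importer now derives the URL stub and host from the dump's own `<base>` element instead of a user-supplied base URL. A standalone sketch of the same arithmetic (java.net.URL stands in for YaCy's DigestURI, input line hypothetical):

```java
String t = "<base>http://de.wikipedia.org/wiki/Wikipedia</base>";
int p = t.indexOf("<base>"), q = t.indexOf("</base>", p);
String urlStub = t.substring(p + 6, q);                  // "http://de.wikipedia.org/wiki/Wikipedia"
if (!urlStub.endsWith("/")) urlStub = urlStub.substring(0, urlStub.lastIndexOf('/') + 1);
java.net.URL uri = new java.net.URL(urlStub);            // urlStub is now "http://de.wikipedia.org/wiki/"
String hostport = uri.getHost();
// URL.getPort() returns -1 when the URL carries no explicit port
if (uri.getPort() != 80 && uri.getPort() != -1) hostport += ":" + uri.getPort();
```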
@@ -461,25 +477,26 @@ public class MediawikiImporter extends Thread implements Importer {
         }
     }

     public wikiparserrecord newRecord() {
-        return new wikiparserrecord(null, null);
+        return new wikiparserrecord(null, null, null, null);
     }
-    public wikiparserrecord newRecord(String title, StringBuilder sb) {
-        return new wikiparserrecord(title, sb);
+    public wikiparserrecord newRecord(String hostport, String urlStub, String title, StringBuilder sb) {
+        return new wikiparserrecord(hostport, urlStub, title, sb);
     }

     public class wikiparserrecord {
         public String title;
-        String source;
-        String html;
+        String source, html, hostport, urlStub;
         DigestURI url;
         Document document;
-        public wikiparserrecord(String title, StringBuilder sb) {
+        public wikiparserrecord(String hostport, String urlStub, String title, StringBuilder sb) {
             this.title = title;
+            this.hostport = hostport;
+            this.urlStub = urlStub;
             this.source = (sb == null) ? null : sb.toString();
         }
         public void genHTML() throws IOException {
             try {
-                html = wparser.transform(source);
+                html = wparser.transform(hostport, source);
             } catch (Exception e) {
                 Log.logException(e);
                 throw new IOException(e.getMessage());
@@ -734,13 +751,13 @@ public class MediawikiImporter extends Thread implements Importer {

         // example:
         // java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/

-        if (s[0].equals("-convert") && s.length > 2 && s[1].endsWith(".xml.bz2") && s[3].startsWith("http://")) {
+        if (s[0].equals("-convert") && s.length > 2) {
             File sourcefile = new File(s[1]);
             File targetdir = new File(s[2]);
-            String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
+            //String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
             //String language = urlStub.substring(7,9);
             try {
-                MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir, urlStub);
+                MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir);
                 mi.start();
                 mi.join();
             } catch (InterruptedException e) {
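
Review note: a programmatic equivalent of the updated -convert invocation, matching the main() above (paths hypothetical):

```java
MediawikiImporter mi = new MediawikiImporter(
        new File("DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2"),
        new File("DATA/SURROGATES/in/"));
mi.start();
mi.join(); // the base URL is read from the dump's <base> element, not passed on the command line
```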