added parsing rules for wikibooks and wikinews in zim reader

pull/621/head
Michael Peter Christen 1 year ago
parent 2ea54b3503
commit fdc6311dc7

@ -46,6 +46,7 @@ import org.openzim.ZIMReader.DirectoryEntry;
/**
* ZIM importer
* can import ZIM files, e.g. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/
* A huge list is at https://wiki.kiwix.org/wiki/Content_in_all_languages
* These files contain identifiers named "URL" which are not actually full URLs but just paths inside well-known domains.
* These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them.
* For that we have a guessing function, but we must check if the guessing was correct by testing some of the given
@ -220,6 +221,10 @@ public class ZimImporter extends Thread implements Importer {
return "vikidia.org";
case "westeros":
return "westeros.org";
case "wikibooks":
return parts[1] + ".wikibooks.org/wiki";
case "wikinews":
return parts[1] + ".wikinews.org/wiki";
case "wikipedia":
return parts[1] + ".wikipedia.org/wiki";
case "www.ready.gov":
@ -264,6 +269,9 @@ public class ZimImporter extends Thread implements Importer {
/**
 * Builds a URL string for a ZIM directory entry.
 *
 * The entry's "url" field is used as the path part; the special value
 * "Main_Page" is mapped to the empty path. If a guessed source prefix is
 * given, the result is that prefix followed by the path. Without a prefix,
 * a path starting with "A/" or "H/" has those two characters removed and
 * "https://" put in front of the remainder.
 *
 * @param guessedSource prefix to put in front of the entry path, may be null
 * @param de the directory entry whose url field is read
 * @return the concatenated URL string
 */
public static String guessURL(String guessedSource, DirectoryEntry de) {
    // "Main_Page" stands for the root of the site, i.e. an empty path
    final String path = de.url.equals("Main_Page") ? "" : de.url;

    if (guessedSource == null) {
        // no known source: paths carrying a two-character "A/" or "H/" prefix
        // are turned into https URLs built from the remainder of the path
        final boolean prefixed = path.startsWith("A/") || path.startsWith("H/");
        if (prefixed) {
            return "https://" + path.substring(2);
        }
    }

    // NOTE(review): if guessedSource is null here, string concatenation
    // produces a result starting with the literal text "null" - confirm
    // whether callers rely on this or whether a different fallback is wanted.
    return guessedSource + path;
}

@ -46,7 +46,7 @@ import com.github.luben.zstd.ZstdInputStream;
*/
public class ZIMReader {
private final static int MAX_CLUSTER_CACHE_SIZE = 10;
private final static int MAX_CLUSTER_CACHE_SIZE = 100;
public final static String[] METADATA_KEYS = new String[] {
"Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
"Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
@ -371,6 +371,8 @@ public class ZIMReader {
* This can of course only be done, if:
* - we want to iterate through all documents of a ZIM file
* - we have reverse indexed all directory entries to be able to assign metadata to cluster documents
*
* Reference implementation: https://github.com/openzim/libzim/blob/main/src/cluster.cpp
*/
private class Cluster {

Loading…
Cancel
Save