added parsing rules for wikibooks and wikinews in zim reader

pull/621/head
Michael Peter Christen 1 year ago
parent 2ea54b3503
commit fdc6311dc7

@ -46,6 +46,7 @@ import org.openzim.ZIMReader.DirectoryEntry;
/** /**
* ZIM importer * ZIM importer
* can import ZIM files, e.g. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/ * can import ZIM files, e.g. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/
* A huge list is at https://wiki.kiwix.org/wiki/Content_in_all_languages
* These files contain identifiers named "URL" which are not actually full URLs but just paths inside well-known domains. * These files contain identifiers named "URL" which are not actually full URLs but just paths inside well-known domains.
* These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them. * These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them.
* For that we have a guessing function, but we must check if the guessing was correct by testing some of the given * For that we have a guessing function, but we must check if the guessing was correct by testing some of the given
@ -220,6 +221,10 @@ public class ZimImporter extends Thread implements Importer {
return "vikidia.org"; return "vikidia.org";
case "westeros": case "westeros":
return "westeros.org"; return "westeros.org";
case "wikibooks":
return parts[1] + ".wikibooks.org/wiki";
case "wikinews":
return parts[1] + ".wikinews.org/wiki";
case "wikipedia": case "wikipedia":
return parts[1] + ".wikipedia.org/wiki"; return parts[1] + ".wikipedia.org/wiki";
case "www.ready.gov": case "www.ready.gov":
@ -264,6 +269,9 @@ public class ZimImporter extends Thread implements Importer {
public static String guessURL(String guessedSource, DirectoryEntry de) { public static String guessURL(String guessedSource, DirectoryEntry de) {
String url = de.url; String url = de.url;
if (url.equals("Main_Page")) url = ""; if (url.equals("Main_Page")) url = "";
if (guessedSource != null) return guessedSource + url;
if (url.startsWith("A/")) return "https://" + url.substring(2);
if (url.startsWith("H/")) return "https://" + url.substring(2);
return guessedSource + url; return guessedSource + url;
} }

@ -46,7 +46,7 @@ import com.github.luben.zstd.ZstdInputStream;
*/ */
public class ZIMReader { public class ZIMReader {
private final static int MAX_CLUSTER_CACHE_SIZE = 10; private final static int MAX_CLUSTER_CACHE_SIZE = 100;
public final static String[] METADATA_KEYS = new String[] { public final static String[] METADATA_KEYS = new String[] {
"Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription", "Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
"Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper" "Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
@ -371,6 +371,8 @@ public class ZIMReader {
* This can of course only be done, if: * This can of course only be done, if:
* - we want to iterate through all documents of a ZIM file * - we want to iterate through all documents of a ZIM file
* - we have reverse indexed all directory entries to be able to assign metadata to cluster documents * - we have reverse indexed all directory entries to be able to assign metadata to cluster documents
*
* Reference implementation: https://github.com/openzim/libzim/blob/main/src/cluster.cpp
*/ */
private class Cluster { private class Cluster {

Loading…
Cancel
Save