From fdc6311dc7692d956a349ba0f98b6d33131b6efb Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Thu, 2 Nov 2023 00:27:24 +0100
Subject: [PATCH] added parsing rules for wikibooks and wikinews in zim reader

---
 source/net/yacy/document/importer/ZimImporter.java | 8 ++++++++
 source/org/openzim/ZIMReader.java                  | 4 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java
index a1917f0bc..a96a79b18 100644
--- a/source/net/yacy/document/importer/ZimImporter.java
+++ b/source/net/yacy/document/importer/ZimImporter.java
@@ -46,6 +46,7 @@ import org.openzim.ZIMReader.DirectoryEntry;
 /**
  * ZIM importer
  * can import ZIM file i.e. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/
+ * A huge list is at https://wiki.kiwix.org/wiki/Content_in_all_languages
  * These files contains identifiers named "URL" which are not actually full URLs but just paths inside a well-known domains.
  * These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them.
  * For that we have a guessing function, but we must check if the guessing was correct by testing some of the given
@@ -220,6 +221,10 @@ public class ZimImporter extends Thread implements Importer {
                 return "vikidia.org";
             case "westeros":
                 return "westeros.org";
+            case "wikibooks":
+                return parts[1] + ".wikibooks.org/wiki";
+            case "wikinews":
+                return parts[1] + ".wikinews.org/wiki";
             case "wikipedia":
                 return parts[1] + ".wikipedia.org/wiki";
             case "www.ready.gov":
@@ -264,6 +269,9 @@ public class ZimImporter extends Thread implements Importer {
     public static String guessURL(String guessedSource, DirectoryEntry de) {
         String url = de.url;
         if (url.equals("Main_Page")) url = "";
+        if (guessedSource != null) return guessedSource + url;
+        if (url.startsWith("A/")) return "https://" + url.substring(2);
+        if (url.startsWith("H/")) return "https://" + url.substring(2);
         return guessedSource + url;
     }
 
diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java
index f94147341..bc39fd36b 100644
--- a/source/org/openzim/ZIMReader.java
+++ b/source/org/openzim/ZIMReader.java
@@ -46,7 +46,7 @@ import com.github.luben.zstd.ZstdInputStream;
  */
 public class ZIMReader {
 
-    private final static int MAX_CLUSTER_CACHE_SIZE = 10;
+    private final static int MAX_CLUSTER_CACHE_SIZE = 100;
     public final static String[] METADATA_KEYS = new String[] {
             "Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
             "Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
@@ -371,6 +371,8 @@ public class ZIMReader {
      * This can of course only be done, if:
      * - we want to iterate through all documents of a ZIM file
      * - we have reverse indexed all directory entries to be able to assign metadata to cluster documents
+     *
+     * Reference implementation: https://github.com/openzim/libzim/blob/main/src/cluster.cpp
      */
     private class Cluster {
 