From fdc6311dc7692d956a349ba0f98b6d33131b6efb Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Thu, 2 Nov 2023 00:27:24 +0100
Subject: [PATCH] added parsing rules for wikibooks and wikinews in zim reader

---
 source/net/yacy/document/importer/ZimImporter.java | 8 ++++++++
 source/org/openzim/ZIMReader.java                  | 4 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java
index a1917f0bc..a96a79b18 100644
--- a/source/net/yacy/document/importer/ZimImporter.java
+++ b/source/net/yacy/document/importer/ZimImporter.java
@@ -46,6 +46,7 @@ import org.openzim.ZIMReader.DirectoryEntry;
 /**
  * ZIM importer
  * can import ZIM file i.e. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/
+ * A huge list of available ZIM files is at https://wiki.kiwix.org/wiki/Content_in_all_languages
  * These files contains identifiers named "URL" which are not actually full URLs but just paths inside a well-known domains.
  * These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them.
  * For that we have a guessing function, but we must check if the guessing was correct by testing some of the given
@@ -220,6 +221,10 @@ public class ZimImporter extends Thread implements Importer {
                 return "vikidia.org";
             case "westeros":
                 return "westeros.org";
+            case "wikibooks":
+                return parts[1] + ".wikibooks.org/wiki";
+            case "wikinews":
+                return parts[1] + ".wikinews.org/wiki";
             case "wikipedia":
                 return parts[1] + ".wikipedia.org/wiki";
             case "www.ready.gov":
@@ -264,6 +269,9 @@ public class ZimImporter extends Thread implements Importer {
     public static String guessURL(String guessedSource, DirectoryEntry de) {
         String url = de.url;
         if (url.equals("Main_Page")) url = "";
+        if (guessedSource != null) return guessedSource + url;
+        if (url.startsWith("A/")) return "https://" + url.substring(2);
+        if (url.startsWith("H/")) return "https://" + url.substring(2);
         return guessedSource + url;
     }
 
diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java
index f94147341..bc39fd36b 100644
--- a/source/org/openzim/ZIMReader.java
+++ b/source/org/openzim/ZIMReader.java
@@ -46,7 +46,7 @@ import com.github.luben.zstd.ZstdInputStream;
  */
 public class ZIMReader {
 
-    private final static int MAX_CLUSTER_CACHE_SIZE = 10;
+    private final static int MAX_CLUSTER_CACHE_SIZE = 100;
     public final static String[] METADATA_KEYS = new String[] {
             "Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
             "Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
@@ -371,6 +371,8 @@ public class ZIMReader {
      * This can of course only be done, if:
      * - we want to iterate through all documents of a ZIM file
      * - we have reverse indexed all directory entries to be able to assign metadata to cluster documents
+     *
+     * Reference implementation: https://github.com/openzim/libzim/blob/main/src/cluster.cpp
      */
     private class Cluster {