From 54fa5d3c2eebfa3310e31c3baf1c6265dbd71256 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Wed, 1 Nov 2023 19:52:44 +0100
Subject: [PATCH] added a cluster cache but it requires more testing

---
 source/org/openzim/ZIMFile.java   |   6 +-
 source/org/openzim/ZIMReader.java | 108 ++++++++++++++++++++++++++++--
 2 files changed, 104 insertions(+), 10 deletions(-)

diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java
index 45f1e1789..906bf30a9 100644
--- a/source/org/openzim/ZIMFile.java
+++ b/source/org/openzim/ZIMFile.java
@@ -47,9 +47,9 @@ public class ZIMFile extends File {
     public final int  header_minorVersion;
     public final int  header_entryCount;
     public final int  header_clusterCount;
-    public final long header_urlPtrPos;
-    public final long header_titlePtrPos;
-    public final long header_clusterPtrPos;
+    private final long header_urlPtrPos;
+    private final long header_titlePtrPos;
+    private final long header_clusterPtrPos;
     public final long header_mimeListPos;
     public final int  header_mainPage;
     public final int  header_layoutPage;

diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java
index 9a1f77c5d..363153a94 100644
--- a/source/org/openzim/ZIMReader.java
+++ b/source/org/openzim/ZIMReader.java
@@ -38,20 +38,24 @@ import com.github.luben.zstd.ZstdInputStream;
  * Proof-Reading, unclustering, refactoring,
  * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
  * change of Exception handling,
- * extension to more attributes as defined in spec (bugfix for mime type loading)
+ * extension to more attributes as defined in spec (bugfix for mime type loading),
  * bugfix to long parsing (prevented reading of large files),
- * added extended cluster size parsing
- * added ZStandard compression parsing (cluster type 5)
- * added cluster index
+ * added extended cluster size parsing,
+ * added ZStandard compression parsing (cluster type 5),
+ * added cluster index and cluster iteration for efficient blob extraction
  */
 public class ZIMReader {
 
+    private final static int MAX_CLUSTER_CACHE_SIZE = 10;
+
     public final static String[] METADATA_KEYS = new String[] {
             "Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
             "Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"};
 
     private final ZIMFile mFile;
+    private List<ArticleEntry> allArticlesCache = null;
+    private Map<Integer, Map<Integer, ArticleEntry>> indexedArticlesCache = null;
+    private final ArrayList<Cluster> clusterCache = new ArrayList<>();
 
     public class DirectoryEntry {
 
@@ -132,15 +136,18 @@ public class ZIMReader {
     }
 
     public List<ArticleEntry> getAllArticles() throws IOException {
+        if (this.allArticlesCache != null) return allArticlesCache;
         List<ArticleEntry> list = new ArrayList<>();
         for (int i = 0; i < this.mFile.header_entryCount; i++) {
             DirectoryEntry de = getDirectoryInfo(i);
             if (de instanceof ArticleEntry) list.add((ArticleEntry) de);
         }
+        this.allArticlesCache = list;
         return list;
     }
 
     public Map<Integer, Map<Integer, ArticleEntry>> getIndexedArticles(List<ArticleEntry> list) {
+        if (this.indexedArticlesCache != null) return indexedArticlesCache;
         Map<Integer, Map<Integer, ArticleEntry>> index = new HashMap<>();
         for (ArticleEntry entry: list) {
             Map<Integer, ArticleEntry> cluster = index.get(entry.cluster_number);
@@ -150,9 +157,23 @@ public class ZIMReader {
             }
             cluster.put(entry.blob_number, entry);
         }
+        this.indexedArticlesCache = index;
         return index;
     }
 
+    /**
+     * A cluster iterator is the most efficient way to read all documents.
+     * Iterating over the documents would decompress each cluster many
+     * times (once for every document the cluster contains), so it makes
+     * more sense to iterate over the clusters rather than over the
+     * documents. That requires an index of the document entries which
+     * tells us which documents are contained in each cluster. Reading all
+     * document entries first creates some waiting time at the beginning of
+     * the iteration, but this is no additional computing time, just the
+     * per-document fetch time concentrated once at the start. If the zim
+     * file is very large, this requires some extra RAM to cache the
+     * indexed document entries.
+     */
     public class ClusterIterator implements Iterator<ArticleBlobEntry> {
 
         private Map<Integer, Map<Integer, ArticleEntry>> index;
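Note (not part of the patch): a minimal sketch of how the cluster-wise
iteration described in the javadoc above might be driven. The ZIMFile and
ZIMReader constructor shapes, the no-argument ClusterIterator constructor,
the file name, and the ArticleBlobEntry field name are assumptions, since
this diff does not show them.

    import java.io.IOException;
    import org.openzim.ZIMFile;
    import org.openzim.ZIMReader;

    public class IterateAllBlobs {
        public static void main(String[] args) throws IOException {
            ZIMReader reader = new ZIMReader(new ZIMFile("wiki.zim")); // assumed constructors
            // each cluster is decompressed once and all of its blobs are
            // delivered before the next cluster is loaded
            ZIMReader.ClusterIterator it = reader.new ClusterIterator();
            long documents = 0, bytes = 0;
            while (it.hasNext()) {
                ZIMReader.ArticleBlobEntry e = it.next();
                documents++;
                bytes += e.blob.length; // "blob" field name is an assumption
            }
            System.out.println(documents + " documents, " + bytes + " bytes");
        }
    }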
@@ -191,7 +212,7 @@ public class ZIMReader {
             Map<Integer, ArticleEntry> clusterMap = this.index.get(this.clusterCounter);
             ArticleEntry ae = clusterMap.get(this.blobCounter);
             loadCluster(); // ensure cluster is loaded
-            ArticleBlobEntry abe = new ArticleBlobEntry(ae, this.cluster.blobs.get(this.blobCounter));
+            ArticleBlobEntry abe = new ArticleBlobEntry(ae, this.cluster.getBlob(this.blobCounter));
 
             // increase the counter(s)
             this.blobCounter++;
@@ -313,6 +334,35 @@ public class ZIMReader {
         return null;
     }
 
+    public Cluster getCluster(int clusterNumber) throws IOException {
+        for (int i = 0; i < this.clusterCache.size(); i++) {
+            Cluster c = clusterCache.get(i);
+            if (c.cluster_number == clusterNumber) {
+                c.incUsage(); // cache hit
+                return c;
+            }
+        }
+
+        // cache miss
+        Cluster c = new Cluster(clusterNumber);
+
+        // if the cache is full, evict the entry with the smallest usage ratio
+        if (clusterCache.size() >= MAX_CLUSTER_CACHE_SIZE) {
+            double minEntry = Double.MAX_VALUE;
+            int pos = -1;
+            for (int i = 0; i < clusterCache.size(); i++) {
+                double r = this.clusterCache.get(i).getUsageRatio();
+                if (r < minEntry) { minEntry = r; pos = i; }
+            }
+            if (pos >= 0) this.clusterCache.remove(pos);
+        }
+
+        c.incUsage();
+        this.clusterCache.add(c);
+        return c;
+    }
+
     /**
      * Cluster class is required to read a whole cluster with all documents inside at once.
      * This is a good thing because reading single documents from a cluster requires that the
@@ -324,10 +374,14 @@ public class ZIMReader {
      */
     private class Cluster {
 
+        private int cluster_number; // used to identify the correct cache entry
         private List<byte[]> blobs;
+        private int usageCounter;   // used for efficient caching and cache stale detection
         private boolean extended;
 
         public Cluster(int cluster_number) throws IOException {
+            this.cluster_number = cluster_number;
+            this.usageCounter = 0;
 
             // open the cluster and make an InputStream with the proper decompression type
             final long clusterPos = mFile.geClusterPtr(cluster_number);
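Note (not part of the patch): the eviction rule in getCluster() above removes
the cached cluster with the smallest usage ratio, i.e. hits divided by blob
count, so a cluster survives in proportion to how much of it has actually been
consumed. A self-contained sketch of that policy; all names and numbers are
illustrative, nothing here comes from the patch.

    import java.util.ArrayList;
    import java.util.List;

    class RatioEvictionDemo {
        record Entry(String name, int usage, int blobCount) {
            double ratio() { return (double) usage / blobCount; }
        }

        public static void main(String[] args) {
            List<Entry> cache = new ArrayList<>(List.of(
                    new Entry("A", 10, 100), // ratio 0.10: many absolute hits, little of the cluster used
                    new Entry("B",  2,   4)  // ratio 0.50: few hits, but half of the cluster used
            ));
            // find and remove the entry with the smallest usage ratio, as getCluster() does
            Entry victim = cache.get(0);
            for (Entry e : cache) if (e.ratio() < victim.ratio()) victim = e;
            cache.remove(victim);
            System.out.println("evicted " + victim.name()); // prints: evicted A
        }
    }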
@@ -357,6 +411,7 @@ public class ZIMReader {
             offsets.add(end_offset);
             int offset_count = (int) ((end_offset - 1) / (extended ? 8 : 4));
             for (int i = 0; i < offset_count - 1; i++) {
+                RandomAccessFileZIMInputStream.readFully(is, buffer); // a plain is.read(buffer) may return fewer bytes than requested
                 long l = extended ? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
                 offsets.add(l);
             }
@@ -365,14 +420,54 @@ public class ZIMReader {
 
             // the seek position should now be at the beginning of the first document
             this.blobs = new ArrayList<>();
             for (int i = 0; i < offsets.size() - 1; i++) { // loop until size - 1 because the last offset is the end of the last document
-                int length = (int) (offsets.get(i + 1) + offsets.get(i)); // yes the maximum document length is 2GB, for now
+                int length = (int) (offsets.get(i + 1) - offsets.get(i)); // yes the maximum document length is 2GB, for now
                 byte[] b = new byte[length];
                 RandomAccessFileZIMInputStream.readFully(is, b);
                 this.blobs.add(b);
             }
         }
+
+        public byte[] getBlob(int i) {
+            return this.blobs.get(i);
+        }
+
+        public void incUsage() {
+            this.usageCounter++;
+        }
+
+        public int getUsage() {
+            return this.usageCounter;
+        }
+
+        public int getSize() {
+            return this.blobs.size();
+        }
+
+        public double getUsageRatio() {
+            return ((double) this.usageCounter) / ((double) this.blobs.size());
+        }
     }
 
+    /*
+    public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {
+
+        // fail fast
+        if (directoryInfo == null) return null;
+        if (directoryInfo.getClass() != ArticleEntry.class) return null;
+
+        // This is an article, so we can cast to ArticleEntry
+        final ArticleEntry article = (ArticleEntry) directoryInfo;
+
+        // read the cluster (served from the cluster cache where possible)
+        Cluster c = getCluster(article.cluster_number);
+
+        // read the blob
+        byte[] blob = c.getBlob(article.blob_number);
+
+        return blob;
+    }
+    */
+
     public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {
 
         // fail fast
@@ -461,5 +556,4 @@ public class ZIMReader {
 
         return entry;
     }
-
 }
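Note (not part of the patch): since the subject line flags the cache as still
needing testing, a minimal smoke test might look like the sketch below. It
would have to live inside ZIMReader because Cluster is a private inner class;
it assumes a freshly opened reader with an empty cache, and the cluster
number 0 is a placeholder.

    // inside ZIMReader:
    public void selfTestClusterCache() throws IOException {
        // first call: cache miss, creates and caches the cluster (usage 1);
        // second call: cache hit on the same instance (usage 2)
        Cluster first = getCluster(0);
        Cluster again = getCluster(0);
        if (first != again) throw new IllegalStateException("expected a cluster cache hit");
        if (again.getUsage() != 2) throw new IllegalStateException("expected usage 2, got " + again.getUsage());
        System.out.println("cluster 0: " + again.getSize() + " blobs, usage " + again.getUsage());
    }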