From b9912ff50d1af3a20bb0c5c1d2b19e35442f7db1 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Sun, 29 Oct 2023 22:09:24 +0000
Subject: [PATCH 1/2] repaired dockerfiles for aarch64 and armv7

---
 docker/Dockerfile.aarch64 | 52 +++++++++------------------------
 docker/Dockerfile.armv7   | 60 +++++++++++----------------------------
 2 files changed, 30 insertions(+), 82 deletions(-)
diff --git a/docker/Dockerfile.aarch64 b/docker/Dockerfile.aarch64
index 0f536c9a1..2adafcd04 100644
--- a/docker/Dockerfile.aarch64
+++ b/docker/Dockerfile.aarch64
@@ -1,56 +1,30 @@
-# Build a docker image from latest YaCy sources
+# Docker image for YaCy
 
-# Base image : latest Debian stable official jdk 17 image from Docker
-FROM arm64v8/openjdk:17-buster
+# build with
+# docker build -t yacy/yacy_search_server:aarch64-latest -f Dockerfile.aarch64 ../
+
+# run with
+# docker run -d --name yacy -p 8090:8090 -p 8443:8443 -v yacy_data:/opt/yacy_search_server/DATA --log-opt max-size=200m --log-opt max-file=2 yacy/yacy_search_server:aarch64-latest
 
-# Install needed packages not in base image 
-# (curl for sh scripts in /bin, and wkhtmltopdf,imagemagick,xvfb and ghostscript to enable PDF and image snapshot generation)
-RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript && \
-  rm -rf /var/lib/apt/lists/*
+FROM arm64v8/openjdk:17-buster
+LABEL maintainer="Michael Peter Christen <mc@yacy.net>"
 
-# trace java version
+RUN apt-get update && \
+  apt-get install -yq ca-certificates ca-certificates-java curl wkhtmltopdf imagemagick xvfb ghostscript ant git
 RUN java -version
 
-# set current working dir
 WORKDIR /opt
-
-# All in one step to reduce image size growth :
-# - install ant package
-# - Compile with ant
-# - remove unnecessary and size consuming .git directory
-# - remove ant package
-
-# copy sources
 COPY . /opt/yacy_search_server/
-
 RUN rm -rf /opt/yacy_search_server/.git && \
-    apt-get update && \
-	apt-get install -yq ant && \
 	ant compile -f /opt/yacy_search_server/build.xml && \
 	apt-get purge -yq --auto-remove ant && \
 	apt-get clean && \
 	rm -rf /var/lib/apt/lists/*
 
-RUN \
-# Set initial admin password: "yacy" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex())
-# > java -classpath classes net.yacy.cora.order.Digest -strfhex "admin:The YaCy access is limited to administrators. If you don't know the password, you can change it using <yacy-home>/bin/passwd.sh <new-password>:docker"
-	sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a0987a4aba1ec46d63c" /opt/yacy_search_server/defaults/yacy.init && \
-	sed -i "/adminAccountForLocalhost=/c\adminAccountForLocalhost=false" /opt/yacy_search_server/defaults/yacy.init && \
-# Intially enable HTTPS: this is the most secure option for remote administrator authentication
-	sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init && \
-# Create user and group yacy: this user will be used to run YaCy main process
-	adduser --system --group --no-create-home --disabled-password yacy && \
-# Set ownership of yacy install directory to yacy user/group
-	chown yacy:yacy -R /opt/yacy_search_server
-
-# Expose HTTP and HTTPS default ports
 EXPOSE 8090 8443
-
-# Set data volume: yacy data and configuration will persist even after container stop or destruction
 VOLUME ["/opt/yacy_search_server/DATA"]
-
-# Next commands run as yacy as non-root user for improved security
+RUN sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init && \
+    adduser --system --group --no-create-home --disabled-password yacy && \
+    chown -R yacy:yacy /opt/yacy_search_server
 USER yacy
-
-# Start yacy as a foreground process (-f) to display console logs and to wait for yacy process
 CMD ["/bin/sh","/opt/yacy_search_server/startYACY.sh","-f"]
diff --git a/docker/Dockerfile.armv7 b/docker/Dockerfile.armv7
index 7a3134eac..b66212753 100644
--- a/docker/Dockerfile.armv7
+++ b/docker/Dockerfile.armv7
@@ -1,56 +1,30 @@
-# Build a docker image from latest YaCy sources
+# Docker image for YaCy
 
-# Base image : latest Debian stable official jdk 11 image from Docker
-FROM arm32v7/openjdk:11-jdk
+# build with
+# docker build -t yacy/yacy_search_server:armv7-latest -f Dockerfile.armv7 ../
 
-# Install needed packages not in base image 
-# (curl for sh scripts in /bin, and wkhtmltopdf,imagemagick,xvfb and ghostscript to enable PDF and image snapshot generation)
-RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript && \
-  rm -rf /var/lib/apt/lists/*
+# run with
+# docker run -d --name yacy -p 8090:8090 -p 8443:8443 -v yacy_data:/opt/yacy_search_server/DATA --log-opt max-size=200m --log-opt max-file=2 yacy/yacy_search_server:armv7-latest
 
-# trace java version
+# Base image pinned to a release: an untagged image means "latest", which changes the package set over time
+FROM arm32v7/debian:bookworm
+LABEL maintainer="Michael Peter Christen <mc@yacy.net>"
+RUN apt-get update && \
+  apt-get install -yq ca-certificates ca-certificates-java curl wkhtmltopdf imagemagick xvfb ghostscript default-jdk ant git
 RUN java -version
 
-# set current working dir
 WORKDIR /opt
-
-# All in one step to reduce image size growth :
-# - install ant package
-# - Compile with ant
-# - remove unnecessary and size consuming .git directory
-# - remove ant package
-
-# copy sources
 COPY . /opt/yacy_search_server/
-
 RUN rm -rf /opt/yacy_search_server/.git && \
-    apt-get update && \
-	apt-get install -yq ant && \
-	ant compile -f /opt/yacy_search_server/build.xml && \
-	apt-get purge -yq --auto-remove ant && \
-	apt-get clean && \
-	rm -rf /var/lib/apt/lists/*
+    ant compile -f /opt/yacy_search_server/build.xml && \
+    apt-get purge -yq --auto-remove ant && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
-RUN \
-# Set initial admin password: "yacy" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex())
-# > java -classpath classes net.yacy.cora.order.Digest -strfhex "admin:The YaCy access is limited to administrators. If you don't know the password, you can change it using <yacy-home>/bin/passwd.sh <new-password>:docker"
-	sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a0987a4aba1ec46d63c" /opt/yacy_search_server/defaults/yacy.init && \
-	sed -i "/adminAccountForLocalhost=/c\adminAccountForLocalhost=false" /opt/yacy_search_server/defaults/yacy.init && \
-# Intially enable HTTPS: this is the most secure option for remote administrator authentication
-	sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init && \
-# Create user and group yacy: this user will be used to run YaCy main process
-	adduser --system --group --no-create-home --disabled-password yacy && \
-# Set ownership of yacy install directory to yacy user/group
-	chown yacy:yacy -R /opt/yacy_search_server
-
-# Expose HTTP and HTTPS default ports
 EXPOSE 8090 8443
-
-# Set data volume: yacy data and configuration will persist even after container stop or destruction
 VOLUME ["/opt/yacy_search_server/DATA"]
-
-# Next commands run as yacy as non-root user for improved security
+RUN sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init && \
+    adduser --system --group --no-create-home --disabled-password yacy && \
+    chown -R yacy:yacy /opt/yacy_search_server
 USER yacy
-
-# Start yacy as a foreground process (-f) to display console logs and to wait for yacy process
 CMD ["/bin/sh","/opt/yacy_search_server/startYACY.sh","-f"]

From 1c0df28bfbf65603219b9b32a2924f97378877c1 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Wed, 1 Nov 2023 18:48:40 +0100
Subject: [PATCH 2/2] added a zim importer that can be used for surrogate
 imports. Can not be used yet because it requires some security additions to
 verify that the given urls actually work.

---
 .../yacy/document/importer/ZimImporter.java   | 306 ++++++++++++++++++
 source/org/openzim/ZIMFile.java               |   2 +-
 source/org/openzim/ZIMReader.java             |  38 ++-
 3 files changed, 339 insertions(+), 7 deletions(-)
 create mode 100644 source/net/yacy/document/importer/ZimImporter.java

diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java
new file mode 100644
index 000000000..2b1f5551c
--- /dev/null
+++ b/source/net/yacy/document/importer/ZimImporter.java
@@ -0,0 +1,306 @@
+/**
+ * ZimImporter.java
+ * (C) 2023 by Michael Peter Christen @orbiter
+ *
+ * This is a part of YaCy, a peer-to-peer based web search engine
+ *
+ * LICENSE
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.document.importer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+import java.util.TreeMap;
+
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.protocol.ResponseHeader;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.retrieval.Request;
+import net.yacy.crawler.retrieval.Response;
+import net.yacy.document.TextParser;
+import net.yacy.search.Switchboard;
+
+import org.openzim.ZIMFile;
+import org.openzim.ZIMReader;
+import org.openzim.ZIMReader.ArticleEntry;
+import org.openzim.ZIMReader.DirectoryEntry;
+
+/**
+ * ZIM importer
+ * can import ZIM files, e.g. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/
+ * These files contain identifiers named "URL" which are not actually full URLs but just paths inside well-known domains.
+ * These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them.
+ * For that we have a guessing function, but we must check if the guessing was correct by testing some of the given
+ * URLs against the actual internet-hosted document. Only if that check succeeds we should import the files.
+ * In all other cases the import should work as well but should also only be done in a non-p2p environment to prevent
+ * that such links are shared.
+ */
+public class ZimImporter extends Thread implements Importer {
+
+    static public ZimImporter job; // the importer whose run() is in progress; run() sets and clears it
+
+    private ZIMFile file;
+    private ZIMReader reader;
+    private String path;
+    private String guessedSource;
+
+    private int recordCnt;
+    private long startTime;
+    private final long sourceSize;
+    private long consumed;
+    private volatile boolean abort = false; // set by quit() from a foreign thread, polled by the loop in run()
+
+    public ZimImporter(String path) throws IOException {
+       super("ZimImporter - from file " + path);
+       this.path = path;
+       this.file = new ZIMFile(this.path); // this will read already some of the metadata and could consume some time
+       this.sourceSize = this.file.length();
+    }
+
+    /** Feeds all article entries that pass the TextParser mime check to the indexer; {@link #job} is set meanwhile. */
+    @Override
+    public void run() {
+        job = this;
+        this.startTime = System.currentTimeMillis();
+        try {
+            this.reader = new ZIMReader(this.file);
+            this.guessedSource = getSource(this.reader);
+
+            for (int i = 0; i < this.file.header_entryCount && !this.abort; i++) { // quit() ends the loop
+                DirectoryEntry de = this.reader.getDirectoryInfo(i);
+                if (!(de instanceof ZIMReader.ArticleEntry)) continue;
+                ArticleEntry ae = (ArticleEntry) de;
+
+                // check url
+                String guessedUrl = guessURL(this.guessedSource, de);
+                assert guessedUrl.startsWith("http");
+
+                // check availability of text parser
+                String mimeType = this.file.getMimeType(ae.mimetype);
+                if (TextParser.supportsMime(mimeType) != null) continue;
+                // read the content
+                byte[] b = this.reader.getArticleData(ae);
+
+                // create artificial request and response headers for the indexer
+                RequestHeader requestHeader = new RequestHeader();
+                ResponseHeader responseHeader = new ResponseHeader(200);
+                final Request request = new Request(new DigestURL(guessedUrl), null);
+                final Response response = new Response(
+                        request,
+                        requestHeader,
+                        responseHeader,
+                        Switchboard.getSwitchboard().crawler.defaultSurrogateProfile,
+                        false,
+                        b
+                );
+
+                // throw this to the indexer
+                String error = Switchboard.getSwitchboard().toIndexer(response);
+                if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
+                this.recordCnt++;
+            }
+        } catch (IOException e) {
+            ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage());
+        } finally {
+            ConcurrentLog.info("ZimImporter", "Indexed " + this.recordCnt + " documents");
+            job = null; // in finally: the job slot is freed even if an unchecked exception ends the import
+        }
+    }
+
+    public void quit() {
+        this.abort = true;
+    }
+
+    @Override
+    public String source() {
+        return this.path;
+    }
+
+    @Override
+    public int count() {
+        return this.recordCnt;
+    }
+
+    @Override
+    public int speed() {
+        if (this.recordCnt == 0) return 0;
+        return (int) (this.recordCnt / Math.max(1L, runningTime())); // records per second; min 1s avoids division by zero
+    }
+
+    @Override
+    public long runningTime() {
+        return (System.currentTimeMillis() - this.startTime) / 1000L;
+    }
+
+    @Override
+    public long remainingTime() {
+        if (this.consumed == 0) return 0; // nothing consumed so far: no estimate possible
+        // both divisors are clamped to 1 because the integer divisions can yield 0 (first second, slow progress)
+        final long elapsed = Math.max(1L, runningTime());
+        final long speed = Math.max(1L, this.consumed / elapsed);
+        return (this.sourceSize - this.consumed) / speed;
+    }
+
+    @Override
+    public String status() {
+        return "";
+    }
+
+    /**
+     * Guess the web host (possibly followed by a path prefix) a ZIM file was scraped from.
+     * Only the part of the file name before the first underscore is inspected.
+     *
+     * @param fileName name of the ZIM file
+     * @return the guessed host without scheme and without trailing slash, e.g. "en.wikipedia.org/wiki",
+     *         or null if fileName is null or empty or if no rule matches
+     */
+    public static String guessDomainName(String fileName) {
+        if (fileName == null || fileName.isEmpty()) return null;
+
+        // split() drops trailing empty strings: a name consisting only of underscores yields no parts
+        final String[] parts = fileName.split("_");
+        if (parts.length == 0) return null;
+        final String firstPart = parts[0];
+
+        // Well-known names whose host is not obvious. firstPart cannot contain an underscore,
+        // so names consisting of several words are matched by their first word only.
+        switch (firstPart) {
+            case "100r-off-the-grid":
+                return "100resilientcities.org";
+            case "armypubs":
+                return "armypubs.army.mil";
+            case "artofproblemsolving":
+                return "artofproblemsolving.com";
+            case "based":
+                return "based.cooking";
+            case "booksdash":
+                return "booksdash.com";
+            case "coopmaths":
+                return "coopmaths.fr";
+            case "fas-military-medicine":
+                return "fas.org";
+            case "fonts":
+                return "fonts.google.com";
+            case "gutenberg":
+                return "gutenberg.org";
+            case "ifixit":
+                return "ifixit.com";
+            case "lesfondamentaux":
+                return "reseau-canope.fr";
+            case "lowtechmagazine":
+                return "lowtechmagazine.com";
+            case "mutopiaproject":
+                return "mutopiaproject.org";
+            case "openstreetmap-wiki":
+                return "wiki.openstreetmap.org";
+            case "opentextbooks":
+                return "opentextbooks.org";
+            case "phet":
+                return "phet.colorado.edu";
+            case "practical": // first word of "practical_action"
+                return "practicalaction.org";
+            case "rapsberry": // first word of "rapsberry_pi_docs" (sic); the corrected spelling is accepted as well
+            case "raspberry":
+                return "raspberrypi.org";
+            case "ted":
+                return "ted.com";
+            case "vikidia":
+                return "vikidia.org";
+            case "westeros":
+                return "westeros.org";
+            case "wikipedia":
+                // the second name part is used as the host prefix; without it no host can be built
+                return parts.length > 1 ? parts[1] + ".wikipedia.org/wiki" : null;
+            case "www.ready.gov":
+                return "ready.gov";
+        }
+
+        // Names that already look like a host name are used as they are
+        if (firstPart.contains(".stackexchange.com")) {
+            return firstPart;
+        } else if (firstPart.endsWith(".com") || firstPart.endsWith(".org") || firstPart.endsWith(".de") ||
+                   firstPart.endsWith(".fr") || firstPart.endsWith(".pt") || firstPart.endsWith(".it") ||
+                   firstPart.endsWith(".ja") || firstPart.endsWith(".es") || firstPart.endsWith(".eo")) {
+            return firstPart;
+        } else if (firstPart.contains("-")) {
+            return firstPart.substring(0, firstPart.indexOf("-"));
+        }
+
+        // Any other name with an inner dot (neither first nor last character) is taken as a host name
+        final int lastDotIndex = firstPart.lastIndexOf('.');
+        if (lastDotIndex > 0 && lastDotIndex < firstPart.length() - 1) {
+            return firstPart;
+        }
+
+        // No rule matched
+        return null;
+    }
+
+    /** Returns the "Source" metadata or else "https://" + guessed domain + "/"; fails if no domain can be guessed. */
+    public static String getSource(ZIMReader r) throws IOException {
+        final String source = r.getMetadata("Source");
+        if (source != null) return source;
+        final String domain = guessDomainName(r.getZIMFile().getName());
+        if (domain == null) throw new IOException("cannot guess source domain of " + r.getZIMFile().getName());
+        return "https://" + domain + "/";
+    }
+
+    public static String guessURL(String guessedSource, DirectoryEntry de) {
+        return guessedSource + (de.url.equals("Main_Page") ? "" : de.url); // "Main_Page" maps to the source itself
+    }
+
+    public static void main(String[] args) {
+        // zim file import test: will test mostly if domain names are included in zim file urls
+        if (args.length == 0) {System.err.println("usage: ZimImporter <directory with zim files>"); return;}
+        File zimFiles = new File(args[0]);
+
+        // make ordered file list; order by file size (start with smallest)
+        String[] filelist = zimFiles.list(); // null if the path is not a directory or cannot be read
+        if (filelist == null) {System.err.println("cannot list directory " + zimFiles); return;}
+        Map<Long, File> orderedFileMap = new TreeMap<>();
+        for (int i = 0; i < filelist.length; i++) {
+            if (!filelist[i].endsWith(".zim")) continue;
+            File f = new File(zimFiles, filelist[i]);
+            orderedFileMap.put(f.length() * 1000 + i, f); // + i keeps the keys of equally sized files distinct
+        }
+
+        Collection<File> orderedFiles = orderedFileMap.values();
+        for (File f: orderedFiles) {
+            try {
+                ZIMFile z = new ZIMFile(f.getAbsolutePath());
+                ZIMReader r = new ZIMReader(z);
+                DirectoryEntry de = r.getMainDirectoryEntry();
+                System.out.println("ZIM file:  " + f.getAbsolutePath());
+                for (String key: ZIMReader.METADATA_KEYS) {String s = r.getMetadata(key); if (s != null) System.out.println("Metadata " + key + ": " + s);}
+                System.out.println("Namespace: " + de.namespace);
+                System.out.println("Title:     " + de.title);
+                System.out.println("URL:       " + de.url);
+                System.out.println("guessed domain: " + guessDomainName(f.getName()));
+                String source = getSource(r);
+                System.out.println("guessed Source: " + source);
+                System.out.println("guessed main article: " + guessURL(source, de));
+                System.out.println();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+    }
+}
diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java
index dd209b5e9..6381294df 100644
--- a/source/org/openzim/ZIMFile.java
+++ b/source/org/openzim/ZIMFile.java
@@ -106,7 +106,7 @@ public class ZIMFile extends File {
                 break;
             }
             String mimeType = mimeBuffer.toString();
-            System.out.println(mimeType);
+            //System.out.println(mimeType);
             mList.add(mimeType);
         }
         this.mimeTypeList = mList.toArray(new String[mList.size()]);
diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java
index 8d773d473..70dde16bc 100644
--- a/source/org/openzim/ZIMReader.java
+++ b/source/org/openzim/ZIMReader.java
@@ -20,6 +20,7 @@ package org.openzim;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 
 import org.tukaani.xz.SingleXZInputStream;
 import com.github.luben.zstd.ZstdInputStream;
@@ -39,6 +40,11 @@ import com.github.luben.zstd.ZstdInputStream;
  */
 public class ZIMReader {
 
+    public final static String[] METADATA_KEYS = new String[] {
+            "Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
+            "Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
+    };
+
     private final ZIMFile mFile;
 
     public static abstract class DirectoryEntry {
@@ -48,13 +54,13 @@ public class ZIMReader {
         public final int cluster_number;
         public final String url;
         public final String title;
-        public final long urlListindex;
+        public final int urlListindex;
 
         public DirectoryEntry(
                 final int mimeType, final char namespace,
                 final int cluster_number,
                 final String url, final String title,
-                final long index) {
+                final int index) {
             this.mimetype = mimeType;
             this.namespace = namespace;
             this.cluster_number = cluster_number;
@@ -74,7 +80,7 @@ public class ZIMReader {
                 final int mimeType, final char namespace,
                 final int cluster_number, final int blob_number,
                 final String url, final String title,
-                final long urlListindex) {
+                final int urlListindex) {
             super(mimeType, namespace, cluster_number, url, title, urlListindex);
             this.cluster_number = cluster_number;
             this.blob_number = blob_number;
@@ -84,11 +90,11 @@ public class ZIMReader {
 
     public static class RedirectEntry extends DirectoryEntry {
 
-        public final long redirect_index;
+        public final int redirect_index;
 
         public RedirectEntry(final int mimeType, final char namespace,
-                final long redirect_index, final String url, final String title,
-                final long urlListindex) {
+                final int redirect_index, final String url, final String title,
+                final int urlListindex) {
             super(mimeType, namespace, 0, url, title, urlListindex);
             this.redirect_index = redirect_index;
         }
@@ -103,6 +109,25 @@ public class ZIMReader {
         return this.mFile;
     }
 
+    /** Returns the value found via getDirectoryInfo('M', key) decoded as UTF-8, or null if it is missing or empty. */
+    public final String getMetadata(String key) throws IOException {
+        final DirectoryEntry entry = getDirectoryInfo('M', key);
+        if (entry == null) return null; // metadata not found; that would be normal
+        final byte[] value = getArticleData(entry);
+        if (value == null || value.length == 0) return null; // no data is not normal; empty data is not usable
+        return new String(value, StandardCharsets.UTF_8);
+    }
+
+    /** Returns the entry at header_mainPage; if that is the 'W' namespace "mainPage" redirect, its target is returned. */
+    public DirectoryEntry getMainDirectoryEntry() throws IOException {
+        final DirectoryEntry entry = getDirectoryInfo(this.mFile.header_mainPage);
+        final boolean mainPageRedirect =
+                entry.namespace == 'W' && entry.url.equals("mainPage") && entry instanceof RedirectEntry;
+        if (!mainPageRedirect) return entry;
+        // resolve redirect to get the actual main page
+        return getDirectoryInfo(((RedirectEntry) entry).redirect_index);
+    }
+
     public String getURLByURLOrder(final int entryNumber) throws IOException {
 
         // The position of URL i
@@ -283,6 +308,7 @@ public class ZIMReader {
         is.read(buffer);
         long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
         long blob_size = offset2 - offset1;
+        if (blob_size == 0) return new byte[0]; // skip the skipping to get to a zero-length object (they exist!)
         byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT
         // we must do two skip steps: first to the end of the offset list and second to the start of the blob
         // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset