From 9c8fb979850674c2d16c08155d670a56abf8aaeb Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 29 Oct 2023 00:43:12 +0200 Subject: [PATCH] introduced url list and title list caching and enhanced input stream performance in ZIM reader ---
Review notes (changes relative to the patch as first posted; text between the three-dash line and the first "diff --git" is ignored when the patch is applied):
 * RandomAccessFileZIMInputStream.readFully(): InputStream.read(byte[], int, int) returns -1 at end of stream. The posted loops added that value to the byte counter, so a truncated file made the counter run backwards instead of failing. Both overloads now throw java.io.EOFException, and the array-returning overload delegates to the fill overload. Each got a one-line Javadoc.
 * ZIMReader.getDirectoryInfo(int): restored the "entryNumber exceeds entryCount" check that the posted patch dropped, so an out-of-range entry number is still reported as an IOException rather than as an unchecked array index error from the cached pointer list.
 * ZIMReader.getDirectoryInfo(char, String): the binary search now starts with end = numberOfArticles - 1. With an inclusive "beg <= end" loop the previous upper bound allowed a probe at index entryCount.
 * ZIMFile: mList is declared as List<String>; on a raw List, toArray(new String[..]) is typed Object[] and cannot be assigned to the String[] field.
 * ZIMFile: the size comment on the header_minorVersion read now says 2 (it is a two-byte read).
 * Hunk line counts, new-side start offsets and the diffstat below were adjusted for the two added lines. The blob ids on the "index" lines still refer to the first posting.
 .../RandomAccessFileZIMInputStream.java | 38 ++++- source/org/openzim/ZIMFile.java | 64 ++++++--- source/org/openzim/ZIMReader.java | 137 +++++++----------- 3 files changed, 131 insertions(+), 108 deletions(-) diff --git a/source/org/openzim/RandomAccessFileZIMInputStream.java b/source/org/openzim/RandomAccessFileZIMInputStream.java index 3b4f4d1ed..84cb2ada6 100644 --- a/source/org/openzim/RandomAccessFileZIMInputStream.java +++ b/source/org/openzim/RandomAccessFileZIMInputStream.java @@ -65,12 +65,18 @@ public class RandomAccessFileZIMInputStream extends InputStream { return ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)); } - public static int toFourLittleEndianInteger(final byte[] buffer) { // TODO: make private + public static int toFourLittleEndianInteger(final byte[] buffer) { return ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)); } + public static int toFourLittleEndianInteger(final byte[] buffer, int pos) { + return + ((buffer[pos ] & 0xFF) | ((buffer[pos + 1] & 0xFF) << 8) + | ((buffer[pos + 2] & 0xFF) << 16) | ((buffer[pos + 3] & 0xFF) << 24)); + } + public static long toEightLittleEndianLong(final byte[] buffer) { return // cast to long required otherwise this is again an integer ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8) @@ -79,6 +85,14 @@ public class RandomAccessFileZIMInputStream extends InputStream { | ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56)); } + public static long toEightLittleEndianLong(final byte[] buffer, int pos) { + return // cast to long required otherwise this is again an integer + ((long)(buffer[pos ] & 0xFF) | ((long)(buffer[pos + 1] & 0xFF) << 8) + | ((long)(buffer[pos + 2] & 0xFF)
<< 16) | ((long)(buffer[pos + 3] & 0xFF) << 24) + | ((long)(buffer[pos + 4] & 0xFF) << 32) | ((long)(buffer[pos + 5] & 0xFF) << 40) + | ((long)(buffer[pos + 6] & 0xFF) << 48) | ((long)(buffer[pos + 7] & 0xFF) << 56)); + } + public static void skipFully(final InputStream stream, final long bytes) throws IOException { for (long i = stream.skip(bytes); i < bytes; i += stream.skip(bytes - i)); } @@ -99,6 +113,28 @@ public class RandomAccessFileZIMInputStream extends InputStream { return this.mRAFReader.read(); } + @Override + public int read(byte b[], int off, int len) throws IOException { + return this.mRAFReader.read(b, off, len); + } + + /** Reads exactly {@code len} bytes from {@code is} into a new array; throws java.io.EOFException if the stream ends earlier. */ + public static byte[] readFully(final InputStream is, final int len) throws IOException { + final byte[] b = new byte[len]; + readFully(is, b); + return b; + } + + /** Fills {@code b} completely from {@code is}; throws java.io.EOFException if the stream ends before {@code b.length} bytes were read. */ + public static void readFully(final InputStream is, final byte[] b) throws IOException { + int c = 0; + while (c < b.length) { + final int r = is.read(b, c, b.length - c); + if (r < 0) throw new java.io.EOFException("stream ended after " + c + " of " + b.length + " bytes"); + c += r; + } + } + public RandomAccessFile getRandomAccessFile() { return this.mRAFReader; } diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index 24d9ace65..dd209b5e9 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -54,8 +54,13 @@ public class ZIMFile extends File { public final int header_layoutPage; public final long header_checksumPos; + // content handle + public final RandomAccessFileZIMInputStream mReader; + // content cache - public final List mimeList; + private final String[] mimeTypeList; + private final byte[] urlPtrListBlob; + private final byte[] titlePtrListBlob; public ZIMFile(final String path) throws IOException { super(path); @@ -67,34 +72,34 @@ public class ZIMFile extends File { } // The reader that will be used to read contents from the file - final RandomAccessFileZIMInputStream reader = new RandomAccessFileZIMInputStream(new RandomAccessFile(this, "r")); + this.mReader = new
RandomAccessFileZIMInputStream(new RandomAccessFile(this, "r")); // Read the contents of the header - this.header_magicNumber = reader.readFourLittleEndianBytesInt(); // 4 - this.header_majorVersion = reader.readTwoLittleEndianBytesInt(); // 2 - this.header_minorVersion = reader.readTwoLittleEndianBytesInt(); // 4 - RandomAccessFileZIMInputStream.skipFully(reader, 16); // skip the uuid, this is not used - this.header_entryCount = reader.readFourLittleEndianBytesInt(); // 4 - this.header_clusterCount = reader.readFourLittleEndianBytesInt(); // 4 - this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(); // 8 - this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(); // 8 - this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(); // 8 - this.header_mimeListPos = reader.readEightLittleEndianBytesLong(); // 8 - this.header_mainPage = reader.readFourLittleEndianBytesInt(); // 4 - this.header_layoutPage = reader.readFourLittleEndianBytesInt(); // 4 - this.header_checksumPos = reader.readEightLittleEndianBytesLong(); // 8 [FIX!]
+ this.header_magicNumber = mReader.readFourLittleEndianBytesInt(); // 4 + this.header_majorVersion = mReader.readTwoLittleEndianBytesInt(); // 2 + this.header_minorVersion = mReader.readTwoLittleEndianBytesInt(); // 2 + RandomAccessFileZIMInputStream.skipFully(mReader, 16); // skip the uuid, this is not used + this.header_entryCount = mReader.readFourLittleEndianBytesInt(); // 4 + this.header_clusterCount = mReader.readFourLittleEndianBytesInt(); // 4 + this.header_urlPtrPos = mReader.readEightLittleEndianBytesLong(); // 8 + this.header_titlePtrPos = mReader.readEightLittleEndianBytesLong(); // 8 + this.header_clusterPtrPos = mReader.readEightLittleEndianBytesLong(); // 8 + this.header_mimeListPos = mReader.readEightLittleEndianBytesLong(); // 8 + this.header_mainPage = mReader.readFourLittleEndianBytesInt(); // 4 + this.header_layoutPage = mReader.readFourLittleEndianBytesInt(); // 4 + this.header_checksumPos = mReader.readEightLittleEndianBytesLong(); // 8 [FIX!] // Initialise the MIMETypeList int len = 0; StringBuffer mimeBuffer = null; - this.mimeList = new ArrayList<>(); + List<String> mList = new ArrayList<>(); while (true) { - int b = reader.read(); // read only one byte to check if this is a zero + int b = mReader.read(); // read only one byte to check if this is a zero len = 0; mimeBuffer = new StringBuffer(); while (b != '\0') { mimeBuffer.append((char) b); - b = reader.read(); + b = mReader.read(); len++; } if (len == 0) { @@ -102,9 +107,30 @@ public class ZIMFile extends File { } String mimeType = mimeBuffer.toString(); System.out.println(mimeType); - this.mimeList.add(mimeType); + mList.add(mimeType); } + this.mimeTypeList = mList.toArray(new String[mList.size()]); + + // Initialize the Url Pointer List + this.urlPtrListBlob = new byte[this.header_entryCount * 8]; + mReader.seek(this.header_urlPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); + + // Initialize the Title Pointer List + this.titlePtrListBlob = new
byte[this.header_entryCount * 4]; + mReader.seek(this.header_titlePtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); + } + public final String getMimeType(int idx) { + return this.mimeTypeList[idx]; } + public final long getURLPtr(final int idx) { + return RandomAccessFileZIMInputStream.toEightLittleEndianLong(this.urlPtrListBlob, idx * 8); + } + + public final int getTitlePtr(final int idx) { + return RandomAccessFileZIMInputStream.toFourLittleEndianInteger(this.titlePtrListBlob, idx * 4); + } } diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index cf11d6342..8d773d473 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -18,12 +18,8 @@ package org.openzim; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import java.io.RandomAccessFile; -import java.util.ArrayList; -import java.util.List; import org.tukaani.xz.SingleXZInputStream; import com.github.luben.zstd.ZstdInputStream; @@ -44,7 +40,6 @@ import com.github.luben.zstd.ZstdInputStream; public class ZIMReader { private final ZIMFile mFile; - private RandomAccessFileZIMInputStream mReader; public static abstract class DirectoryEntry { @@ -102,116 +97,84 @@ public class ZIMReader { public ZIMReader(final ZIMFile file) { this.mFile = file; - try { - this.mReader = new RandomAccessFileZIMInputStream(new RandomAccessFile(this.mFile, "r")); - } catch (final FileNotFoundException e) { - e.printStackTrace(); - } } public ZIMFile getZIMFile() { return this.mFile; } - public String getURLByURLOrder(int entryNumber) throws IOException { - - // Move to the spot where URL's are listed - this.mReader.seek(this.mFile.header_urlPtrPos + 8L * entryNumber); + public String getURLByURLOrder(final int entryNumber) throws IOException { // The position of URL i - long pos = this.mReader.readEightLittleEndianBytesLong(); + long pos = this.mFile.getURLPtr(entryNumber); // Move to the
position of URL i - this.mReader.seek(pos); + this.mFile.mReader.seek(pos); // Article or Redirect entry? - int mimeType = this.mReader.readTwoLittleEndianBytesInt(); + int mimeType = this.mFile.mReader.readTwoLittleEndianBytesInt(); if (mimeType == 65535) { - this.mReader.seek(pos + 12); - return this.mReader.readZeroTerminatedString(); + this.mFile.mReader.seek(pos + 12); + return this.mFile.mReader.readZeroTerminatedString(); } else { - this.mReader.seek(pos + 16); - return this.mReader.readZeroTerminatedString(); + this.mFile.mReader.seek(pos + 16); + return this.mFile.mReader.readZeroTerminatedString(); } } - public String getURLByTitleOrder(int entryNumber) throws IOException { - - // Move to the spot where URL's are listed - this.mReader.seek(this.mFile.header_titlePtrPos + 8L * entryNumber); + public String getURLByTitleOrder(final int entryNumber) throws IOException { // The articleNumber of the position of URL i - int articleNumber = this.mReader.readFourLittleEndianBytesInt(); - - this.mReader.seek(this.mFile.header_urlPtrPos + (8L * (articleNumber))); - - // The position of URL i - long pos = this.mReader.readEightLittleEndianBytesLong(); - this.mReader.seek(pos); + int articleNumber = this.mFile.getTitlePtr(entryNumber); + long pos = this.mFile.getURLPtr(articleNumber); + this.mFile.mReader.seek(pos); // Article or Redirect entry?
- int mimeType = this.mReader.readTwoLittleEndianBytesInt(); + int mimeType = this.mFile.mReader.readTwoLittleEndianBytesInt(); if (mimeType == 65535) { - this.mReader.seek(pos + 12); - return this.mReader.readZeroTerminatedString(); + this.mFile.mReader.seek(pos + 12); + return this.mFile.mReader.readZeroTerminatedString(); } else { - this.mReader.seek(pos + 16); - return this.mReader.readZeroTerminatedString(); + this.mFile.mReader.seek(pos + 16); + return this.mFile.mReader.readZeroTerminatedString(); } } - // position must be the seek position for the title in the Title Pointer List - private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException { - - // At the appropriate position in the titlePtrPos - this.mReader.seek(position); + public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException { + if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount"); // Get value of article at index - int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt(); - - // Move to the position in urlPtrPos - this.mReader.seek(this.mFile.header_urlPtrPos + 8 * pointer_to_the_URL_pointer); + int pointer_to_the_URL_pointer = this.mFile.getTitlePtr(entryNumber); // Get value of article in urlPtrPos - long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong(); + long pointer_to_the_directory_entry = this.mFile.getURLPtr(pointer_to_the_URL_pointer); // Go to the location of the directory entry - this.mReader.seek(pointer_to_the_directory_entry); + this.mFile.mReader.seek(pointer_to_the_directory_entry); // read the Content Entry - final int type = this.mReader.readTwoLittleEndianBytesInt(); // 2, 0xffff for redirect - this.mReader.read(); // 1, ignore, parameter length not used - final char namespace = (char) this.mReader.read(); // 1 - this.mReader.readFourLittleEndianBytesInt(); // 4, ignore, revision not used + final int type = this.mFile.mReader.readTwoLittleEndianBytesInt(); // 2, 0xffff for redirect + this.mFile.mReader.read(); // 1, ignore,
parameter length not used + final char namespace = (char) this.mFile.mReader.read(); // 1 + this.mFile.mReader.readFourLittleEndianBytesInt(); // 4, ignore, revision not used // Article or Redirect entry if (type == 65535) { - final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(); - final String url = this.mReader.readZeroTerminatedString(); - String title = this.mReader.readZeroTerminatedString(); + final int redirectIndex = this.mFile.mReader.readFourLittleEndianBytesInt(); + final String url = this.mFile.mReader.readZeroTerminatedString(); + String title = this.mFile.mReader.readZeroTerminatedString(); title = title.equals("") ? url : title; - return new RedirectEntry(type, namespace, redirectIndex, - url, title, (position - this.mFile.header_urlPtrPos) / 8); + return new RedirectEntry(type, namespace, redirectIndex, url, title, entryNumber); } else { - final int cluster_number = this.mReader.readFourLittleEndianBytesInt(); // 4 - final int blob_number = this.mReader.readFourLittleEndianBytesInt(); // 4 - final String url = this.mReader.readZeroTerminatedString(); // zero terminated - String title = this.mReader.readZeroTerminatedString(); // zero terminated + final int cluster_number = this.mFile.mReader.readFourLittleEndianBytesInt(); // 4 + final int blob_number = this.mFile.mReader.readFourLittleEndianBytesInt(); // 4 + final String url = this.mFile.mReader.readZeroTerminatedString(); // zero terminated + String title = this.mFile.mReader.readZeroTerminatedString(); // zero terminated title = title.equals("") ?
url : title; - - return new ArticleEntry( - type, namespace, - cluster_number, blob_number, - url, title, (position - this.mFile.header_urlPtrPos) / 8); + return new ArticleEntry(type, namespace, cluster_number, blob_number, url, title, entryNumber); } - - } - - public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException { - if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount"); - return getDirectoryInfoAtTitlePosition(this.mFile.header_titlePtrPos + 4 * entryNumber); } // Gives the minimum required information needed for the given articleName @@ -221,23 +184,21 @@ public class ZIMReader { DirectoryEntry entry; String cmpStr; final int numberOfArticles = this.mFile.header_entryCount; - long beg = this.mFile.header_titlePtrPos, end = beg + (numberOfArticles * 4), mid; + int beg = 0, end = numberOfArticles - 1, mid; articleName = namespace + "/" + articleName; while (beg <= end) { - mid = beg + 4 * (((end - beg) / 4) / 2); - entry = getDirectoryInfoAtTitlePosition(mid); + mid = beg + ((end - beg) / 2); + entry = getDirectoryInfo(mid); if (entry == null) { return null; } - cmpStr = entry.namespace + "/" + entry.url; + cmpStr = entry.namespace + "/" + entry.title; if (articleName.compareTo(cmpStr) < 0) { - end = mid - 4; - + end = mid - 1; } else if (articleName.compareTo(cmpStr) > 0) { - beg = mid + 4; - + beg = mid + 1; } else { return entry; } @@ -256,22 +217,22 @@ public class ZIMReader { final ArticleEntry article = (ArticleEntry) directoryInfo; // Move to the cluster entry in the clusterPtrPos - this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8L); + this.mFile.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8L); // Read the location of the cluster - final long clusterPos = this.mReader.readEightLittleEndianBytesLong(); + final long clusterPos = this.mFile.mReader.readEightLittleEndianBytesLong(); // Move to the cluster -
this.mReader.seek(clusterPos); + this.mFile.mReader.seek(clusterPos); // Read the first byte, for compression information - final int compressionType = this.mReader.read(); + final int compressionType = this.mFile.mReader.read(); // Check the compression type that was read // type = 1 uncompressed if (compressionType <= 1 || compressionType == 8 || compressionType == 9) { boolean extended = compressionType > 1; - return readClusterEntry(this.mReader, article.blob_number, extended); + return readClusterEntry(this.mFile.mReader, article.blob_number, extended); } // 2 for zlib and 3 for bzip2 (removed) @@ -279,14 +240,14 @@ public class ZIMReader { if (compressionType == 4 || compressionType == 12) { boolean extended = compressionType == 12; // Create a dictionary with size 40MiB, the zimlib uses this size while creating - SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 41943040); + SingleXZInputStream xzReader= new SingleXZInputStream(this.mFile.mReader, 41943040); return readClusterEntry(xzReader, article.blob_number, extended); } // Zstandard compressed data if (compressionType == 5 || compressionType == 13) { boolean extended = compressionType == 13; - ZstdInputStream zReader = new ZstdInputStream(this.mReader); + ZstdInputStream zReader = new ZstdInputStream(this.mFile.mReader); return readClusterEntry(zReader, article.blob_number, extended); } @@ -332,7 +293,7 @@ public class ZIMReader { // - the full skip length is 4 * (numberOfBlobs1 - (article.blob_number + 2)) + offset1 - 4 * numberOfBlobs1 // = offset1 - 4 * (article.blob_number + 2) RandomAccessFileZIMInputStream.skipFully(is, (offset1 - (extended ? 8 : 4) * (blob_number + 2))); - is.read(entry, 0, entry.length); + RandomAccessFileZIMInputStream.readFully(is, entry); return entry; }