From c4082c4ff217c461b913b4bcd424cd14a7c4a831 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 28 Oct 2023 08:56:58 +0200 Subject: [PATCH] refactoring of ZIM reader, simplification, removed unnecessary code --- .../RandomAcessFileZIMInputStream.java | 81 ++++++------ source/org/openzim/Utilities.java | 76 ----------- source/org/openzim/ZIMFile.java | 36 +++--- source/org/openzim/ZIMReader.java | 122 ++++++------------ source/org/openzim/ZIMTest.java | 5 +- 5 files changed, 98 insertions(+), 222 deletions(-) delete mode 100644 source/org/openzim/Utilities.java diff --git a/source/org/openzim/RandomAcessFileZIMInputStream.java b/source/org/openzim/RandomAcessFileZIMInputStream.java index cb6cdb093..7f34b7411 100644 --- a/source/org/openzim/RandomAcessFileZIMInputStream.java +++ b/source/org/openzim/RandomAcessFileZIMInputStream.java @@ -29,78 +29,69 @@ import java.io.RandomAccessFile; * * @author Arunesh Mathur * @author Michael Christen - * bugfix to long parsing (return value was int) + * bugfix to long parsing (return value was int), + * moved conditions for exceptions to asserts, + * refactoring and merge with Utilities */ - public class RandomAcessFileZIMInputStream extends InputStream { private final RandomAccessFile mRAFReader; - private long mMarked = -1; + private final byte[] buffer2 = new byte[2]; + private final byte[] buffer4 = new byte[4]; + private final byte[] buffer8 = new byte[8]; public RandomAcessFileZIMInputStream(final RandomAccessFile reader) { this.mRAFReader = reader; } - // TODO: Remove the parameter buffer - public int readTwoLittleEndianBytesInt(final byte[] buffer) throws IOException { - if (buffer.length < 2) { - throw new OutOfMemoryError("buffer too small"); - } else { - this.mRAFReader.read(buffer, 0, 2); - return Utilities.toTwoLittleEndianInteger(buffer); - } + public int readTwoLittleEndianBytesInt() throws IOException { + this.mRAFReader.read(buffer2, 0, 2); + return toTwoLittleEndianInteger(buffer2); } - // TODO: Remove the parameter buffer - public int readFourLittleEndianBytesInt(final byte[] buffer) throws IOException { - if (buffer.length < 4) { - throw new OutOfMemoryError("buffer too small"); - } else { - this.mRAFReader.read(buffer, 0, 4); - return Utilities.toFourLittleEndianInteger(buffer); - } + public int readFourLittleEndianBytesInt() throws IOException { + this.mRAFReader.read(buffer4, 0, 4); + return toFourLittleEndianInteger(buffer4); } - // TODO: Remove the parameter buffer - public long readEightLittleEndianBytesLong(final byte[] buffer) - throws IOException { - if (buffer.length < 8) { - throw new OutOfMemoryError("buffer too small"); - } else { - this.mRAFReader.read(buffer, 0, 8); - return Utilities.toEightLittleEndianLong(buffer); - } + public long readEightLittleEndianBytesLong() throws IOException { + this.mRAFReader.read(buffer8, 0, 8); + return toEightLittleEndianLong(buffer8); } - // TODO: Remove the parameter buffer - public long readSixteenLittleEndianBytesLong(final byte[] buffer) - throws IOException { - if (buffer.length < 16) { - throw new OutOfMemoryError("buffer too small"); - } else { - this.mRAFReader.read(buffer, 0, 16); - return Utilities.toSixteenLittleEndianLong(buffer); - } + private static int toTwoLittleEndianInteger(final byte[] buffer) { + return ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)); + } + + public static int toFourLittleEndianInteger(final byte[] buffer) { // TODO: make private + return + ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) + | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)); + } + + private static long toEightLittleEndianLong(final byte[] buffer) { + return // cast to long required otherwise this is again an integer + ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8) + | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24) + | ((long)(buffer[4] & 0xFF) << 32) | ((long)(buffer[5] & 0xFF) << 40) + | ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56)); + } + + public static void skipFully(final InputStream stream, final long bytes) throws IOException { + for (long i = stream.skip(bytes); i < bytes; i += stream.skip(bytes - i)); } // Reads characters from the current position into a String and stops when a // '\0' is encountered public String readZeroTerminatedString() throws IOException { - final StringBuffer sb = new StringBuffer(); - /* - * int i; byte[] buffer = new byte[100]; while (true) { - * mRAFReader.read(buffer); for (i = 0; i < buffer.length; i++) { if - * (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i - * != buffer.length) break; } return sb.toString(); - */ + final StringBuilder sb = new StringBuilder(); int b = this.mRAFReader.read(); while (b != '\0') { sb.append((char) b); b = this.mRAFReader.read(); } return sb.toString(); - } @Override diff --git a/source/org/openzim/Utilities.java b/source/org/openzim/Utilities.java deleted file mode 100644 index 28572839b..000000000 --- a/source/org/openzim/Utilities.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2011 Arunesh Mathur - * - * This file is a part of zimreader-java. - * - * zimreader-java is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License version 3.0 as - * published by the Free Software Foundation. - * - * zimreader-java is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with zimreader-java. If not, see . - */ - - -package org.openzim; - -import java.io.IOException; -import java.io.InputStream; - -/** - * @author Arunesh Mathur - * A ZIM file implementation that stores the Header and the MIMETypeList - * - * @author Michael Christen - * int/long bugfix (did reading of long values with int variables, causing negative offsets) - */ -public class Utilities { - - public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException { - if (buffer.length < 2) { - throw new OutOfMemoryError("buffer too small"); - } else { - final int result = - ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)); - return result; - } - } - - public static int toFourLittleEndianInteger(final byte[] buffer) throws IOException { - if (buffer.length < 4) { - throw new OutOfMemoryError("buffer too small"); - } else { - final int result = - ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) - | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)); - return result; - } - } - - public static long toEightLittleEndianLong(final byte[] buffer) throws IOException { - if (buffer.length < 8) { - throw new OutOfMemoryError("buffer too small"); - } else { - final long result = // cast to long required otherwise this is again an integer - ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8) - | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24) - | ((long)(buffer[4] & 0xFF) << 32) | ((long)(buffer[5] & 0xFF) << 40) - | ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56)); - return result; - } - } - - public static long toSixteenLittleEndianLong(final byte[] buffer) throws IOException { - return toEightLittleEndianLong(buffer); // there are no sixten bytes long values - } - - public static void skipFully(final InputStream stream, final long bytes) throws IOException { - for (long i = stream.skip(bytes); i < bytes; i += stream.skip(bytes - i)); - } - -} diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index 56e84ad17..e712326dc 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -44,7 +44,6 @@ public class ZIMFile extends File { public final int header_magicNumber; public final int header_majorVersion; public final int header_minorVersion; - public final long header_uuid; public final int header_entryCount; public final int header_clusterCount; public final long header_urlPtrPos; @@ -69,34 +68,33 @@ public class ZIMFile extends File { // The reader that will be used to read contents from the file final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this, "r")); - final byte[] buffer = new byte[16]; // Read the contents of the header - this.header_magicNumber = reader.readFourLittleEndianBytesInt(buffer); // 4 - this.header_majorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2 - this.header_minorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 4 - this.header_uuid = reader.readSixteenLittleEndianBytesLong(buffer); // 16 - this.header_entryCount = reader.readFourLittleEndianBytesInt(buffer); // 4 - this.header_clusterCount = reader.readFourLittleEndianBytesInt(buffer); // 4 - this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 - this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 - this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 - this.header_mimeListPos = reader.readEightLittleEndianBytesLong(buffer); // 8 - this.header_mainPage = reader.readFourLittleEndianBytesInt(buffer); // 4 - this.header_layoutPage = reader.readFourLittleEndianBytesInt(buffer); // 4 - this.header_checksumPos = reader.readEightLittleEndianBytesLong(buffer); // 8 [FIX!] + this.header_magicNumber = reader.readFourLittleEndianBytesInt(); // 4 + this.header_majorVersion = reader.readTwoLittleEndianBytesInt(); // 2 + this.header_minorVersion = reader.readTwoLittleEndianBytesInt(); // 4 + RandomAcessFileZIMInputStream.skipFully(reader, 16); // skip the uuid, this is not used + this.header_entryCount = reader.readFourLittleEndianBytesInt(); // 4 + this.header_clusterCount = reader.readFourLittleEndianBytesInt(); // 4 + this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(); // 8 + this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(); // 8 + this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(); // 8 + this.header_mimeListPos = reader.readEightLittleEndianBytesLong(); // 8 + this.header_mainPage = reader.readFourLittleEndianBytesInt(); // 4 + this.header_layoutPage = reader.readFourLittleEndianBytesInt(); // 4 + this.header_checksumPos = reader.readEightLittleEndianBytesLong(); // 8 [FIX!] // Initialise the MIMETypeList int len = 0; StringBuffer mimeBuffer = null; this.mimeList = new ArrayList<>(); while (true) { - reader.read(buffer, 0, 1); // read only one byte to check if this is a zero + int b = reader.read(); // read only one byte to check if this is a zero len = 0; mimeBuffer = new StringBuffer(); - while (buffer[0] != '\0') { - mimeBuffer.append((char) buffer[0]); - reader.read(buffer, 0, 1); + while (b != '\0') { + mimeBuffer.append((char) b); + b = reader.read(); len++; } if (len == 0) { diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index 49d25c50c..baaf784bf 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -115,8 +115,6 @@ public class ZIMReader { int i = 0, mimeType; - final byte[] buffer = new byte[8]; - // The list that will eventually return the list of URL's final ArrayList returnList = new ArrayList<>(); @@ -126,7 +124,7 @@ public class ZIMReader { for (i = 0; i < this.mFile.header_entryCount; i++) { // The position of URL i - long pos = this.mReader.readEightLittleEndianBytesLong(buffer); + long pos = this.mReader.readEightLittleEndianBytesLong(); // Mark the current position that we need to return to this.mReader.mark(); @@ -135,7 +133,7 @@ public class ZIMReader { this.mReader.seek(pos); // Article or Redirect entry? - mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer); + mimeType = this.mReader.readTwoLittleEndianBytesInt(); if (mimeType == 65535) { this.mReader.seek(pos + 12); @@ -156,8 +154,6 @@ public class ZIMReader { int i = 0, mimeType, articleNumber; - final byte[] buffer = new byte[8]; - // The list that will eventually return the list of URL's final ArrayList returnList = new ArrayList<>(); @@ -170,7 +166,7 @@ public class ZIMReader { for (i = 0; i < this.mFile.header_entryCount; i++) { // The articleNumber of the position of URL i - articleNumber = this.mReader.readFourLittleEndianBytesInt(buffer); + articleNumber = this.mReader.readFourLittleEndianBytesInt(); // Mark the current position that we need to return to this.mReader.mark(); @@ -178,11 +174,11 @@ public class ZIMReader { this.mReader.seek(urlPtrPos + (8L * (articleNumber))); // The position of URL i - long pos = this.mReader.readEightLittleEndianBytesLong(buffer); + long pos = this.mReader.readEightLittleEndianBytesLong(); this.mReader.seek(pos); // Article or Redirect entry? - mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer); + mimeType = this.mReader.readTwoLittleEndianBytesInt(); if (mimeType == 65535) { this.mReader.seek(pos + 12); @@ -204,43 +200,40 @@ public class ZIMReader { // position must be the seek position for the title in the Title Pointer List private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException { - // Helpers - final byte[] buffer = new byte[8]; - // At the appropriate position in the titlePtrPos this.mReader.seek(position); // Get value of article at index - int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt(buffer); + int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt(); // Move to the position in urlPtrPos this.mReader.seek(this.mFile.header_urlPtrPos + 8 * pointer_to_the_URL_pointer); // Get value of article in urlPtrPos - long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong(buffer); + long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong(); // Go to the location of the directory entry this.mReader.seek(pointer_to_the_directory_entry); // read the Content Entry - final int type = this.mReader.readTwoLittleEndianBytesInt(buffer); // 2, 0xffff for redirect - this.mReader.read(); // 1, ignore, parameter length not used - final char namespace = (char) this.mReader.read(); // 1 - this.mReader.readFourLittleEndianBytesInt(buffer); // 4, ignore, revision not used + final int type = this.mReader.readTwoLittleEndianBytesInt(); // 2, 0xffff for redirect + this.mReader.read(); // 1, ignore, parameter length not used + final char namespace = (char) this.mReader.read(); // 1 + this.mReader.readFourLittleEndianBytesInt(); // 4, ignore, revision not used // Article or Redirect entry if (type == 65535) { - final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(buffer); + final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(); final String url = this.mReader.readZeroTerminatedString(); String title = this.mReader.readZeroTerminatedString(); title = title.equals("") ? url : title; return new RedirectEntry(type, namespace, redirectIndex, url, title, (position - this.mFile.header_urlPtrPos) / 8); } else { - final int cluster_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4 - final int blob_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4 - final String url = this.mReader.readZeroTerminatedString(); // zero terminated - String title = this.mReader.readZeroTerminatedString(); // zero terminated + final int cluster_number = this.mReader.readFourLittleEndianBytesInt(); // 4 + final int blob_number = this.mReader.readFourLittleEndianBytesInt(); // 4 + final String url = this.mReader.readZeroTerminatedString(); // zero terminated + String title = this.mReader.readZeroTerminatedString(); // zero terminated title = title.equals("") ? url : title; return new ArticleEntry( @@ -288,7 +281,7 @@ public class ZIMReader { return null; } - public ByteArrayOutputStream getArticleData(final DirectoryEntry directoryInfo) throws IOException { + public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException { // fail fast if (directoryInfo == null) return null; @@ -301,8 +294,7 @@ public class ZIMReader { this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8); // Read the location of the cluster - byte[] buffer = new byte[8]; - final long clusterPos = this.mReader.readEightLittleEndianBytesLong(buffer); + final long clusterPos = this.mReader.readEightLittleEndianBytesLong(); // Move to the cluster this.mReader.seek(clusterPos); @@ -311,32 +303,13 @@ public class ZIMReader { final int compressionType = this.mReader.read(); // Reference declaration - SingleXZInputStream xzReader = null; - int firstOffset, numberOfBlobs, offset1, - offset2, - location, - differenceOffset; - - ByteArrayOutputStream baos; + int firstOffset, numberOfBlobs, offset1, offset2, location, differenceOffset; // Check the compression type that was read - switch (compressionType) { - - // TODO: Read uncompressed data directly - case 0: - case 1: - - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - - // Read the first offset - this.mReader.read(buffer); + if (compressionType == 1) { // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities.toFourLittleEndianInteger(buffer); + firstOffset = this.mReader.readFourLittleEndianBytesInt(); // The number of blobs numberOfBlobs = firstOffset / 4; @@ -348,38 +321,31 @@ public class ZIMReader { offset1 = firstOffset; } else { location = (article.blob_number - 1) * 4; - Utilities.skipFully(this.mReader, location); - this.mReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); + RandomAcessFileZIMInputStream.skipFully(this.mReader, location); + offset1 = this.mReader.readFourLittleEndianBytesInt(); } - this.mReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); + offset2 = this.mReader.readFourLittleEndianBytesInt(); differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - Utilities.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2))); - this.mReader.read(buffer, 0, differenceOffset); - baos = new ByteArrayOutputStream(); - baos.write(buffer, 0, differenceOffset); - - return baos; + byte[] entry = new byte[differenceOffset]; + RandomAcessFileZIMInputStream.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2))); + this.mReader.read(entry, 0, differenceOffset); + return entry; + } // 2 for zlib and 3 for bzip2 (removed) // LZMA2 compressed data - case 4: + if (compressionType == 4) { // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; + byte[] buffer = new byte[4]; // Create a dictionary with size 40MiB, the zimlib uses this size while creating - xzReader = new SingleXZInputStream(this.mReader, 4194304); - - // Read the first offset - xzReader.read(buffer); + SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 4194304); // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities.toFourLittleEndianInteger(buffer); + firstOffset = this.mReader.readFourLittleEndianBytesInt(); // The number of blobs numberOfBlobs = firstOffset / 4; @@ -391,25 +357,23 @@ public class ZIMReader { offset1 = firstOffset; } else { location = (article.blob_number - 1) * 4; - Utilities.skipFully(xzReader, location); + RandomAcessFileZIMInputStream.skipFully(xzReader, location); xzReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); + offset1 = RandomAcessFileZIMInputStream.toFourLittleEndianInteger(buffer); } xzReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); + offset2 = RandomAcessFileZIMInputStream.toFourLittleEndianInteger(buffer); differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - Utilities.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2))); - xzReader.read(buffer, 0, differenceOffset); - baos = new ByteArrayOutputStream(); - baos.write(buffer, 0, differenceOffset); - return baos; + byte[] entry = new byte[differenceOffset]; + RandomAcessFileZIMInputStream.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2))); + xzReader.read(entry, 0, differenceOffset); - // case 5: zstd compressed (missing!) - default: - return null; + return entry; } + + // case 5: zstd compressed (missing!) + return null; } } diff --git a/source/org/openzim/ZIMTest.java b/source/org/openzim/ZIMTest.java index 24b9cf9be..ea77c3b90 100644 --- a/source/org/openzim/ZIMTest.java +++ b/source/org/openzim/ZIMTest.java @@ -18,7 +18,6 @@ package org.openzim; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.List; @@ -55,8 +54,8 @@ public class ZIMTest { // print article c-1 DirectoryEntry directory_entry = zReader.getDirectoryInfo(c - 1); - ByteArrayOutputStream articleStream = zReader.getArticleData(directory_entry); - String article = articleStream == null ? "NULL" : articleStream.toString(StandardCharsets.UTF_8.name()); + byte[] articleBytes = zReader.getArticleData(directory_entry); + String article = articleBytes == null ? "NULL" : new String(articleBytes, StandardCharsets.UTF_8); System.out.println(article); } catch (final IOException e) { e.printStackTrace();