Added decompression of Zstandard-compressed clusters (cluster compression type 5) in ZIM files

also: more generalization and performance enhancements
pull/607/head
Michael Peter Christen 1 year ago
parent ad8ee3a0b6
commit b0ae660790

@ -98,5 +98,6 @@
<dependency org="org.hamcrest" name="hamcrest" rev="2.2" conf="test->default"/> <dependency org="org.hamcrest" name="hamcrest" rev="2.2" conf="test->default"/>
<dependency org="org.hamcrest" name="hamcrest-core" rev="2.2" conf="test->default"/> <dependency org="org.hamcrest" name="hamcrest-core" rev="2.2" conf="test->default"/>
<dependency org="org.hamcrest" name="hamcrest-library" rev="2.2" conf="test->default"/> <dependency org="org.hamcrest" name="hamcrest-library" rev="2.2" conf="test->default"/>
<dependency org="com.github.luben" name="zstd-jni" rev="1.5.5-6"></dependency>
</dependencies> </dependencies>
</ivy-module> </ivy-module>

@ -18,6 +18,7 @@
package org.openzim; package org.openzim;
import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
@ -70,7 +71,7 @@ public class RandomAccessFileZIMInputStream extends InputStream {
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)); | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24));
} }
private static long toEightLittleEndianLong(final byte[] buffer) { public static long toEightLittleEndianLong(final byte[] buffer) {
return // cast to long required otherwise this is again an integer return // cast to long required otherwise this is again an integer
((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8) ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8)
| ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24) | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24)
@ -85,13 +86,12 @@ public class RandomAccessFileZIMInputStream extends InputStream {
// Reads characters from the current position into a String and stops when a // Reads characters from the current position into a String and stops when a
// '\0' is encountered // '\0' is encountered
public String readZeroTerminatedString() throws IOException { public String readZeroTerminatedString() throws IOException {
final StringBuilder sb = new StringBuilder(); ByteArrayOutputStream buffer = new ByteArrayOutputStream();
int b = this.mRAFReader.read(); int b;
while (b != '\0') { while ((b = this.mRAFReader.read()) != '\0' && b != -1) {
sb.append((char) b); buffer.write(b);
b = this.mRAFReader.read();
} }
return sb.toString(); return buffer.toString("UTF-8");
} }
@Override @Override

@ -18,14 +18,15 @@
package org.openzim; package org.openzim;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.tukaani.xz.SingleXZInputStream; import org.tukaani.xz.SingleXZInputStream;
import com.github.luben.zstd.ZstdInputStream;
/** /**
* @author Arunesh Mathur * @author Arunesh Mathur
@ -36,7 +37,9 @@ import org.tukaani.xz.SingleXZInputStream;
* naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format, * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
* change of Exception handling, * change of Exception handling,
* extension to more attributes as defined in spec (bugfix for mime type loading) * extension to more attributes as defined in spec (bugfix for mime type loading)
* bugfix to long parsing (prevented reading of large files) * bugfix to long parsing (prevented reading of large files),
* added extended cluster size parsing
* added ZStandard compression parsing (cluster type 5)
*/ */
public class ZIMReader { public class ZIMReader {
@ -110,91 +113,53 @@ public class ZIMReader {
return this.mFile; return this.mFile;
} }
// get a URL list that is sorted by the urls public String getURLByURLOrder(int entryNumber) throws IOException {
public List<String> getURLListByURL() throws IOException {
int i = 0, mimeType;
// The list that will eventually return the list of URL's
final ArrayList<String> returnList = new ArrayList<>();
// Move to the spot where URL's are listed // Move to the spot where URL's are listed
this.mReader.seek(this.mFile.header_urlPtrPos); this.mReader.seek(this.mFile.header_urlPtrPos + 8L * entryNumber);
for (i = 0; i < this.mFile.header_entryCount; i++) {
// The position of URL i
long pos = this.mReader.readEightLittleEndianBytesLong();
// Mark the current position that we need to return to
this.mReader.mark();
// Move to the position of URL i // The position of URL i
this.mReader.seek(pos); long pos = this.mReader.readEightLittleEndianBytesLong();
// Article or Redirect entry? // Move to the position of URL i
mimeType = this.mReader.readTwoLittleEndianBytesInt(); this.mReader.seek(pos);
if (mimeType == 65535) { // Article or Redirect entry?
this.mReader.seek(pos + 12); int mimeType = this.mReader.readTwoLittleEndianBytesInt();
returnList.add(this.mReader.readZeroTerminatedString());
} else {
this.mReader.seek(pos + 16);
returnList.add(this.mReader.readZeroTerminatedString());
}
this.mReader.reset(); if (mimeType == 65535) {
this.mReader.seek(pos + 12);
return this.mReader.readZeroTerminatedString();
} else {
this.mReader.seek(pos + 16);
return this.mReader.readZeroTerminatedString();
} }
return returnList;
} }
// get a URL list that is sorted by the entry titles public String getURLByTitleOrder(int entryNumber) throws IOException {
public List<String> getURLListByTitle() throws IOException {
int i = 0, mimeType, articleNumber;
// The list that will eventually return the list of URL's
final ArrayList<String> returnList = new ArrayList<>();
// Get the UrlPtrPos or one time storage
long urlPtrPos = this.mFile.header_urlPtrPos;
// Move to the spot where URL's are listed // Move to the spot where URL's are listed
this.mReader.seek(this.mFile.header_titlePtrPos); this.mReader.seek(this.mFile.header_titlePtrPos + 8L * entryNumber);
for (i = 0; i < this.mFile.header_entryCount; i++) {
// The articleNumber of the position of URL i // The articleNumber of the position of URL i
articleNumber = this.mReader.readFourLittleEndianBytesInt(); int articleNumber = this.mReader.readFourLittleEndianBytesInt();
// Mark the current position that we need to return to this.mReader.seek(this.mFile.header_urlPtrPos + (8L * (articleNumber)));
this.mReader.mark();
this.mReader.seek(urlPtrPos + (8L * (articleNumber))); // The position of URL i
long pos = this.mReader.readEightLittleEndianBytesLong();
this.mReader.seek(pos);
// The position of URL i // Article or Redirect entry?
long pos = this.mReader.readEightLittleEndianBytesLong(); int mimeType = this.mReader.readTwoLittleEndianBytesInt();
this.mReader.seek(pos);
// Article or Redirect entry? if (mimeType == 65535) {
mimeType = this.mReader.readTwoLittleEndianBytesInt(); this.mReader.seek(pos + 12);
return this.mReader.readZeroTerminatedString();
if (mimeType == 65535) { } else {
this.mReader.seek(pos + 12); this.mReader.seek(pos + 16);
final String url = this.mReader.readZeroTerminatedString(); return this.mReader.readZeroTerminatedString();
returnList.add(url);
} else {
this.mReader.seek(pos + 16);
final String url = this.mReader.readZeroTerminatedString();
returnList.add(url);
}
// Return to the marked position
this.mReader.reset();
} }
return returnList;
} }
// position must be the seek position for the title in the Title Pointer List // position must be the seek position for the title in the Title Pointer List
@ -291,7 +256,7 @@ public class ZIMReader {
final ArticleEntry article = (ArticleEntry) directoryInfo; final ArticleEntry article = (ArticleEntry) directoryInfo;
// Move to the cluster entry in the clusterPtrPos // Move to the cluster entry in the clusterPtrPos
this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8); this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8L);
// Read the location of the cluster // Read the location of the cluster
final long clusterPos = this.mReader.readEightLittleEndianBytesLong(); final long clusterPos = this.mReader.readEightLittleEndianBytesLong();
@ -302,78 +267,74 @@ public class ZIMReader {
// Read the first byte, for compression information // Read the first byte, for compression information
final int compressionType = this.mReader.read(); final int compressionType = this.mReader.read();
// Reference declaration
int firstOffset, numberOfBlobs, offset1, offset2, location, differenceOffset;
// Check the compression type that was read // Check the compression type that was read
if (compressionType == 1) { // type = 1 uncompressed
if (compressionType <= 1 || compressionType == 8 || compressionType == 9) {
// The first four bytes are the offset of the zeroth blob boolean extended = compressionType > 1;
firstOffset = this.mReader.readFourLittleEndianBytesInt(); return readClusterEntry(this.mReader, article.blob_number, extended);
// The number of blobs
numberOfBlobs = firstOffset / 4;
// The blobNumber has to be lesser than the numberOfBlobs
assert article.blob_number < numberOfBlobs;
if (article.blob_number == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (article.blob_number - 1) * 4;
RandomAccessFileZIMInputStream.skipFully(this.mReader, location);
offset1 = this.mReader.readFourLittleEndianBytesInt();
}
offset2 = this.mReader.readFourLittleEndianBytesInt();
differenceOffset = offset2 - offset1;
byte[] entry = new byte[differenceOffset];
RandomAccessFileZIMInputStream.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
this.mReader.read(entry, 0, differenceOffset);
return entry;
} }
// 2 for zlib and 3 for bzip2 (removed) // 2 for zlib and 3 for bzip2 (removed)
// LZMA2 compressed data // LZMA2 compressed data
if (compressionType == 4) { if (compressionType == 4 || compressionType == 12) {
boolean extended = compressionType == 12;
// Create a dictionary with size 40MiB, the zimlib uses this size while creating
SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 41943040);
return readClusterEntry(xzReader, article.blob_number, extended);
}
// Read the first 4 bytes to find out the number of articles // Zstandard compressed data
byte[] buffer = new byte[4]; if (compressionType == 5 || compressionType == 13) {
boolean extended = compressionType == 13;
ZstdInputStream zReader = new ZstdInputStream(this.mReader);
return readClusterEntry(zReader, article.blob_number, extended);
}
// Create a dictionary with size 40MiB, the zimlib uses this size while creating return null;
SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 4194304); }
// The first four bytes are the offset of the zeroth blob private static byte[] readClusterEntry(InputStream is, int blob_number, boolean extended) throws IOException {
firstOffset = this.mReader.readFourLittleEndianBytesInt();
// The number of blobs // Read the first 4(8) bytes to find out the number of articles
numberOfBlobs = firstOffset / 4; byte[] buffer = new byte[extended ? 8 : 4];
// The blobNumber has to be lesser than the numberOfBlobs // The first four (eight) bytes are the offset of the zeroth blob
assert article.blob_number < numberOfBlobs; is.read(buffer);
if (article.blob_number == 0) { long firstOffset = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (article.blob_number - 1) * 4;
RandomAccessFileZIMInputStream.skipFully(xzReader, location);
xzReader.read(buffer);
offset1 = RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
}
xzReader.read(buffer); // The number of blobs can be computed by the offset
offset2 = RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); // the actual number is one less because there is one more offset entry than the actual number
differenceOffset = offset2 - offset1; // to identify the end of the last blob.
byte[] entry = new byte[differenceOffset]; long numberOfBlobs1 = extended ? firstOffset / 8 : firstOffset / 4;
RandomAccessFileZIMInputStream.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
xzReader.read(entry, 0, differenceOffset);
return entry; // The blobNumber has to be lesser than the numberOfBlobs - 1
// the blob numbers start with 0 even if the documentation states it is "the first blob".
assert blob_number < numberOfBlobs1 - 1;
long offset1;
if (blob_number == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
// skip one less than required to get to the offset entry because the first entry is already read
RandomAccessFileZIMInputStream.skipFully(is, (blob_number - 1) * (extended ? 8 : 4));
is.read(buffer);
offset1 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
} }
is.read(buffer);
// case 5: zstd compressed (missing!) long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
return null; long blob_size = offset2 - offset1;
byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT
// we must do two skip steps: first to the end of the offset list and second to the start of the blob
// - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset
// - the number of offset entries that we alreay read now is article.blob_number + 2 (in any case at least 2)
// - the remaining number of offset entries to skip is therefore numberOfBlobs1 - (article.blob_number + 2)
// - the addon skip of number of bytes to the start of the entry is offset1 - firstoffset with firstoffset = 4 * numberOfBlobs1
// - the full skip length is 4 * (numberOfBlobs1 - (article.blob_number + 2)) + offset1 - 4 * numberOfBlobs1
// = offset1 - 4 * (article.blob_number + 2)
RandomAccessFileZIMInputStream.skipFully(is, (offset1 - (extended ? 8 : 4) * (blob_number + 2)));
is.read(entry, 0, entry.length);
return entry;
} }
} }

@ -20,7 +20,6 @@ package org.openzim;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.List;
import org.openzim.ZIMReader.DirectoryEntry; import org.openzim.ZIMReader.DirectoryEntry;
@ -40,12 +39,10 @@ public class ZIMTest {
final ZIMReader zReader = new ZIMReader(file); final ZIMReader zReader = new ZIMReader(file);
// print a list of urls and titles // print a list of urls and titles
final List<String> urls = zReader.getURLListByURL(); int c = Math.min(10, file.header_entryCount);
final List<String> titles = zReader.getURLListByTitle();
int c = Math.min(10, titles.size());
for (int i = 0; i < c; i++) { for (int i = 0; i < c; i++) {
System.out.println("URL by URL " + i + ": " + urls.get(i)); System.out.println("URL by URL " + i + ": " + zReader.getURLByURLOrder(i));
System.out.println("URL by Title " + i + ": " + titles.get(i)); System.out.println("URL by Title " + i + ": " + zReader.getURLByTitleOrder(i));
DirectoryEntry entry = zReader.getDirectoryInfo(i); DirectoryEntry entry = zReader.getDirectoryInfo(i);
System.out.println("URL by Pos " + i + ": " + entry.url); System.out.println("URL by Pos " + i + ": " + entry.url);
System.out.println("Title by Pos " + i + ": " + entry.title); System.out.println("Title by Pos " + i + ": " + entry.title);

Loading…
Cancel
Save