refactoring of ZIM reader, simplification, removed unnecessary code

pull/607/head
Michael Peter Christen 1 year ago
parent c2b6b6e7b9
commit c4082c4ff2

@ -29,78 +29,69 @@ import java.io.RandomAccessFile;
*
* @author Arunesh Mathur <aruneshmathur1990 at gmail.com>
* @author Michael Christen
* bugfix to long parsing (return value was int)
* bugfix to long parsing (return value was int),
* moved conditions for exceptions to asserts,
* refactoring and merge with Utilities
*/
public class RandomAcessFileZIMInputStream extends InputStream {
private final RandomAccessFile mRAFReader;
private long mMarked = -1;
private final byte[] buffer2 = new byte[2];
private final byte[] buffer4 = new byte[4];
private final byte[] buffer8 = new byte[8];
public RandomAcessFileZIMInputStream(final RandomAccessFile reader) {
this.mRAFReader = reader;
}
// TODO: Remove the parameter buffer
public int readTwoLittleEndianBytesInt(final byte[] buffer) throws IOException {
if (buffer.length < 2) {
throw new OutOfMemoryError("buffer too small");
} else {
this.mRAFReader.read(buffer, 0, 2);
return Utilities.toTwoLittleEndianInteger(buffer);
}
/**
 * Reads two bytes at the current file position and decodes them as an
 * unsigned little-endian value.
 *
 * @return the decoded value (0..65535)
 * @throws java.io.EOFException if the file ends before two bytes could be read
 * @throws IOException if the underlying file cannot be read
 */
public int readTwoLittleEndianBytesInt() throws IOException {
    // read() may deliver fewer bytes than requested; readFully either fills
    // the scratch buffer completely or fails, so stale bytes are never decoded
    this.mRAFReader.readFully(this.buffer2, 0, 2);
    return toTwoLittleEndianInteger(this.buffer2);
}
// TODO: Remove the parameter buffer
public int readFourLittleEndianBytesInt(final byte[] buffer) throws IOException {
if (buffer.length < 4) {
throw new OutOfMemoryError("buffer too small");
} else {
this.mRAFReader.read(buffer, 0, 4);
return Utilities.toFourLittleEndianInteger(buffer);
}
/**
 * Reads four bytes at the current file position and decodes them as a
 * little-endian 32-bit integer.
 *
 * @return the decoded value; negative when the most significant bit of the
 *         fourth byte is set
 * @throws java.io.EOFException if the file ends before four bytes could be read
 * @throws IOException if the underlying file cannot be read
 */
public int readFourLittleEndianBytesInt() throws IOException {
    // read() may deliver fewer bytes than requested; readFully either fills
    // the scratch buffer completely or fails, so stale bytes are never decoded
    this.mRAFReader.readFully(this.buffer4, 0, 4);
    return toFourLittleEndianInteger(this.buffer4);
}
// TODO: Remove the parameter buffer
public long readEightLittleEndianBytesLong(final byte[] buffer)
throws IOException {
if (buffer.length < 8) {
throw new OutOfMemoryError("buffer too small");
} else {
this.mRAFReader.read(buffer, 0, 8);
return Utilities.toEightLittleEndianLong(buffer);
}
/**
 * Reads eight bytes at the current file position and decodes them as a
 * little-endian 64-bit integer.
 *
 * @return the decoded value
 * @throws java.io.EOFException if the file ends before eight bytes could be read
 * @throws IOException if the underlying file cannot be read
 */
public long readEightLittleEndianBytesLong() throws IOException {
    // read() may deliver fewer bytes than requested; readFully either fills
    // the scratch buffer completely or fails, so stale bytes are never decoded
    this.mRAFReader.readFully(this.buffer8, 0, 8);
    return toEightLittleEndianLong(this.buffer8);
}
// TODO: Remove the parameter buffer
public long readSixteenLittleEndianBytesLong(final byte[] buffer)
throws IOException {
if (buffer.length < 16) {
throw new OutOfMemoryError("buffer too small");
} else {
this.mRAFReader.read(buffer, 0, 16);
return Utilities.toSixteenLittleEndianLong(buffer);
}
/**
 * Decodes the first two bytes of {@code buffer} as an unsigned
 * little-endian value.
 *
 * @param buffer source bytes; at least two entries are read
 * @return the decoded value (0..65535)
 */
private static int toTwoLittleEndianInteger(final byte[] buffer) {
    // mask each byte so that sign extension cannot leak into the result
    final int lowByte = buffer[0] & 0xFF;
    final int highByte = buffer[1] & 0xFF;
    return (highByte << 8) | lowByte;
}
/**
 * Decodes the first four bytes of {@code buffer} as a little-endian
 * 32-bit integer.
 *
 * @param buffer source bytes; at least four entries are read
 * @return the decoded value
 */
// TODO: make private
public static int toFourLittleEndianInteger(final byte[] buffer) {
    int value = 0;
    for (int index = 0; index < 4; index++) {
        // byte at position n contributes bits 8n..8n+7
        value |= (buffer[index] & 0xFF) << (8 * index);
    }
    return value;
}
/**
 * Decodes the first eight bytes of {@code buffer} as a little-endian
 * 64-bit integer.
 *
 * @param buffer source bytes; at least eight entries are read
 * @return the decoded value
 */
private static long toEightLittleEndianLong(final byte[] buffer) {
    long value = 0L;
    for (int index = 0; index < 8; index++) {
        // widen to long before shifting, otherwise the shift is done in int arithmetic
        final long unsignedByte = buffer[index] & 0xFF;
        value |= unsignedByte << (8 * index);
    }
    return value;
}
/**
 * Skips exactly {@code bytes} bytes of {@code stream}.
 * <p>
 * {@link InputStream#skip(long)} is allowed to skip fewer bytes than
 * requested, including none at all, so it is called repeatedly. When a call
 * makes no progress a single byte is read instead; this both advances the
 * stream and detects its end, which a plain retry loop cannot do.
 *
 * @param stream the stream to advance
 * @param bytes  number of bytes to skip
 * @throws java.io.EOFException if the stream ends before {@code bytes} bytes were skipped
 * @throws IOException if the stream cannot be read
 */
public static void skipFully(final InputStream stream, final long bytes) throws IOException {
    long skipped = stream.skip(bytes);
    while (skipped < bytes) {
        final long step = stream.skip(bytes - skipped);
        if (step > 0) {
            skipped += step;
        } else if (stream.read() >= 0) {
            // skip() made no progress but the stream still has data
            skipped++;
        } else {
            throw new java.io.EOFException(
                    "stream ended after skipping " + skipped + " of " + bytes + " bytes");
        }
    }
}
/**
 * Reads bytes from the current file position up to the next zero byte and
 * returns them as a string. The terminating zero is consumed but not part
 * of the result.
 * <p>
 * The bytes are decoded as UTF-8, the encoding the openZIM format uses for
 * its zero-terminated strings; casting each byte to a {@code char} would
 * corrupt every non-ASCII character.
 *
 * @return the decoded string, empty if the first byte read is zero
 * @throws java.io.EOFException if the file ends before a terminating zero byte
 * @throws IOException if the underlying file cannot be read
 */
public String readZeroTerminatedString() throws IOException {
    final java.io.ByteArrayOutputStream bytes = new java.io.ByteArrayOutputStream();
    int b = this.mRAFReader.read();
    while (b != '\0') {
        if (b < 0) {
            // read() reports end of file as -1, which never equals the
            // terminator; without this check the loop would not end
            throw new java.io.EOFException("end of file inside zero-terminated string");
        }
        bytes.write(b);
        b = this.mRAFReader.read();
    }
    return new String(bytes.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
}
@Override

@ -1,76 +0,0 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openzim;
import java.io.IOException;
import java.io.InputStream;
/**
 * Static helpers for decoding little-endian integers from byte buffers and
 * for skipping bytes of an input stream.
 *
 * @author Arunesh Mathur
 *
 * @author Michael Christen
 * int/long bugfix (did reading of long values with int variables, causing negative offsets)
 */
public class Utilities {

    /**
     * Decodes the first two bytes of {@code buffer} as an unsigned
     * little-endian value.
     *
     * @param buffer source bytes, at least two entries long
     * @return the decoded value (0..65535)
     * @throws IOException never thrown by this implementation; declared for
     *         compatibility with existing callers
     * @throws OutOfMemoryError if {@code buffer} has fewer than two entries
     */
    public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
        if (buffer.length < 2) {
            throw new OutOfMemoryError("buffer too small");
        }
        return (buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8);
    }

    /**
     * Decodes the first four bytes of {@code buffer} as a little-endian
     * 32-bit integer.
     *
     * @param buffer source bytes, at least four entries long
     * @return the decoded value
     * @throws IOException never thrown by this implementation; declared for
     *         compatibility with existing callers
     * @throws OutOfMemoryError if {@code buffer} has fewer than four entries
     */
    public static int toFourLittleEndianInteger(final byte[] buffer) throws IOException {
        if (buffer.length < 4) {
            throw new OutOfMemoryError("buffer too small");
        }
        return (buffer[0] & 0xFF)
                | ((buffer[1] & 0xFF) << 8)
                | ((buffer[2] & 0xFF) << 16)
                | ((buffer[3] & 0xFF) << 24);
    }

    /**
     * Decodes the first eight bytes of {@code buffer} as a little-endian
     * 64-bit integer.
     *
     * @param buffer source bytes, at least eight entries long
     * @return the decoded value
     * @throws IOException never thrown by this implementation; declared for
     *         compatibility with existing callers
     * @throws OutOfMemoryError if {@code buffer} has fewer than eight entries
     */
    public static long toEightLittleEndianLong(final byte[] buffer) throws IOException {
        if (buffer.length < 8) {
            throw new OutOfMemoryError("buffer too small");
        }
        // the cast to long is required, otherwise the shifts are done in int arithmetic
        return (long) (buffer[0] & 0xFF)
                | ((long) (buffer[1] & 0xFF) << 8)
                | ((long) (buffer[2] & 0xFF) << 16)
                | ((long) (buffer[3] & 0xFF) << 24)
                | ((long) (buffer[4] & 0xFF) << 32)
                | ((long) (buffer[5] & 0xFF) << 40)
                | ((long) (buffer[6] & 0xFF) << 48)
                | ((long) (buffer[7] & 0xFF) << 56);
    }

    /**
     * Decodes only the first eight bytes of {@code buffer}; a Java
     * {@code long} cannot hold a sixteen-byte value, so any further bytes
     * are ignored.
     *
     * @param buffer source bytes, at least eight entries long
     * @return the value of the first eight bytes, little-endian
     * @throws IOException never thrown by this implementation; declared for
     *         compatibility with existing callers
     * @throws OutOfMemoryError if {@code buffer} has fewer than eight entries
     */
    public static long toSixteenLittleEndianLong(final byte[] buffer) throws IOException {
        return toEightLittleEndianLong(buffer); // there are no sixteen-byte long values
    }

    /**
     * Skips exactly {@code bytes} bytes of {@code stream}.
     * <p>
     * {@link InputStream#skip(long)} is allowed to skip fewer bytes than
     * requested, including none at all, so it is called repeatedly. When a
     * call makes no progress a single byte is read instead; this both
     * advances the stream and detects its end, which a plain retry loop
     * cannot do.
     *
     * @param stream the stream to advance
     * @param bytes  number of bytes to skip
     * @throws java.io.EOFException if the stream ends before {@code bytes} bytes were skipped
     * @throws IOException if the stream cannot be read
     */
    public static void skipFully(final InputStream stream, final long bytes) throws IOException {
        long skipped = stream.skip(bytes);
        while (skipped < bytes) {
            final long step = stream.skip(bytes - skipped);
            if (step > 0) {
                skipped += step;
            } else if (stream.read() >= 0) {
                // skip() made no progress but the stream still has data
                skipped++;
            } else {
                throw new java.io.EOFException(
                        "stream ended after skipping " + skipped + " of " + bytes + " bytes");
            }
        }
    }
}

@ -44,7 +44,6 @@ public class ZIMFile extends File {
public final int header_magicNumber;
public final int header_majorVersion;
public final int header_minorVersion;
public final long header_uuid;
public final int header_entryCount;
public final int header_clusterCount;
public final long header_urlPtrPos;
@ -69,34 +68,33 @@ public class ZIMFile extends File {
// The reader that will be used to read contents from the file
final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this, "r"));
final byte[] buffer = new byte[16];
// Read the contents of the header
this.header_magicNumber = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_majorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2
this.header_minorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 4
this.header_uuid = reader.readSixteenLittleEndianBytesLong(buffer); // 16
this.header_entryCount = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_clusterCount = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_mimeListPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_mainPage = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_layoutPage = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_checksumPos = reader.readEightLittleEndianBytesLong(buffer); // 8 [FIX!]
this.header_magicNumber = reader.readFourLittleEndianBytesInt(); // 4
this.header_majorVersion = reader.readTwoLittleEndianBytesInt(); // 2
this.header_minorVersion = reader.readTwoLittleEndianBytesInt(); // 4
RandomAcessFileZIMInputStream.skipFully(reader, 16); // skip the uuid, this is not used
this.header_entryCount = reader.readFourLittleEndianBytesInt(); // 4
this.header_clusterCount = reader.readFourLittleEndianBytesInt(); // 4
this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(); // 8
this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(); // 8
this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(); // 8
this.header_mimeListPos = reader.readEightLittleEndianBytesLong(); // 8
this.header_mainPage = reader.readFourLittleEndianBytesInt(); // 4
this.header_layoutPage = reader.readFourLittleEndianBytesInt(); // 4
this.header_checksumPos = reader.readEightLittleEndianBytesLong(); // 8 [FIX!]
// Initialise the MIMETypeList
int len = 0;
StringBuffer mimeBuffer = null;
this.mimeList = new ArrayList<>();
while (true) {
reader.read(buffer, 0, 1); // read only one byte to check if this is a zero
int b = reader.read(); // read only one byte to check if this is a zero
len = 0;
mimeBuffer = new StringBuffer();
while (buffer[0] != '\0') {
mimeBuffer.append((char) buffer[0]);
reader.read(buffer, 0, 1);
while (b != '\0') {
mimeBuffer.append((char) b);
b = reader.read();
len++;
}
if (len == 0) {

@ -115,8 +115,6 @@ public class ZIMReader {
int i = 0, mimeType;
final byte[] buffer = new byte[8];
// The list that will eventually return the list of URL's
final ArrayList<String> returnList = new ArrayList<>();
@ -126,7 +124,7 @@ public class ZIMReader {
for (i = 0; i < this.mFile.header_entryCount; i++) {
// The position of URL i
long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
long pos = this.mReader.readEightLittleEndianBytesLong();
// Mark the current position that we need to return to
this.mReader.mark();
@ -135,7 +133,7 @@ public class ZIMReader {
this.mReader.seek(pos);
// Article or Redirect entry?
mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
mimeType = this.mReader.readTwoLittleEndianBytesInt();
if (mimeType == 65535) {
this.mReader.seek(pos + 12);
@ -156,8 +154,6 @@ public class ZIMReader {
int i = 0, mimeType, articleNumber;
final byte[] buffer = new byte[8];
// The list that will eventually return the list of URL's
final ArrayList<String> returnList = new ArrayList<>();
@ -170,7 +166,7 @@ public class ZIMReader {
for (i = 0; i < this.mFile.header_entryCount; i++) {
// The articleNumber of the position of URL i
articleNumber = this.mReader.readFourLittleEndianBytesInt(buffer);
articleNumber = this.mReader.readFourLittleEndianBytesInt();
// Mark the current position that we need to return to
this.mReader.mark();
@ -178,11 +174,11 @@ public class ZIMReader {
this.mReader.seek(urlPtrPos + (8L * (articleNumber)));
// The position of URL i
long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
long pos = this.mReader.readEightLittleEndianBytesLong();
this.mReader.seek(pos);
// Article or Redirect entry?
mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
mimeType = this.mReader.readTwoLittleEndianBytesInt();
if (mimeType == 65535) {
this.mReader.seek(pos + 12);
@ -204,43 +200,40 @@ public class ZIMReader {
// position must be the seek position for the title in the Title Pointer List
private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException {
// Helpers
final byte[] buffer = new byte[8];
// At the appropriate position in the titlePtrPos
this.mReader.seek(position);
// Get value of article at index
int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt(buffer);
int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt();
// Move to the position in urlPtrPos
this.mReader.seek(this.mFile.header_urlPtrPos + 8 * pointer_to_the_URL_pointer);
// Get value of article in urlPtrPos
long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong(buffer);
long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong();
// Go to the location of the directory entry
this.mReader.seek(pointer_to_the_directory_entry);
// read the Content Entry
final int type = this.mReader.readTwoLittleEndianBytesInt(buffer); // 2, 0xffff for redirect
this.mReader.read(); // 1, ignore, parameter length not used
final char namespace = (char) this.mReader.read(); // 1
this.mReader.readFourLittleEndianBytesInt(buffer); // 4, ignore, revision not used
final int type = this.mReader.readTwoLittleEndianBytesInt(); // 2, 0xffff for redirect
this.mReader.read(); // 1, ignore, parameter length not used
final char namespace = (char) this.mReader.read(); // 1
this.mReader.readFourLittleEndianBytesInt(); // 4, ignore, revision not used
// Article or Redirect entry
if (type == 65535) {
final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(buffer);
final int redirectIndex = this.mReader.readFourLittleEndianBytesInt();
final String url = this.mReader.readZeroTerminatedString();
String title = this.mReader.readZeroTerminatedString();
title = title.equals("") ? url : title;
return new RedirectEntry(type, namespace, redirectIndex,
url, title, (position - this.mFile.header_urlPtrPos) / 8);
} else {
final int cluster_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
final int blob_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
final String url = this.mReader.readZeroTerminatedString(); // zero terminated
String title = this.mReader.readZeroTerminatedString(); // zero terminated
final int cluster_number = this.mReader.readFourLittleEndianBytesInt(); // 4
final int blob_number = this.mReader.readFourLittleEndianBytesInt(); // 4
final String url = this.mReader.readZeroTerminatedString(); // zero terminated
String title = this.mReader.readZeroTerminatedString(); // zero terminated
title = title.equals("") ? url : title;
return new ArticleEntry(
@ -288,7 +281,7 @@ public class ZIMReader {
return null;
}
public ByteArrayOutputStream getArticleData(final DirectoryEntry directoryInfo) throws IOException {
public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {
// fail fast
if (directoryInfo == null) return null;
@ -301,8 +294,7 @@ public class ZIMReader {
this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8);
// Read the location of the cluster
byte[] buffer = new byte[8];
final long clusterPos = this.mReader.readEightLittleEndianBytesLong(buffer);
final long clusterPos = this.mReader.readEightLittleEndianBytesLong();
// Move to the cluster
this.mReader.seek(clusterPos);
@ -311,32 +303,13 @@ public class ZIMReader {
final int compressionType = this.mReader.read();
// Reference declaration
SingleXZInputStream xzReader = null;
int firstOffset, numberOfBlobs, offset1,
offset2,
location,
differenceOffset;
ByteArrayOutputStream baos;
int firstOffset, numberOfBlobs, offset1, offset2, location, differenceOffset;
// Check the compression type that was read
switch (compressionType) {
// TODO: Read uncompressed data directly
case 0:
case 1:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
// Read the first offset
this.mReader.read(buffer);
if (compressionType == 1) {
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities.toFourLittleEndianInteger(buffer);
firstOffset = this.mReader.readFourLittleEndianBytesInt();
// The number of blobs
numberOfBlobs = firstOffset / 4;
@ -348,38 +321,31 @@ public class ZIMReader {
offset1 = firstOffset;
} else {
location = (article.blob_number - 1) * 4;
Utilities.skipFully(this.mReader, location);
this.mReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
RandomAcessFileZIMInputStream.skipFully(this.mReader, location);
offset1 = this.mReader.readFourLittleEndianBytesInt();
}
this.mReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
offset2 = this.mReader.readFourLittleEndianBytesInt();
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
this.mReader.read(buffer, 0, differenceOffset);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
return baos;
byte[] entry = new byte[differenceOffset];
RandomAcessFileZIMInputStream.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
this.mReader.read(entry, 0, differenceOffset);
return entry;
}
// 2 for zlib and 3 for bzip2 (removed)
// LZMA2 compressed data
case 4:
if (compressionType == 4) {
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
byte[] buffer = new byte[4];
// Create a dictionary with size 40MiB, the zimlib uses this size while creating
xzReader = new SingleXZInputStream(this.mReader, 4194304);
// Read the first offset
xzReader.read(buffer);
SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 4194304);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities.toFourLittleEndianInteger(buffer);
firstOffset = this.mReader.readFourLittleEndianBytesInt();
// The number of blobs
numberOfBlobs = firstOffset / 4;
@ -391,25 +357,23 @@ public class ZIMReader {
offset1 = firstOffset;
} else {
location = (article.blob_number - 1) * 4;
Utilities.skipFully(xzReader, location);
RandomAcessFileZIMInputStream.skipFully(xzReader, location);
xzReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
offset1 = RandomAcessFileZIMInputStream.toFourLittleEndianInteger(buffer);
}
xzReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
offset2 = RandomAcessFileZIMInputStream.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
xzReader.read(buffer, 0, differenceOffset);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
return baos;
byte[] entry = new byte[differenceOffset];
RandomAcessFileZIMInputStream.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
xzReader.read(entry, 0, differenceOffset);
// case 5: zstd compressed (missing!)
default:
return null;
return entry;
}
// case 5: zstd compressed (missing!)
return null;
}
}

@ -18,7 +18,6 @@
package org.openzim;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
@ -55,8 +54,8 @@ public class ZIMTest {
// print article c-1
DirectoryEntry directory_entry = zReader.getDirectoryInfo(c - 1);
ByteArrayOutputStream articleStream = zReader.getArticleData(directory_entry);
String article = articleStream == null ? "NULL" : articleStream.toString(StandardCharsets.UTF_8.name());
byte[] articleBytes = zReader.getArticleData(directory_entry);
String article = articleBytes == null ? "NULL" : new String(articleBytes, StandardCharsets.UTF_8);
System.out.println(article);
} catch (final IOException e) {
e.printStackTrace();

Loading…
Cancel
Save