Fixed a large number of problems in the ZIM reader.

This library could not handle large files because it used int instead of
long data types for file pointers. I had to modify the code base in a
fundamental way:
- Proof-Reading,
- unclustering,
- refactoring,
- naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
- change of Exception handling,
- extension to more attributes as defined in spec (bugfix for mime type
loading)
- bugfix to long parsing (prevented reading of large files)
Furthermore, the code is still very inefficient and requires more attention.
However, the format is very useful for YaCy because there are numerous data
sources that provide ZIM files.
pull/607/head
Michael Peter Christen 1 year ago
parent 5ba5fb5d23
commit c2b6b6e7b9

@ -1,46 +0,0 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openzim;
/**
 * A {@link DirectoryEntry} that additionally carries a cluster number and a
 * blob number.
 */
public class ArticleEntry extends DirectoryEntry {

    int clusterNumber;
    int blobnumber;

    /**
     * Passes the common values on to {@link DirectoryEntry} and keeps the
     * cluster and blob numbers in this object.
     *
     * @param mimeType      MIME type value of the entry
     * @param namespace     namespace character of the entry
     * @param revision      revision value of the entry
     * @param clusterNumber cluster number returned by {@link #getClusterNumber()}
     * @param blobNumber    blob number returned by {@link #getBlobnumber()}
     * @param url           URL of the entry
     * @param title         title of the entry
     * @param urlListindex  index value handed to the superclass
     */
    public ArticleEntry(final int mimeType, final char namespace, final int revision,
            final int clusterNumber, final int blobNumber, final String url, final String title,
            final int urlListindex) {
        super(mimeType, namespace, revision, url, title, urlListindex);
        this.blobnumber = blobNumber;
        this.clusterNumber = clusterNumber;
    }

    /** @return the cluster number given at construction */
    public int getClusterNumber() {
        return clusterNumber;
    }

    /** @return the blob number given at construction */
    public int getBlobnumber() {
        return blobnumber;
    }
}

@ -1,69 +0,0 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openzim;
/**
 * Common base for directory entries: stores the values shared by every
 * entry kind and exposes each of them through a getter.
 */
public abstract class DirectoryEntry {

    int mimeType;
    char namespace;
    int revision;
    String url;
    String title;
    int urlListindex;

    /**
     * Stores all given values unchanged.
     *
     * @param mimeType  MIME type value of the entry
     * @param namespace namespace character of the entry
     * @param revision  revision value of the entry
     * @param url       URL of the entry
     * @param title     title of the entry
     * @param index     value returned by {@link #getUrlListindex()}
     */
    public DirectoryEntry(final int mimeType, final char namespace, final int revision,
            final String url, final String title, final int index) {
        this.urlListindex = index;
        this.title = title;
        this.url = url;
        this.revision = revision;
        this.namespace = namespace;
        this.mimeType = mimeType;
    }

    /** @return the MIME type value given at construction */
    public int getMimeType() {
        return mimeType;
    }

    /** @return the namespace character given at construction */
    public char getNamespace() {
        return namespace;
    }

    /** @return the revision value given at construction */
    public int getRevision() {
        return revision;
    }

    /** @return the URL given at construction */
    public String getUrl() {
        return url;
    }

    /** @return the title given at construction */
    public String getTitle() {
        return title;
    }

    /** @return the index value given at construction */
    public int getUrlListindex() {
        return urlListindex;
    }
}

@ -28,6 +28,8 @@ import java.io.RandomAccessFile;
* implementation, can be improved.
*
* @author Arunesh Mathur <aruneshmathur1990 at gmail.com>
* @author Michael Christen
* bugfix to long parsing (return value was int)
*/
public class RandomAcessFileZIMInputStream extends InputStream {
@ -41,7 +43,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
}
// TODO: Remove the parameter buffer
public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException {
public int readTwoLittleEndianBytesInt(final byte[] buffer) throws IOException {
if (buffer.length < 2) {
throw new OutOfMemoryError("buffer too small");
} else {
@ -51,7 +53,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
}
// TODO: Remove the parameter buffer
public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException {
public int readFourLittleEndianBytesInt(final byte[] buffer) throws IOException {
if (buffer.length < 4) {
throw new OutOfMemoryError("buffer too small");
} else {
@ -61,30 +63,30 @@ public class RandomAcessFileZIMInputStream extends InputStream {
}
// TODO: Remove the parameter buffer
public int readEightLittleEndianBytesValue(final byte[] buffer)
public long readEightLittleEndianBytesLong(final byte[] buffer)
throws IOException {
if (buffer.length < 8) {
throw new OutOfMemoryError("buffer too small");
} else {
this.mRAFReader.read(buffer, 0, 8);
return Utilities.toEightLittleEndianInteger(buffer);
return Utilities.toEightLittleEndianLong(buffer);
}
}
// TODO: Remove the parameter buffer
public int readSixteenLittleEndianBytesValue(final byte[] buffer)
public long readSixteenLittleEndianBytesLong(final byte[] buffer)
throws IOException {
if (buffer.length < 16) {
throw new OutOfMemoryError("buffer too small");
} else {
this.mRAFReader.read(buffer, 0, 16);
return Utilities.toSixteenLittleEndianInteger(buffer);
return Utilities.toSixteenLittleEndianLong(buffer);
}
}
// Reads characters from the current position into a String and stops when a
// '\0' is encountered
public String readString() throws IOException {
public String readZeroTerminatedString() throws IOException {
final StringBuffer sb = new StringBuffer();
/*
* int i; byte[] buffer = new byte[100]; while (true) {
@ -92,8 +94,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
* (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i
* != buffer.length) break; } return sb.toString();
*/
int b;
b = this.mRAFReader.read();
int b = this.mRAFReader.read();
while (b != '\0') {
sb.append((char) b);
b = this.mRAFReader.read();

@ -1,37 +0,0 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openzim;
/**
 * A {@link DirectoryEntry} that additionally carries a redirect index.
 */
public class RedirectEntry extends DirectoryEntry {

    int redirectIndex;

    /**
     * Passes the common values on to {@link DirectoryEntry} and keeps the
     * redirect index in this object.
     *
     * @param mimeType      MIME type value of the entry
     * @param namespace     namespace character of the entry
     * @param revision      revision value of the entry
     * @param redirectIndex value returned by {@link #getRedirectIndex()}
     * @param url           URL of the entry
     * @param title         title of the entry
     * @param urlListindex  index value handed to the superclass
     */
    public RedirectEntry(final int mimeType, final char namespace, final int revision,
            final int redirectIndex, final String url, final String title, final int urlListindex) {
        super(mimeType, namespace, revision, url, title, urlListindex);
        this.redirectIndex = redirectIndex;
    }

    /** @return the redirect index given at construction */
    public int getRedirectIndex() {
        return redirectIndex;
    }
}

@ -22,18 +22,21 @@ package org.openzim;
import java.io.IOException;
import java.io.InputStream;
/**
* @author Arunesh Mathur
* Static helper methods for decoding little-endian values from byte buffers and for skipping stream bytes
*
* @author Michael Christen
* int/long bugfix (did reading of long values with int variables, causing negative offsets)
*/
public class Utilities {
/**
 * Placeholder for a binary search that has not been written yet.
 *
 * @return always {@code -1}
 */
// TODO: Write a binary search algorithm
public static int binarySearch() {
    final int notImplemented = -1;
    return notImplemented;
}
public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
if (buffer.length < 2) {
throw new OutOfMemoryError("buffer too small");
} else {
final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8));
final int result =
((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8));
return result;
}
}
@ -42,39 +45,28 @@ public class Utilities {
if (buffer.length < 4) {
throw new OutOfMemoryError("buffer too small");
} else {
final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
final int result =
((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24));
return result;
}
}
public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException {
public static long toEightLittleEndianLong(final byte[] buffer) throws IOException {
if (buffer.length < 8) {
throw new OutOfMemoryError("buffer too small");
} else {
final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)
| ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40)
| ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56));
final long result = // cast to long required otherwise this is again an integer
((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8)
| ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24)
| ((long)(buffer[4] & 0xFF) << 32) | ((long)(buffer[5] & 0xFF) << 40)
| ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56));
return result;
}
}
public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException {
if (buffer.length < 16) {
throw new OutOfMemoryError("buffer too small");
} else {
final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)
| ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40)
| ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56)
| ((buffer[8] & 0xFF) << 64) | ((buffer[9] & 0xFF) << 72)
| ((buffer[10] & 0xFF) << 80) | ((buffer[11] & 0xFF) << 88)
| ((buffer[12] & 0xFF) << 96)
| ((buffer[13] & 0xFF) << 104)
| ((buffer[14] & 0xFF) << 112) | ((buffer[15] & 0xFF) << 120));
return result;
}
/**
 * Decodes a value from a buffer that nominally holds sixteen little-endian
 * bytes. The work is handed to {@code toEightLittleEndianLong}, so the result
 * is whatever that eight-byte decoder produces for the same buffer.
 *
 * @param buffer the bytes to decode
 * @return the decoded value as a long
 * @throws IOException declared for symmetry with the other decoders
 */
public static long toSixteenLittleEndianLong(final byte[] buffer) throws IOException {
    // there are no sixteen-byte long values, so the eight-byte decoder is used
    final long value = toEightLittleEndianLong(buffer);
    return value;
}
public static void skipFully(final InputStream stream, final long bytes) throws IOException {

@ -20,46 +20,47 @@ package org.openzim;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;
/**
* @author Arunesh Mathur
*
* A ZIM file implementation that stores the Header and the MIMETypeList
*
* @author Michael Christen
* Proof-Reading, unclustering, refactoring,
* naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
* change of Exception handling,
* extension to more attributes as defined in spec (bugfix for mime type loading)
* int/long bugfix (did reading of long values with int variables, causing negative offsets)
*/
public class ZIMFile extends File {
/**
*
*/
private static final long serialVersionUID = 1L;
private Header mHeader;
private List<String> mMIMETypeList; // Can be removed if not needed
public ZIMFile(final String path) {
// Header values
public final int header_magicNumber;
public final int header_majorVersion;
public final int header_minorVersion;
public final long header_uuid;
public final int header_entryCount;
public final int header_clusterCount;
public final long header_urlPtrPos;
public final long header_titlePtrPos;
public final long header_clusterPtrPos;
public final long header_mimeListPos;
public final int header_mainPage;
public final int header_layoutPage;
public final long header_checksumPos;
// content cache
public final List<String> mimeList;
public ZIMFile(final String path) throws IOException {
super(path);
try {
readHeader();
} catch (final FileNotFoundException e) {
e.printStackTrace();
}
}
private void readHeader() throws FileNotFoundException {
// Helpers
int len = 0;
StringBuffer mimeBuffer = null;
// The byte[] that will help us in reading bytes out of the file
final byte[] buffer = new byte[16];
// Check whether the file exists
if (!(this.exists())) {
throw new FileNotFoundException(
@ -67,132 +68,45 @@ public class ZIMFile extends File {
}
// The reader that will be used to read contents from the file
final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(
new RandomAccessFile(this, "r"));
// The ZIM file header
this.mHeader = new Header();
final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this, "r"));
final byte[] buffer = new byte[16];
// Read the contents of the header
try {
this.mHeader.magicNumber = reader.readFourLittleEndianBytesValue(buffer);
// System.out.println(mHeader.magicNumber);
this.mHeader.version = reader.readFourLittleEndianBytesValue(buffer);
// System.out.println(mHeader.version);
this.mHeader.uuid = reader.readSixteenLittleEndianBytesValue(buffer);
// System.out.println(mHeader.uuid); reader.read(buffer, 0, 4);
this.mHeader.articleCount = reader
.readFourLittleEndianBytesValue(buffer);
// System.out.println(mHeader.articleCount);
this.mHeader.clusterCount = reader
.readFourLittleEndianBytesValue(buffer);
// System.out.println(mHeader.clusterCount);
this.mHeader.urlPtrPos = reader.readEightLittleEndianBytesValue(buffer);
// System.out.println(mHeader.urlPtrPos);
this.mHeader.titlePtrPos = reader
.readEightLittleEndianBytesValue(buffer);
// System.out.println(mHeader.titlePtrPos);
this.mHeader.clusterPtrPos = reader
.readEightLittleEndianBytesValue(buffer);
// System.out.println(mHeader.clusterPtrPos);
this.mHeader.mimeListPos = reader
.readEightLittleEndianBytesValue(buffer);
// System.out.println(mHeader.mimeListPos);
this.mHeader.mainPage = reader.readFourLittleEndianBytesValue(buffer);
// System.out.println(mHeader.mainPage);
this.mHeader.layoutPage = reader.readFourLittleEndianBytesValue(buffer);
// System.out.println(mHeader.layoutPage);
// Initialise the MIMETypeList
this.mMIMETypeList = new ArrayList<>();
while (true) {
this.header_magicNumber = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_majorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2
this.header_minorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 4
this.header_uuid = reader.readSixteenLittleEndianBytesLong(buffer); // 16
this.header_entryCount = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_clusterCount = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_mimeListPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_mainPage = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_layoutPage = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_checksumPos = reader.readEightLittleEndianBytesLong(buffer); // 8 [FIX!]
// Initialise the MIMETypeList
int len = 0;
StringBuffer mimeBuffer = null;
this.mimeList = new ArrayList<>();
while (true) {
reader.read(buffer, 0, 1); // read only one byte to check if this is a zero
len = 0;
mimeBuffer = new StringBuffer();
while (buffer[0] != '\0') {
mimeBuffer.append((char) buffer[0]);
reader.read(buffer, 0, 1);
len = 0;
mimeBuffer = new StringBuffer();
while (buffer[0] != '\0') {
mimeBuffer.append((char) buffer[0]);
reader.read(buffer, 0, 1);
len++;
}
if (len == 0) {
break;
}
this.mMIMETypeList.add(mimeBuffer.toString());
// System.out.println(mimeBuffer);
len++;
}
} catch (final Exception e) {
e.printStackTrace();
if (len == 0) {
break;
}
String mimeType = mimeBuffer.toString();
System.out.println(mimeType);
this.mimeList.add(mimeType);
}
}
public int getVersion() {
return this.mHeader.version;
}
public int getUuid() {
return this.mHeader.uuid;
}
public int getArticleCount() {
return this.mHeader.articleCount;
}
public int getClusterCount() {
return this.mHeader.clusterCount;
}
public int getUrlPtrPos() {
return this.mHeader.urlPtrPos;
}
public int getTitlePtrPos() {
return this.mHeader.titlePtrPos;
}
public int getClusterPtrPos() {
return this.mHeader.clusterPtrPos;
}
public String getMIMEType(final int mimeNumber) {
return this.mMIMETypeList.get(mimeNumber);
}
public int getHeaderSize() {
return this.mHeader.mimeListPos;
}
public int getMainPage() {
return this.mHeader.mainPage;
}
public int getLayoutPage() {
return this.mHeader.layoutPage;
}
public class Header {
int magicNumber;
int version;
int uuid;
int articleCount;
int clusterCount;
int urlPtrPos;
int titlePtrPos;
int clusterPtrPos;
int mimeListPos;
int mainPage;
int layoutPage;
}
}

@ -29,28 +29,91 @@ import org.tukaani.xz.SingleXZInputStream;
/**
* @author Arunesh Mathur
*
* A ZIMReader that reads data from the ZIMFile
*
* @author Michael Christen
* Proof-Reading, unclustering, refactoring,
* naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
* change of Exception handling,
* extension to more attributes as defined in spec (bugfix for mime type loading)
* bugfix to long parsing (prevented reading of large files)
*/
public class ZIMReader {
private final ZIMFile mFile;
private RandomAcessFileZIMInputStream mReader;
/**
 * Immutable base for directory entries; all values are exposed as public
 * final fields.
 */
public static abstract class DirectoryEntry {

    public final int mimetype;
    public final char namespace;
    public final int cluster_number;
    public final String url;
    public final String title;
    public final long urlListindex;

    /**
     * Stores all given values unchanged.
     *
     * @param mimeType       stored in {@link #mimetype}
     * @param namespace      stored in {@link #namespace}
     * @param cluster_number stored in {@link #cluster_number}
     * @param url            stored in {@link #url}
     * @param title          stored in {@link #title}
     * @param index          stored in {@link #urlListindex}
     */
    public DirectoryEntry(
            final int mimeType, final char namespace,
            final int cluster_number,
            final String url, final String title,
            final long index) {
        mimetype = mimeType;
        urlListindex = index;
        this.namespace = namespace;
        this.cluster_number = cluster_number;
        this.title = title;
        this.url = url;
    }
}
/**
 * Directory entry that additionally carries a blob number.
 * <p>
 * The cluster number is not declared again here: it is handed to the
 * {@link DirectoryEntry} constructor and is available through the inherited
 * {@code cluster_number} field. Declaring a second field of the same name
 * would only hide the inherited one while holding the identical value.
 */
public static class ArticleEntry extends DirectoryEntry {

    public final int blob_number;

    /**
     * @param mimeType       MIME type value, passed to the superclass
     * @param namespace      namespace character, passed to the superclass
     * @param cluster_number cluster number, passed to the superclass
     * @param blob_number    stored in {@link #blob_number}
     * @param url            URL, passed to the superclass
     * @param title          title, passed to the superclass
     * @param urlListindex   index value, passed to the superclass
     */
    public ArticleEntry(
            final int mimeType, final char namespace,
            final int cluster_number, final int blob_number,
            final String url, final String title,
            final long urlListindex) {
        super(mimeType, namespace, cluster_number, url, title, urlListindex);
        this.blob_number = blob_number;
    }
}
/**
 * Directory entry that additionally carries a redirect index. The cluster
 * number handed to {@link DirectoryEntry} is always {@code 0}.
 */
public static class RedirectEntry extends DirectoryEntry {

    public final long redirect_index;

    /**
     * @param mimeType       MIME type value, passed to the superclass
     * @param namespace      namespace character, passed to the superclass
     * @param redirect_index stored in {@link #redirect_index}
     * @param url            URL, passed to the superclass
     * @param title          title, passed to the superclass
     * @param urlListindex   index value, passed to the superclass
     */
    public RedirectEntry(final int mimeType, final char namespace,
            final long redirect_index, final String url, final String title,
            final long urlListindex) {
        // a redirect has no cluster of its own, so 0 is passed as cluster number
        super(mimeType, namespace, 0, url, title, urlListindex);
        this.redirect_index = redirect_index;
    }
}
public ZIMReader(final ZIMFile file) {
this.mFile = file;
try {
this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(
this.mFile, "r"));
this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this.mFile, "r"));
} catch (final FileNotFoundException e) {
e.printStackTrace();
}
}
/**
 * @return the ZIM file this reader was constructed with
 */
public ZIMFile getZIMFile() {
    final ZIMFile file = this.mFile;
    return file;
}
// get a URL list that is sorted by the urls
public List<String> getURLListByURL() throws IOException {
int i = 0, pos, mimeType;
int i = 0, mimeType;
final byte[] buffer = new byte[8];
@ -58,12 +121,12 @@ public class ZIMReader {
final ArrayList<String> returnList = new ArrayList<>();
// Move to the spot where URL's are listed
this.mReader.seek(this.mFile.getUrlPtrPos());
this.mReader.seek(this.mFile.header_urlPtrPos);
for (i = 0; i < this.mFile.getArticleCount(); i++) {
for (i = 0; i < this.mFile.header_entryCount; i++) {
// The position of URL i
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
// Mark the current position that we need to return to
this.mReader.mark();
@ -72,14 +135,14 @@ public class ZIMReader {
this.mReader.seek(pos);
// Article or Redirect entry?
mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
if (mimeType == 65535) {
this.mReader.seek(pos + 12);
returnList.add(this.mReader.readString());
returnList.add(this.mReader.readZeroTerminatedString());
} else {
this.mReader.seek(pos + 16);
returnList.add(this.mReader.readString());
returnList.add(this.mReader.readZeroTerminatedString());
}
this.mReader.reset();
@ -88,9 +151,10 @@ public class ZIMReader {
return returnList;
}
// get a URL list that is sorted by the entry titles
public List<String> getURLListByTitle() throws IOException {
int i = 0, pos, mimeType, articleNumber, urlPtrPos;
int i = 0, mimeType, articleNumber;
final byte[] buffer = new byte[8];
@ -98,35 +162,35 @@ public class ZIMReader {
final ArrayList<String> returnList = new ArrayList<>();
// Get the UrlPtrPos or one time storage
urlPtrPos = this.mFile.getUrlPtrPos();
long urlPtrPos = this.mFile.header_urlPtrPos;
// Move to the spot where URL's are listed
this.mReader.seek(this.mFile.getTitlePtrPos());
this.mReader.seek(this.mFile.header_titlePtrPos);
for (i = 0; i < this.mFile.getArticleCount(); i++) {
for (i = 0; i < this.mFile.header_entryCount; i++) {
// The articleNumber of the position of URL i
articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
articleNumber = this.mReader.readFourLittleEndianBytesInt(buffer);
// Mark the current position that we need to return to
this.mReader.mark();
this.mReader.seek(urlPtrPos + (8 * (articleNumber)));
this.mReader.seek(urlPtrPos + (8L * (articleNumber)));
// The position of URL i
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
this.mReader.seek(pos);
// Article or Redirect entry?
mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
if (mimeType == 65535) {
this.mReader.seek(pos + 12);
final String url = this.mReader.readString();
final String url = this.mReader.readZeroTerminatedString();
returnList.add(url);
} else {
this.mReader.seek(pos + 16);
final String url = this.mReader.readString();
final String url = this.mReader.readZeroTerminatedString();
returnList.add(url);
}
@ -137,14 +201,69 @@ public class ZIMReader {
return returnList;
}
/**
 * Reads the directory entry that a slot of the Title Pointer List refers to.
 * <p>
 * The slot holds a 4-byte index into the URL pointer list; that list holds an
 * 8-byte file offset of the directory entry, which is then parsed.
 *
 * @param position the seek position for the title in the Title Pointer List
 * @return a {@link RedirectEntry} when the entry type is 0xffff, otherwise an
 *         {@link ArticleEntry}; an empty title is replaced by the url
 * @throws IOException if seeking or reading fails
 */
private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException {

    // Helpers
    final byte[] buffer = new byte[8];

    // At the appropriate position in the titlePtrPos
    this.mReader.seek(position);

    // Get value of article at index
    final int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt(buffer);

    // Move to the position in urlPtrPos.
    // 8L forces long arithmetic: "8 * int" would be evaluated as int and could
    // overflow before being added to the long base offset.
    this.mReader.seek(this.mFile.header_urlPtrPos + 8L * pointer_to_the_URL_pointer);

    // Get value of article in urlPtrPos
    final long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong(buffer);

    // Go to the location of the directory entry
    this.mReader.seek(pointer_to_the_directory_entry);

    // read the Content Entry
    final int type = this.mReader.readTwoLittleEndianBytesInt(buffer); // 2, 0xffff for redirect
    this.mReader.read(); // 1, ignore, parameter length not used
    final char namespace = (char) this.mReader.read(); // 1
    this.mReader.readFourLittleEndianBytesInt(buffer); // 4, ignore, revision not used

    // NOTE(review): position is an offset inside the title pointer list, yet the
    // index below is derived from header_urlPtrPos - confirm this is intended.
    final long urlListindex = (position - this.mFile.header_urlPtrPos) / 8;

    // Article or Redirect entry
    if (type == 65535) {
        final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
        final String url = this.mReader.readZeroTerminatedString(); // zero terminated
        String title = this.mReader.readZeroTerminatedString(); // zero terminated
        title = title.equals("") ? url : title;

        return new RedirectEntry(type, namespace, redirectIndex, url, title, urlListindex);
    } else {
        final int cluster_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
        final int blob_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
        final String url = this.mReader.readZeroTerminatedString(); // zero terminated
        String title = this.mReader.readZeroTerminatedString(); // zero terminated
        title = title.equals("") ? url : title;

        return new ArticleEntry(
                type, namespace,
                cluster_number, blob_number,
                url, title, urlListindex);
    }
}
/**
 * Loads the directory entry referenced by the given slot of the title pointer list.
 *
 * @param entryNumber zero-based slot number; must be below {@code header_entryCount}
 * @return the entry produced by {@code getDirectoryInfoAtTitlePosition} for that slot
 * @throws IOException if {@code entryNumber} is negative or not below the entry
 *         count, or if reading the file fails
 */
public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException {
    if (entryNumber < 0) throw new IOException("entryNumber must not be negative: " + entryNumber);
    if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount");
    // 4L forces long arithmetic: "4 * entryNumber" would be evaluated as int and
    // could overflow before being added to the long title pointer offset.
    return getDirectoryInfoAtTitlePosition(this.mFile.header_titlePtrPos + 4L * entryNumber);
}
// Gives the minimum required information needed for the given articleName
public DirectoryEntry getDirectoryInfo(String articleName, final char namespace)
throws IOException {
// This makes a binary search on the article name entry list.
public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException {
DirectoryEntry entry;
String cmpStr;
final int numberOfArticles = this.mFile.getArticleCount();
int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid;
final int numberOfArticles = this.mFile.header_entryCount;
long beg = this.mFile.header_titlePtrPos, end = beg + (numberOfArticles * 4), mid;
articleName = namespace + "/" + articleName;
@ -154,7 +273,7 @@ public class ZIMReader {
if (entry == null) {
return null;
}
cmpStr = entry.getNamespace() + "/" + entry.getUrl();
cmpStr = entry.namespace + "/" + entry.url;
if (articleName.compareTo(cmpStr) < 0) {
end = mid - 4;
@ -167,242 +286,130 @@ public class ZIMReader {
}
return null;
}
public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException {
// search in the cache first, if not found, then call getDirectoryInfo(articleName)
byte[] buffer = new byte[8];
final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace);
if (mainEntry != null) {
// Check what kind of an entry was mainEnrty
if (mainEntry.getClass() == ArticleEntry.class) {
// Cast to ArticleEntry
final ArticleEntry article = (ArticleEntry) mainEntry;
// Get the cluster and blob numbers from the article
final int clusterNumber = article.getClusterNumber();
final int blobNumber = article.getBlobnumber();
// Move to the cluster entry in the clusterPtrPos
this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8);
// Read the location of the cluster
final int clusterPos = this.mReader
.readEightLittleEndianBytesValue(buffer);
// Move to the cluster
this.mReader.seek(clusterPos);
public ByteArrayOutputStream getArticleData(final DirectoryEntry directoryInfo) throws IOException {
// Read the first byte, for compression information
final int compressionType = this.mReader.read();
// fail fast
if (directoryInfo == null) return null;
if (directoryInfo.getClass() != ArticleEntry.class) return null;
// Reference declaration
SingleXZInputStream xzReader = null;
int firstOffset, numberOfBlobs, offset1,
offset2,
location,
differenceOffset;
// This is now an article, so thus we can cast to ArticleEntry
final ArticleEntry article = (ArticleEntry) directoryInfo;
ByteArrayOutputStream baos;
// Move to the cluster entry in the clusterPtrPos
this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8);
// Check the compression type that was read
switch (compressionType) {
// TODO: Read uncompressed data directly
case 0:
case 1:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
// Read the first offset
this.mReader.read(buffer);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);
// The number of blobs
numberOfBlobs = firstOffset / 4;
// The blobNumber has to be lesser than the numberOfBlobs
assert blobNumber < numberOfBlobs;
if (blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (blobNumber - 1) * 4;
Utilities.skipFully(this.mReader, location);
this.mReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
this.mReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(this.mReader,
(offset1 - 4 * (blobNumber + 2)));
this.mReader.read(buffer, 0, differenceOffset);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
return baos;
// LZMA2 compressed data
case 4:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
xzReader = new SingleXZInputStream(this.mReader, 4194304);
// Read the first offset
xzReader.read(buffer);
// Read the location of the cluster
byte[] buffer = new byte[8];
final long clusterPos = this.mReader.readEightLittleEndianBytesLong(buffer);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);
// Move to the cluster
this.mReader.seek(clusterPos);
// The number of blobs
numberOfBlobs = firstOffset / 4;
// Read the first byte, for compression information
final int compressionType = this.mReader.read();
// The blobNumber has to be lesser than the numberOfBlobs
assert blobNumber < numberOfBlobs;
// Reference declaration
SingleXZInputStream xzReader = null;
int firstOffset, numberOfBlobs, offset1,
offset2,
location,
differenceOffset;
if(blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
ByteArrayOutputStream baos;
location = (blobNumber - 1) * 4;
Utilities.skipFully(xzReader, location);
xzReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
// Check the compression type that was read
switch (compressionType) {
xzReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
// TODO: Read uncompressed data directly
case 0:
case 1:
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
Utilities.skipFully(xzReader,
(offset1 - 4 * (blobNumber + 2)));
// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
xzReader.read(buffer, 0, differenceOffset);
// Read the first offset
this.mReader.read(buffer);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities.toFourLittleEndianInteger(buffer);
return baos;
// The number of blobs
numberOfBlobs = firstOffset / 4;
}
// The blobNumber has to be lesser than the numberOfBlobs
assert article.blob_number < numberOfBlobs;
if (article.blob_number == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (article.blob_number - 1) * 4;
Utilities.skipFully(this.mReader, location);
this.mReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
}
return null;
}
public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position)
throws IOException {
// Helpers
int pos;
final byte[] buffer = new byte[8];
// At the appropriate position in the titlePtrPos
this.mReader.seek(position);
// Get value of article at index
pos = this.mReader.readFourLittleEndianBytesValue(buffer);
// Move to the position in urlPtrPos
this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos);
// Get value of article in urlPtrPos
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
// Go to the location of the directory entry
this.mReader.seek(pos);
final int type = this.mReader.readTwoLittleEndianBytesValue(buffer);
// Ignore the parameter length
this.mReader.read();
final char namespace = (char) this.mReader.read();
// System.out.println("Namepsace: " + namespace);
final int revision = this.mReader.readFourLittleEndianBytesValue(buffer);
// System.out.println("Revision: " + revision);
this.mReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
this.mReader.read(buffer, 0, differenceOffset);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
// TODO: Remove redundant if condition code
// Article or Redirect entry
if (type == 65535) {
// System.out.println("MIMEType: " + type);
final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer);
// System.out.println("RedirectIndex: " + redirectIndex);
return baos;
final String url = this.mReader.readString();
// System.out.println("URL: " + url);
// 2 for zlib and 3 for bzip2 (removed)
String title = this.mReader.readString();
title = title.equals("") ? url : title;
// System.out.println("Title: " + title);
return new RedirectEntry(type, namespace, revision, redirectIndex,
url, title, (position - this.mFile.getUrlPtrPos()) / 8);
// LZMA2 compressed data
case 4:
} else {
// Read the first 4 bytes to find out the number of artciles
buffer = new byte[4];
// System.out.println("MIMEType: " + mFile.getMIMEType(type));
// Create a dictionary with size 40MiB, the zimlib uses this size while creating
xzReader = new SingleXZInputStream(this.mReader, 4194304);
final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
// System.out.println("Cluster Number: " + clusterNumber);
// Read the first offset
xzReader.read(buffer);
final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
// System.out.println("Blob Number: " + blobNumber);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities.toFourLittleEndianInteger(buffer);
final String url = this.mReader.readString();
// System.out.println("URL: " + url);
// The number of blobs
numberOfBlobs = firstOffset / 4;
String title = this.mReader.readString();
title = title.equals("") ? url : title;
// System.out.println("Title: " + title);
// Parameter data ignored
// The blobNumber has to be lesser than the numberOfBlobs
assert article.blob_number < numberOfBlobs;
if (article.blob_number == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (article.blob_number - 1) * 4;
Utilities.skipFully(xzReader, location);
xzReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
return new ArticleEntry(type, namespace, revision, clusterNumber,
blobNumber, url, title,
(position - this.mFile.getUrlPtrPos()) / 8);
xzReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
xzReader.read(buffer, 0, differenceOffset);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
return baos;
// case 5: zstd compressed (missing!)
default:
return null;
}
}
/**
 * Accessor for the {@code ZIMFile} stored in the {@code mFile} field.
 *
 * @return the current value of {@code mFile}
 */
public ZIMFile getZIMFile() {
    final ZIMFile zimFile = this.mFile;
    return zimFile;
}
}

@ -18,27 +18,49 @@
package org.openzim;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.openzim.ZIMReader.DirectoryEntry;
public class ZIMTest {
public static void main(final String[] args) {
if(args.length!=2) {
System.out.println("Usage: java ZIMTest <ZIM_FILE> <ARTICLE_NAME>");
if(args.length!=1) {
System.out.println("Usage: java ZIMTest <ZIM_FILE>");
System.exit(0);
}
// args[0] is the Zim File's location
final ZIMFile file = new ZIMFile(args[0]);
try {
// args[0] is the Zim File's location
final ZIMFile file = new ZIMFile(args[0]);
// Associate the Zim File with a Reader
final ZIMReader zReader = new ZIMReader(file);
// Associate the Zim File with a Reader
final ZIMReader zReader = new ZIMReader(file);
// print a list of urls and titles
final List<String> urls = zReader.getURLListByURL();
final List<String> titles = zReader.getURLListByTitle();
int c = Math.min(10, titles.size());
for (int i = 0; i < c; i++) {
System.out.println("URL by URL " + i + ": " + urls.get(i));
System.out.println("URL by Title " + i + ": " + titles.get(i));
DirectoryEntry entry = zReader.getDirectoryInfo(i);
System.out.println("URL by Pos " + i + ": " + entry.url);
System.out.println("Title by Pos " + i + ": " + entry.title);
System.out.println("Namespace by Pos " + i + ": " + entry.namespace);
}
try {
// args[1] is the name of the articles that is
// to be fetched
System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8"));
// print article c-1
DirectoryEntry directory_entry = zReader.getDirectoryInfo(c - 1);
ByteArrayOutputStream articleStream = zReader.getArticleData(directory_entry);
String article = articleStream == null ? "NULL" : articleStream.toString(StandardCharsets.UTF_8.name());
System.out.println(article);
} catch (final IOException e) {
e.printStackTrace();
}
}
}

Loading…
Cancel
Save