Fixed a large number of problems in the ZIM reader.

This library was not prepared for large data because it used int instead of long data types for file pointers. I had to modify the code base in a fundamental way: - proof-reading, - unclustering, - refactoring, - adapting the naming to https://wiki.openzim.org/wiki/ZIM_file_format, - changing the exception handling, - extending the header to more attributes as defined in the spec (bugfix for MIME type loading), - bugfix to long parsing (the bug prevented reading of large files). The code is still very inefficient and requires more attention. However, the format is very useful for YaCy, as there are numerous data sources for ZIM files.
1 year ago · c2b6b6e7b9
parent 5ba5fb5d23
commit c2b6b6e7b9
8 changed files with 357 additions and 573 deletions
--- a/source/org/openzim/ArticleEntry.java
+++ b/source/org/openzim/ArticleEntry.java
@ -1,46 +0,0 @@
 /*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java.  If not, see <http://www.gnu.org/licenses/>.
 */
 package org.openzim;
 /**
  * A {@link DirectoryEntry} that additionally stores a cluster number and a
  * blob number.
  */
 public class ArticleEntry extends DirectoryEntry {

    /** Cluster number handed to the constructor. */
    int clusterNumber;

    /** Blob number handed to the constructor. */
    int blobnumber;

    /**
     * Creates an article entry. All arguments except the cluster and blob
     * numbers are forwarded to the {@link DirectoryEntry} constructor.
     */
    public ArticleEntry(final int mimeType, final char namespace, final int revision,
            final int clusterNumber, final int blobNumber, final String url, final String title,
            final int urlListindex) {
        super(mimeType, namespace, revision, url, title, urlListindex);
        this.blobnumber = blobNumber;
        this.clusterNumber = clusterNumber;
    }

    /** @return the blob number of this entry */
    public int getBlobnumber() {
        return blobnumber;
    }

    /** @return the cluster number of this entry */
    public int getClusterNumber() {
        return clusterNumber;
    }
 }
--- a/source/org/openzim/DirectoryEntry.java
+++ b/source/org/openzim/DirectoryEntry.java
@ -1,69 +0,0 @@
 /*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java.  If not, see <http://www.gnu.org/licenses/>.
 */
 package org.openzim;
 /**
  * Common base of the directory entry types. It only stores the values
  * handed to its constructor and exposes them through getters.
  */
 public abstract class DirectoryEntry {

    /** Mime type value handed to the constructor. */
    int mimeType;

    /** Namespace character handed to the constructor. */
    char namespace;

    /** Revision value handed to the constructor. */
    int revision;

    /** URL string handed to the constructor. */
    String url;

    /** Title string handed to the constructor. */
    String title;

    /** Index value handed to the constructor as {@code index}. */
    int urlListindex;

    /**
     * Stores all given attributes unchanged.
     */
    public DirectoryEntry(final int mimeType, final char namespace, final int revision,
            final String url, final String title, final int index) {
        this.urlListindex = index;
        this.title = title;
        this.url = url;
        this.revision = revision;
        this.namespace = namespace;
        this.mimeType = mimeType;
    }

    /** @return the stored URL list index */
    public int getUrlListindex() {
        return urlListindex;
    }

    /** @return the stored title */
    public String getTitle() {
        return title;
    }

    /** @return the stored URL */
    public String getUrl() {
        return url;
    }

    /** @return the stored revision */
    public int getRevision() {
        return revision;
    }

    /** @return the stored namespace character */
    public char getNamespace() {
        return namespace;
    }

    /** @return the stored mime type value */
    public int getMimeType() {
        return mimeType;
    }
 }
--- a/source/org/openzim/RandomAcessFileZIMInputStream.java
+++ b/source/org/openzim/RandomAcessFileZIMInputStream.java
@ -28,6 +28,8 @@ import java.io.RandomAccessFile;
 * implementation, can be improved.
 *
 * @author Arunesh Mathur <aruneshmathur1990 at gmail.com>
 * @author Michael Christen
 *         bugfix to long parsing (return value was int)
 */
 public class RandomAcessFileZIMInputStream extends InputStream {
@ -41,7 +43,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
    }
    // TODO: Remove the parameter buffer
-    public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException {
+    public int readTwoLittleEndianBytesInt(final byte[] buffer) throws IOException {
        if (buffer.length < 2) {
            throw new OutOfMemoryError("buffer too small");
        } else {
@ -51,7 +53,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
    }
    // TODO: Remove the parameter buffer
-    public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException {
+    public int readFourLittleEndianBytesInt(final byte[] buffer) throws IOException {
        if (buffer.length < 4) {
            throw new OutOfMemoryError("buffer too small");
        } else {
@ -61,30 +63,30 @@ public class RandomAcessFileZIMInputStream extends InputStream {
    }
    // TODO: Remove the parameter buffer
-    public int readEightLittleEndianBytesValue(final byte[] buffer)
+    public long readEightLittleEndianBytesLong(final byte[] buffer)
            throws IOException {
        if (buffer.length < 8) {
            throw new OutOfMemoryError("buffer too small");
        } else {
            this.mRAFReader.read(buffer, 0, 8);
-            return Utilities.toEightLittleEndianInteger(buffer);
+            return Utilities.toEightLittleEndianLong(buffer);
        }
    }
    // TODO: Remove the parameter buffer
-    public int readSixteenLittleEndianBytesValue(final byte[] buffer)
+    public long readSixteenLittleEndianBytesLong(final byte[] buffer)
            throws IOException {
        if (buffer.length < 16) {
            throw new OutOfMemoryError("buffer too small");
        } else {
            this.mRAFReader.read(buffer, 0, 16);
-            return Utilities.toSixteenLittleEndianInteger(buffer);
+            return Utilities.toSixteenLittleEndianLong(buffer);
        }
    }
    // Reads characters from the current position into a String and stops when a
    // '\0' is encountered
-    public String readString() throws IOException {
+    public String readZeroTerminatedString() throws IOException {
        final StringBuffer sb = new StringBuffer();
        /*
         * int i; byte[] buffer = new byte[100]; while (true) {
@ -92,8 +94,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
         * (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i
         * != buffer.length) break; } return sb.toString();
         */
-        int b;
+        int b = this.mRAFReader.read();
        b = this.mRAFReader.read();
        while (b != '\0') {
            sb.append((char) b);
            b = this.mRAFReader.read();
--- a/source/org/openzim/RedirectEntry.java
+++ b/source/org/openzim/RedirectEntry.java
@ -1,37 +0,0 @@
 /*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java.  If not, see <http://www.gnu.org/licenses/>.
 */
 package org.openzim;
 /**
  * A {@link DirectoryEntry} that additionally stores a redirect index.
  */
 public class RedirectEntry extends DirectoryEntry {

    /** Redirect index handed to the constructor. */
    int redirectIndex;

    /**
     * Creates a redirect entry. All arguments except the redirect index are
     * forwarded to the {@link DirectoryEntry} constructor.
     */
    public RedirectEntry(final int mimeType, final char namespace, final int revision,
            final int redirectIndex, final String url, final String title, final int urlListindex) {
        super(mimeType, namespace, revision, url, title, urlListindex);
        this.redirectIndex = redirectIndex;
    }

    /** @return the redirect index of this entry */
    public int getRedirectIndex() {
        return redirectIndex;
    }
 }
--- a/source/org/openzim/Utilities.java
+++ b/source/org/openzim/Utilities.java
@ -22,18 +22,21 @@ package org.openzim;
 import java.io.IOException;
 import java.io.InputStream;
 /**
 * @author Arunesh Mathur
 *         A ZIM file implementation that stores the Header and the MIMETypeList
 *
 * @author Michael Christen
 *         int/long bugfix (did reading of long values with int variables, causing negative offsets)
 */
 public class Utilities {
    /**
     * Placeholder for a binary search that has not been written yet.
     *
     * @return always {@code -1}
     */
    // TODO: Write a binary search algorithm
    public static int binarySearch() {
        final int notImplemented = -1;
        return notImplemented;
    }
    public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
        if (buffer.length < 2) {
            throw new OutOfMemoryError("buffer too small");
        } else {
-            final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8));
+            final int result =
                      ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8));
            return result;
        }
    }
@ -42,39 +45,28 @@ public class Utilities {
        if (buffer.length < 4) {
            throw new OutOfMemoryError("buffer too small");
        } else {
-            final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
+            final int result =
                      ((buffer[0] & 0xFF)        | ((buffer[1] & 0xFF) << 8)
                    | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24));
            return result;
        }
    }
-    public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException {
+    public static long toEightLittleEndianLong(final byte[] buffer) throws IOException {
        if (buffer.length < 8) {
            throw new OutOfMemoryError("buffer too small");
        } else {
-            final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
+            final long result = // cast to long required otherwise this is again an integer
-                    | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)
+                      ((long)(buffer[0] & 0xFF)        | ((long)(buffer[1] & 0xFF) << 8)
-                    | ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40)
+                    | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24)
-                    | ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56));
+                    | ((long)(buffer[4] & 0xFF) << 32) | ((long)(buffer[5] & 0xFF) << 40)
                    | ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56));
            return result;
        }
    }
-    public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException {
+    public static long toSixteenLittleEndianLong(final byte[] buffer) throws IOException {
-        if (buffer.length < 16) {
+        return toEightLittleEndianLong(buffer); // there are no sixten bytes long values
            throw new OutOfMemoryError("buffer too small");
        } else {
            final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
                    | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)
                    | ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40)
                    | ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56)
                    | ((buffer[8] & 0xFF) << 64) | ((buffer[9] & 0xFF) << 72)
                    | ((buffer[10] & 0xFF) << 80) | ((buffer[11] & 0xFF) << 88)
                    | ((buffer[12] & 0xFF) << 96)
                    | ((buffer[13] & 0xFF) << 104)
                    | ((buffer[14] & 0xFF) << 112) | ((buffer[15] & 0xFF) << 120));
            return result;
        }
    }
    public static void skipFully(final InputStream stream, final long bytes) throws IOException {
--- a/source/org/openzim/ZIMFile.java
+++ b/source/org/openzim/ZIMFile.java
@ -20,46 +20,47 @@ package org.openzim;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.util.ArrayList;
 import java.util.List;
 /**
 * @author Arunesh Mathur
 *
 *         A ZIM file implementation that stores the Header and the MIMETypeList
 *
 * @author Michael Christen
 *         Proof-Reading, unclustering, refactoring,
 *         naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
 *         change of Exception handling, 
 *         extension to more attributes as defined in spec (bugfix for mime type loading)
 *         int/long bugfix (did reading of long values with int variables, causing negative offsets)
 */
 public class ZIMFile extends File {
    /**
     *
     */
    private static final long serialVersionUID = 1L;
-    private Header mHeader;
+    // Header values
-
+    public final int  header_magicNumber;
-    private List<String> mMIMETypeList; // Can be removed if not needed
+    public final int  header_majorVersion;
-
+    public final int  header_minorVersion;
-    public ZIMFile(final String path) {
+    public final long header_uuid;
    public final int  header_entryCount;
    public final int  header_clusterCount;
    public final long header_urlPtrPos;
    public final long header_titlePtrPos;
    public final long header_clusterPtrPos;
    public final long header_mimeListPos;
    public final int  header_mainPage;
    public final int  header_layoutPage;
    public final long header_checksumPos;
    // content cache
    public final List<String> mimeList;
    public ZIMFile(final String path) throws IOException {
        super(path);
        try {
            readHeader();
        } catch (final FileNotFoundException e) {
            e.printStackTrace();
        }
    }
    private void readHeader() throws FileNotFoundException {
        // Helpers
        int len = 0;
        StringBuffer mimeBuffer = null;
        // The byte[] that will help us in reading bytes out of the file
        final byte[] buffer = new byte[16];
        // Check whether the file exists
        if (!(this.exists())) {
            throw new FileNotFoundException(
@ -67,132 +68,45 @@ public class ZIMFile extends File {
        }
        // The reader that will be used to read contents from the file
-
+        final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this, "r"));
-        final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(
+        final byte[] buffer = new byte[16];
                new RandomAccessFile(this, "r"));
        // The ZIM file header
        this.mHeader = new Header();
        // Read the contents of the header
-        try {
+        this.header_magicNumber   = reader.readFourLittleEndianBytesInt(buffer);     //  4
-            this.mHeader.magicNumber = reader.readFourLittleEndianBytesValue(buffer);
+        this.header_majorVersion  = reader.readTwoLittleEndianBytesInt(buffer);      //  2
-            // System.out.println(mHeader.magicNumber);
+        this.header_minorVersion  = reader.readTwoLittleEndianBytesInt(buffer);      //  4
-
+        this.header_uuid          = reader.readSixteenLittleEndianBytesLong(buffer); // 16
-            this.mHeader.version = reader.readFourLittleEndianBytesValue(buffer);
+        this.header_entryCount    = reader.readFourLittleEndianBytesInt(buffer);     //  4
-            // System.out.println(mHeader.version);
+        this.header_clusterCount  = reader.readFourLittleEndianBytesInt(buffer);     //  4
-
+        this.header_urlPtrPos     = reader.readEightLittleEndianBytesLong(buffer);   //  8
-            this.mHeader.uuid = reader.readSixteenLittleEndianBytesValue(buffer);
+        this.header_titlePtrPos   = reader.readEightLittleEndianBytesLong(buffer);   //  8
-            // System.out.println(mHeader.uuid); reader.read(buffer, 0, 4);
+        this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(buffer);   //  8
-
+        this.header_mimeListPos   = reader.readEightLittleEndianBytesLong(buffer);   //  8
-            this.mHeader.articleCount = reader
+        this.header_mainPage      = reader.readFourLittleEndianBytesInt(buffer);     //  4
-                    .readFourLittleEndianBytesValue(buffer);
+        this.header_layoutPage    = reader.readFourLittleEndianBytesInt(buffer);     //  4
-            // System.out.println(mHeader.articleCount);
+        this.header_checksumPos   = reader.readEightLittleEndianBytesLong(buffer);   //  8 [FIX!]
-
+
-            this.mHeader.clusterCount = reader
+        // Initialise the MIMETypeList
-                    .readFourLittleEndianBytesValue(buffer);
+        int len = 0;
-            // System.out.println(mHeader.clusterCount);
+        StringBuffer mimeBuffer = null;
-
+        this.mimeList = new ArrayList<>();
-            this.mHeader.urlPtrPos = reader.readEightLittleEndianBytesValue(buffer);
+        while (true) {
-            // System.out.println(mHeader.urlPtrPos);
+            reader.read(buffer, 0, 1); // read only one byte to check if this is a zero
-
+            len = 0;
-            this.mHeader.titlePtrPos = reader
+            mimeBuffer = new StringBuffer();
-                    .readEightLittleEndianBytesValue(buffer);
+            while (buffer[0] != '\0') {
-            // System.out.println(mHeader.titlePtrPos);
+                mimeBuffer.append((char) buffer[0]);
            this.mHeader.clusterPtrPos = reader
                    .readEightLittleEndianBytesValue(buffer);
            // System.out.println(mHeader.clusterPtrPos);
            this.mHeader.mimeListPos = reader
                    .readEightLittleEndianBytesValue(buffer);
            // System.out.println(mHeader.mimeListPos);
            this.mHeader.mainPage = reader.readFourLittleEndianBytesValue(buffer);
            // System.out.println(mHeader.mainPage);
            this.mHeader.layoutPage = reader.readFourLittleEndianBytesValue(buffer);
            // System.out.println(mHeader.layoutPage);
            // Initialise the MIMETypeList
            this.mMIMETypeList = new ArrayList<>();
            while (true) {
                reader.read(buffer, 0, 1);
-                len = 0;
+                len++;
                mimeBuffer = new StringBuffer();
                while (buffer[0] != '\0') {
                    mimeBuffer.append((char) buffer[0]);
                    reader.read(buffer, 0, 1);
                    len++;
                }
                if (len == 0) {
                    break;
                }
                this.mMIMETypeList.add(mimeBuffer.toString());
                // System.out.println(mimeBuffer);
            }
-
+            if (len == 0) {
-        } catch (final Exception e) {
+                break;
-            e.printStackTrace();
+            }
            String mimeType = mimeBuffer.toString();
            System.out.println(mimeType);
            this.mimeList.add(mimeType);
        }
    }
    public int getVersion() {
        return this.mHeader.version;
    }
    public int getUuid() {
        return this.mHeader.uuid;
    }
    public int getArticleCount() {
        return this.mHeader.articleCount;
    }
    public int getClusterCount() {
        return this.mHeader.clusterCount;
    }
    public int getUrlPtrPos() {
        return this.mHeader.urlPtrPos;
    }
    public int getTitlePtrPos() {
        return this.mHeader.titlePtrPos;
    }
    public int getClusterPtrPos() {
        return this.mHeader.clusterPtrPos;
    }
    public String getMIMEType(final int mimeNumber) {
        return this.mMIMETypeList.get(mimeNumber);
    }
    public int getHeaderSize() {
        return this.mHeader.mimeListPos;
    }
    public int getMainPage() {
        return this.mHeader.mainPage;
    }
    public int getLayoutPage() {
        return this.mHeader.layoutPage;
    }
    public class Header {
        int magicNumber;
        int version;
        int uuid;
        int articleCount;
        int clusterCount;
        int urlPtrPos;
        int titlePtrPos;
        int clusterPtrPos;
        int mimeListPos;
        int mainPage;
        int layoutPage;
    }
 }
--- a/source/org/openzim/ZIMReader.java
+++ b/source/org/openzim/ZIMReader.java
@ -29,28 +29,91 @@ import org.tukaani.xz.SingleXZInputStream;
 /**
 * @author Arunesh Mathur
 *
 *         A ZIMReader that reads data from the ZIMFile
 *
 * @author Michael Christen
 *         Proof-Reading, unclustering, refactoring,
 *         naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
 *         change of Exception handling, 
 *         extension to more attributes as defined in spec (bugfix for mime type loading)
 *         bugfix to long parsing (prevented reading of large files)
 */
 public class ZIMReader {
    private final ZIMFile mFile;
    private RandomAcessFileZIMInputStream mReader;
    /**
     * Immutable base of the directory entry types. It stores the values
     * handed to its constructor in public final fields.
     */
    public static abstract class DirectoryEntry {

        /** Mime type value handed to the constructor. */
        public final int mimetype;

        /** Namespace character handed to the constructor. */
        public final char namespace;

        /** Cluster number handed to the constructor. */
        public final int cluster_number;

        /** URL string handed to the constructor. */
        public final String url;

        /** Title string handed to the constructor. */
        public final String title;

        /** Index value handed to the constructor as {@code index}. */
        public final long urlListindex;

        /**
         * Stores all given attributes unchanged.
         */
        public DirectoryEntry(
                final int mimeType, final char namespace,
                final int cluster_number,
                final String url, final String title,
                final long index) {
            this.urlListindex = index;
            this.title = title;
            this.url = url;
            this.cluster_number = cluster_number;
            this.namespace = namespace;
            this.mimetype = mimeType;
        }
    }
    /**
     * A {@link DirectoryEntry} that additionally stores a blob number.
     *
     * The cluster number is not declared again here: it is passed to the
     * super constructor and remains readable as {@code cluster_number}
     * through the inherited public field. A second field with the same name
     * would only hide the inherited one while holding the same value.
     */
    public static class ArticleEntry extends DirectoryEntry {

        /** Blob number handed to the constructor. */
        public final int blob_number;

        /**
         * Creates an article entry.
         *
         * @param mimeType       mime type value, forwarded to the base class
         * @param namespace      namespace character, forwarded to the base class
         * @param cluster_number cluster number, forwarded to the base class
         * @param blob_number    blob number stored in this class
         * @param url            URL string, forwarded to the base class
         * @param title          title string, forwarded to the base class
         * @param urlListindex   index value, forwarded to the base class
         */
        public ArticleEntry(
                final int mimeType, final char namespace,
                final int cluster_number, final int blob_number,
                final String url, final String title,
                final long urlListindex) {
            super(mimeType, namespace, cluster_number, url, title, urlListindex);
            this.blob_number = blob_number;
        }
    }
    /**
     * A {@link DirectoryEntry} that additionally stores a redirect index.
     * The base class always receives {@code 0} as its cluster number.
     */
    public static class RedirectEntry extends DirectoryEntry {

        /** Redirect index handed to the constructor. */
        public final long redirect_index;

        /**
         * Creates a redirect entry. All arguments except the redirect index
         * are forwarded to the {@link DirectoryEntry} constructor.
         */
        public RedirectEntry(final int mimeType, final char namespace,
                final long redirect_index, final String url, final String title,
                final long urlListindex) {
            super(mimeType, namespace, 0, url, title, urlListindex);
            this.redirect_index = redirect_index;
        }
    }
    public ZIMReader(final ZIMFile file) {
        this.mFile = file;
        try {
-            this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(
+            this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this.mFile, "r"));
                    this.mFile, "r"));
        } catch (final FileNotFoundException e) {
            e.printStackTrace();
        }
    }
    /**
     * @return the {@link ZIMFile} this reader was constructed with
     */
    public ZIMFile getZIMFile() {
        final ZIMFile file = this.mFile;
        return file;
    }
    // get a URL list that is sorted by the urls
    public List<String> getURLListByURL() throws IOException {
-        int i = 0, pos, mimeType;
+        int i = 0, mimeType;
        final byte[] buffer = new byte[8];
@ -58,12 +121,12 @@ public class ZIMReader {
        final ArrayList<String> returnList = new ArrayList<>();
        // Move to the spot where URL's are listed
-        this.mReader.seek(this.mFile.getUrlPtrPos());
+        this.mReader.seek(this.mFile.header_urlPtrPos);
-        for (i = 0; i < this.mFile.getArticleCount(); i++) {
+        for (i = 0; i < this.mFile.header_entryCount; i++) {
            // The position of URL i
-            pos = this.mReader.readEightLittleEndianBytesValue(buffer);
+            long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
            // Mark the current position that we need to return to
            this.mReader.mark();
@ -72,14 +135,14 @@ public class ZIMReader {
            this.mReader.seek(pos);
            // Article or Redirect entry?
-            mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
+            mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
            if (mimeType == 65535) {
                this.mReader.seek(pos + 12);
-                returnList.add(this.mReader.readString());
+                returnList.add(this.mReader.readZeroTerminatedString());
            } else {
                this.mReader.seek(pos + 16);
-                returnList.add(this.mReader.readString());
+                returnList.add(this.mReader.readZeroTerminatedString());
            }
            this.mReader.reset();
@ -88,9 +151,10 @@ public class ZIMReader {
        return returnList;
    }
    // get a URL list that is sorted by the entry titles
    public List<String> getURLListByTitle() throws IOException {
-        int i = 0, pos, mimeType, articleNumber, urlPtrPos;
+        int i = 0, mimeType, articleNumber;
        final byte[] buffer = new byte[8];
@ -98,35 +162,35 @@ public class ZIMReader {
        final ArrayList<String> returnList = new ArrayList<>();
        // Get the UrlPtrPos or one time storage
-        urlPtrPos = this.mFile.getUrlPtrPos();
+        long urlPtrPos = this.mFile.header_urlPtrPos;
        // Move to the spot where URL's are listed
-        this.mReader.seek(this.mFile.getTitlePtrPos());
+        this.mReader.seek(this.mFile.header_titlePtrPos);
-        for (i = 0; i < this.mFile.getArticleCount(); i++) {
+        for (i = 0; i < this.mFile.header_entryCount; i++) {
            // The articleNumber of the position of URL i
-            articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
+            articleNumber = this.mReader.readFourLittleEndianBytesInt(buffer);
            // Mark the current position that we need to return to
            this.mReader.mark();
-            this.mReader.seek(urlPtrPos + (8 * (articleNumber)));
+            this.mReader.seek(urlPtrPos + (8L * (articleNumber)));
            // The position of URL i
-            pos = this.mReader.readEightLittleEndianBytesValue(buffer);
+            long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
            this.mReader.seek(pos);
            // Article or Redirect entry?
-            mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
+            mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
            if (mimeType == 65535) {
                this.mReader.seek(pos + 12);
-                final String url = this.mReader.readString();
+                final String url = this.mReader.readZeroTerminatedString();
                returnList.add(url);
            } else {
                this.mReader.seek(pos + 16);
-                final String url = this.mReader.readString();
+                final String url = this.mReader.readZeroTerminatedString();
                returnList.add(url);
            }
@ -137,14 +201,69 @@ public class ZIMReader {
        return returnList;
    }
    /**
     * Reads the directory entry that one slot of the Title Pointer List refers to.
     *
     * The slot holds a four byte index into the URL Pointer List; that list
     * holds the eight byte file position of the directory entry itself.
     *
     * @param position seek position of the slot in the Title Pointer List
     * @return a {@link RedirectEntry} if the entry's type field is 0xffff,
     *         otherwise an {@link ArticleEntry}
     * @throws IOException if seeking or reading in the underlying file fails
     */
    private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException {
        // Scratch buffer shared by all fixed-width reads below
        final byte[] buffer = new byte[8];

        // Title Pointer List slot -> index into the URL Pointer List
        this.mReader.seek(position);
        final int urlListIndex = this.mReader.readFourLittleEndianBytesInt(buffer);

        // URL Pointer List slot -> file position of the directory entry.
        // 8L forces long arithmetic; an int product would overflow for large indexes.
        this.mReader.seek(this.mFile.header_urlPtrPos + 8L * urlListIndex);
        final long entryPosition = this.mReader.readEightLittleEndianBytesLong(buffer);
        this.mReader.seek(entryPosition);

        // Fixed-size head of the directory entry
        final int type = this.mReader.readTwoLittleEndianBytesInt(buffer); // 2, 0xffff for redirect
        this.mReader.read();                                               // 1, ignore, parameter length not used
        final char namespace = (char) this.mReader.read();                 // 1
        this.mReader.readFourLittleEndianBytesInt(buffer);                 // 4, ignore, revision not used

        // Article or Redirect entry
        if (type == 65535) {
            final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
            final String url = this.mReader.readZeroTerminatedString();                   // zero terminated
            final String storedTitle = this.mReader.readZeroTerminatedString();           // zero terminated
            // an empty title falls back to the url
            final String title = storedTitle.equals("") ? url : storedTitle;
            // The entry's index in the URL list is the value read from the title slot;
            // it cannot be derived from a Title Pointer List position.
            return new RedirectEntry(type, namespace, redirectIndex,
                    url, title, urlListIndex);
        }

        final int cluster_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
        final int blob_number = this.mReader.readFourLittleEndianBytesInt(buffer);    // 4
        final String url = this.mReader.readZeroTerminatedString();                   // zero terminated
        final String storedTitle = this.mReader.readZeroTerminatedString();           // zero terminated
        // an empty title falls back to the url
        final String title = storedTitle.equals("") ? url : storedTitle;
        return new ArticleEntry(
                type, namespace,
                cluster_number, blob_number,
                url, title, urlListIndex);
    }
    /**
     * Returns the directory entry for the given slot number of the Title Pointer List.
     *
     * @param entryNumber zero-based slot number; must be smaller than the entry count of the header
     * @return the directory entry referenced by that slot
     * @throws IOException if {@code entryNumber} is out of range or reading the file fails
     */
    public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException {
        if (entryNumber < 0) throw new IOException("entryNumber must not be negative");
        if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount");
        // every slot is 4 bytes wide; 4L forces long arithmetic so the offset cannot overflow an int
        return getDirectoryInfoAtTitlePosition(this.mFile.header_titlePtrPos + 4L * entryNumber);
    }
    // Gives the minimum required information needed for the given articleName
-    public DirectoryEntry getDirectoryInfo(String articleName, final char namespace)
+    // This makes a binary search on the article name entry list.
-            throws IOException {
+    public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException {
        DirectoryEntry entry;
        String cmpStr;
-        final int numberOfArticles = this.mFile.getArticleCount();
+        final int numberOfArticles = this.mFile.header_entryCount;
-        int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid;
+        long beg = this.mFile.header_titlePtrPos, end = beg + (numberOfArticles * 4), mid;
        articleName = namespace + "/" + articleName;
@ -154,7 +273,7 @@ public class ZIMReader {
            if (entry == null) {
                return null;
            }
-            cmpStr = entry.getNamespace() + "/" + entry.getUrl();
+            cmpStr = entry.namespace + "/" + entry.url;
            if (articleName.compareTo(cmpStr) < 0) {
                end = mid - 4;
@ -167,242 +286,130 @@ public class ZIMReader {
        }
        return null;
    }
-    public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException {
+    public ByteArrayOutputStream getArticleData(final DirectoryEntry directoryInfo) throws IOException {
        // search in the cache first, if not found, then call getDirectoryInfo(articleName)
        byte[] buffer = new byte[8];
        final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace);
        if (mainEntry != null) {
            // Check what kind of an entry was mainEnrty
            if (mainEntry.getClass() == ArticleEntry.class) {
                // Cast to ArticleEntry
                final ArticleEntry article = (ArticleEntry) mainEntry;
                // Get the cluster and blob numbers from the article
                final int clusterNumber = article.getClusterNumber();
                final int blobNumber = article.getBlobnumber();
                // Move to the cluster entry in the clusterPtrPos
                this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8);
                // Read the location of the cluster
                final int clusterPos = this.mReader
                        .readEightLittleEndianBytesValue(buffer);
                // Move to the cluster
                this.mReader.seek(clusterPos);
-                // Read the first byte, for compression information
+        // fail fast
-                final int compressionType = this.mReader.read();
+        if (directoryInfo == null) return null;
        if (directoryInfo.getClass() != ArticleEntry.class) return null;
-                // Reference declaration
+        // This is now an article, so thus we can cast to ArticleEntry
-                SingleXZInputStream xzReader = null;
+        final ArticleEntry article = (ArticleEntry) directoryInfo;
                int firstOffset, numberOfBlobs, offset1,
                offset2,
                location,
                differenceOffset;
-                ByteArrayOutputStream baos;
+        // Move to the cluster entry in the clusterPtrPos
        this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8);
-                // Check the compression type that was read
+        // Read the location of the cluster
-                switch (compressionType) {
+        byte[] buffer = new byte[8];
-
+        final long clusterPos = this.mReader.readEightLittleEndianBytesLong(buffer);
                // TODO: Read uncompressed data directly
                case 0:
                case 1:
                    // Read the first 4 bytes to find out the number of artciles
                    buffer = new byte[4];
                    // Create a dictionary with size 40MiB, the zimlib uses this
                    // size while creating
                    // Read the first offset
                    this.mReader.read(buffer);
                    // The first four bytes are the offset of the zeroth blob
                    firstOffset = Utilities
                            .toFourLittleEndianInteger(buffer);
                    // The number of blobs
                    numberOfBlobs = firstOffset / 4;
                    // The blobNumber has to be lesser than the numberOfBlobs
                    assert blobNumber < numberOfBlobs;
                    if (blobNumber == 0) {
                        // The first offset is what we read earlier
                        offset1 = firstOffset;
                    } else {
                        location = (blobNumber - 1) * 4;
                        Utilities.skipFully(this.mReader, location);
                        this.mReader.read(buffer);
                        offset1 = Utilities.toFourLittleEndianInteger(buffer);
                    }
                    this.mReader.read(buffer);
                    offset2 = Utilities.toFourLittleEndianInteger(buffer);
                    differenceOffset = offset2 - offset1;
                    buffer = new byte[differenceOffset];
                    Utilities.skipFully(this.mReader,
                            (offset1 - 4 * (blobNumber + 2)));
                    this.mReader.read(buffer, 0, differenceOffset);
                    baos = new ByteArrayOutputStream();
                    baos.write(buffer, 0, differenceOffset);
                    return baos;
                // LZMA2 compressed data
                case 4:
                    // Read the first 4 bytes to find out the number of artciles
                    buffer = new byte[4];
                    // Create a dictionary with size 40MiB, the zimlib uses this
                    // size while creating
                    xzReader = new SingleXZInputStream(this.mReader, 4194304);
                    // Read the first offset
                    xzReader.read(buffer);
-                    // The first four bytes are the offset of the zeroth blob
+        // Move to the cluster
-                    firstOffset = Utilities
+        this.mReader.seek(clusterPos);
                            .toFourLittleEndianInteger(buffer);
-                    // The number of blobs
+        // Read the first byte, for compression information
-                    numberOfBlobs = firstOffset / 4;
+        final int compressionType = this.mReader.read();
-                    // The blobNumber has to be lesser than the numberOfBlobs
+        // Reference declaration
-                    assert blobNumber < numberOfBlobs;
+        SingleXZInputStream xzReader = null;
        int firstOffset, numberOfBlobs, offset1,
        offset2,
        location,
        differenceOffset;
-                    if(blobNumber == 0) {
+        ByteArrayOutputStream baos;
                        // The first offset is what we read earlier
                        offset1 = firstOffset;
                    } else {
-                        location = (blobNumber - 1) * 4;
+        // Check the compression type that was read
-                        Utilities.skipFully(xzReader, location);
+        switch (compressionType) {
                        xzReader.read(buffer);
                        offset1 = Utilities.toFourLittleEndianInteger(buffer);
                    }
-                    xzReader.read(buffer);
+        // TODO: Read uncompressed data directly
-                    offset2 = Utilities.toFourLittleEndianInteger(buffer);
+        case 0:
        case 1:
-                    differenceOffset = offset2 - offset1;
+            // Read the first 4 bytes to find out the number of artciles
-                    buffer = new byte[differenceOffset];
+            buffer = new byte[4];
-                    Utilities.skipFully(xzReader,
+            // Create a dictionary with size 40MiB, the zimlib uses this
-                            (offset1 - 4 * (blobNumber + 2)));
+            // size while creating
-                    xzReader.read(buffer, 0, differenceOffset);
+            // Read the first offset
            this.mReader.read(buffer);
-                    baos = new ByteArrayOutputStream();
+            // The first four bytes are the offset of the zeroth blob
-                    baos.write(buffer, 0, differenceOffset);
+            firstOffset = Utilities.toFourLittleEndianInteger(buffer);
-                    return baos;
+            // The number of blobs
            numberOfBlobs = firstOffset / 4;
-                }
+            // The blobNumber has to be lesser than the numberOfBlobs
            assert article.blob_number < numberOfBlobs;
            if (article.blob_number == 0) {
                // The first offset is what we read earlier
                offset1 = firstOffset;
            } else {
                location = (article.blob_number - 1) * 4;
                Utilities.skipFully(this.mReader, location);
                this.mReader.read(buffer);
                offset1 = Utilities.toFourLittleEndianInteger(buffer);
            }
        }
        return null;
    }
    public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position)
            throws IOException {
        // Helpers
        int pos;
        final byte[] buffer = new byte[8];
        // At the appropriate position in the titlePtrPos
        this.mReader.seek(position);
        // Get value of article at index
        pos = this.mReader.readFourLittleEndianBytesValue(buffer);
        // Move to the position in urlPtrPos
        this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos);
        // Get value of article in urlPtrPos
        pos = this.mReader.readEightLittleEndianBytesValue(buffer);
        // Go to the location of the directory entry
        this.mReader.seek(pos);
        final int type = this.mReader.readTwoLittleEndianBytesValue(buffer);
        // Ignore the parameter length
        this.mReader.read();
        final char namespace = (char) this.mReader.read();
        // System.out.println("Namepsace: " + namespace);
-        final int revision = this.mReader.readFourLittleEndianBytesValue(buffer);
+            this.mReader.read(buffer);
-        // System.out.println("Revision: " + revision);
+            offset2 = Utilities.toFourLittleEndianInteger(buffer);
            differenceOffset = offset2 - offset1;
            buffer = new byte[differenceOffset];
            Utilities.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
            this.mReader.read(buffer, 0, differenceOffset);
            baos = new ByteArrayOutputStream();
            baos.write(buffer, 0, differenceOffset);
-        // TODO: Remove redundant if condition code
+            return baos;
        // Article or Redirect entry
        if (type == 65535) {
            // System.out.println("MIMEType: " + type);
            final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer);
            // System.out.println("RedirectIndex: " + redirectIndex);
-            final String url = this.mReader.readString();
+        // 2 for zlib and 3 for bzip2 (removed)
            // System.out.println("URL: " + url);
-            String title = this.mReader.readString();
+        // LZMA2 compressed data
-            title = title.equals("") ? url : title;
+        case 4:
            // System.out.println("Title: " + title);
            return new RedirectEntry(type, namespace, revision, redirectIndex,
                    url, title, (position - this.mFile.getUrlPtrPos()) / 8);
-        } else {
+            // Read the first 4 bytes to find out the number of artciles
            buffer = new byte[4];
-            // System.out.println("MIMEType: " + mFile.getMIMEType(type));
+            // Create a dictionary with size 40MiB, the zimlib uses this size while creating
            xzReader = new SingleXZInputStream(this.mReader, 4194304);
-            final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
+            // Read the first offset
-            // System.out.println("Cluster Number: " + clusterNumber);
+            xzReader.read(buffer);
-            final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
+            // The first four bytes are the offset of the zeroth blob
-            // System.out.println("Blob Number: " + blobNumber);
+            firstOffset = Utilities.toFourLittleEndianInteger(buffer);
-            final String url = this.mReader.readString();
+            // The number of blobs
-            // System.out.println("URL: " + url);
+            numberOfBlobs = firstOffset / 4;
-            String title = this.mReader.readString();
+            // The blobNumber has to be lesser than the numberOfBlobs
-            title = title.equals("") ? url : title;
+            assert article.blob_number < numberOfBlobs;
-            // System.out.println("Title: " + title);
+            if (article.blob_number == 0) {
-
+                // The first offset is what we read earlier
-            // Parameter data ignored
+                offset1 = firstOffset;
            } else {
                location = (article.blob_number - 1) * 4;
                Utilities.skipFully(xzReader, location);
                xzReader.read(buffer);
                offset1 = Utilities.toFourLittleEndianInteger(buffer);
            }
-            return new ArticleEntry(type, namespace, revision, clusterNumber,
+            xzReader.read(buffer);
-                    blobNumber, url, title,
+            offset2 = Utilities.toFourLittleEndianInteger(buffer);
-                    (position - this.mFile.getUrlPtrPos()) / 8);
+            differenceOffset = offset2 - offset1;
            buffer = new byte[differenceOffset];
            Utilities.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
            xzReader.read(buffer, 0, differenceOffset);
            baos = new ByteArrayOutputStream();
            baos.write(buffer, 0, differenceOffset);
            return baos;
        // case 5: zstd compressed (missing!)
        default:
            return null;
        }
    }
    /**
     * Accessor for the ZIM file this reader operates on.
     *
     * @return the ZIMFile instance held in the mFile field
     */
    public ZIMFile getZIMFile() {
        return mFile;
    }
 }
--- a/source/org/openzim/ZIMTest.java
+++ b/source/org/openzim/ZIMTest.java
@ -18,27 +18,49 @@
 package org.openzim;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
 import org.openzim.ZIMReader.DirectoryEntry;
 public class ZIMTest {
    public static void main(final String[] args) {
-        if(args.length!=2) {
+        if(args.length!=1) {
-            System.out.println("Usage: java ZIMTest <ZIM_FILE> <ARTICLE_NAME>");
+            System.out.println("Usage: java ZIMTest <ZIM_FILE>");
            System.exit(0);
        }
-        // args[0] is the Zim File's location
+        try {
-        final ZIMFile file = new ZIMFile(args[0]);
+            // args[0] is the Zim File's location
            final ZIMFile file = new ZIMFile(args[0]);
            // Associate the Zim File with a Reader
            final ZIMReader zReader = new ZIMReader(file);
-        // Associate the Zim File with a Reader
+            // print a list of urls and titles
-        final ZIMReader zReader = new ZIMReader(file);
+            final List<String> urls = zReader.getURLListByURL();
            final List<String> titles = zReader.getURLListByTitle();
            int c = Math.min(10, titles.size());
            for (int i = 0; i < c; i++) {
                System.out.println("URL by URL   " + i + ": " + urls.get(i));
                System.out.println("URL by Title " + i + ": " + titles.get(i));
                DirectoryEntry entry = zReader.getDirectoryInfo(i);
                System.out.println("URL   by Pos " + i + ": " + entry.url);
                System.out.println("Title by Pos " + i + ": " + entry.title);
                System.out.println("Namespace by Pos " + i + ": " + entry.namespace);
            }
-        try {
+            // print article c-1
-            // args[1] is the name of the articles that is
+            DirectoryEntry directory_entry = zReader.getDirectoryInfo(c - 1);
-             // to be fetched
+            ByteArrayOutputStream articleStream = zReader.getArticleData(directory_entry);
-            System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8"));
+            String article = articleStream == null ? "NULL" : articleStream.toString(StandardCharsets.UTF_8.name());
            System.out.println(article);
        } catch (final IOException e) {
            e.printStackTrace();
        }
    }
 }