diff --git a/source/org/openzim/ArticleEntry.java b/source/org/openzim/ArticleEntry.java
deleted file mode 100644
index 7eeae2e06..000000000
--- a/source/org/openzim/ArticleEntry.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (C) 2011 Arunesh Mathur
- *
- * This file is a part of zimreader-java.
- *
- * zimreader-java is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License version 3.0 as
- * published by the Free Software Foundation.
- *
- * zimreader-java is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-package org.openzim;
-
-public class ArticleEntry extends DirectoryEntry {
-
- int clusterNumber;
-
- int blobnumber;
-
- public ArticleEntry(final int mimeType, final char namespace, final int revision,
- final int clusterNumber, final int blobNumber, final String url, final String title,
- final int urlListindex) {
-
- super(mimeType, namespace, revision, url, title, urlListindex);
-
- this.clusterNumber = clusterNumber;
- this.blobnumber = blobNumber;
- }
-
- public int getClusterNumber() {
- return this.clusterNumber;
- }
-
- public int getBlobnumber() {
- return this.blobnumber;
- }
-
-}
diff --git a/source/org/openzim/DirectoryEntry.java b/source/org/openzim/DirectoryEntry.java
deleted file mode 100644
index 92c52de41..000000000
--- a/source/org/openzim/DirectoryEntry.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (C) 2011 Arunesh Mathur
- *
- * This file is a part of zimreader-java.
- *
- * zimreader-java is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License version 3.0 as
- * published by the Free Software Foundation.
- *
- * zimreader-java is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
- */
-
-package org.openzim;
-
-public abstract class DirectoryEntry {
-
- int mimeType;
-
- char namespace;
-
- int revision;
-
- String url;
-
- String title;
-
- int urlListindex;
-
- public DirectoryEntry(final int mimeType, final char namespace, final int revision,
- final String url, final String title, final int index) {
- this.mimeType = mimeType;
- this.namespace = namespace;
- this.revision = revision;
- this.url = url;
- this.title = title;
- this.urlListindex = index;
- }
-
- public int getMimeType() {
- return this.mimeType;
- }
-
- public char getNamespace() {
- return this.namespace;
- }
-
- public int getRevision() {
- return this.revision;
- }
-
- public String getUrl() {
- return this.url;
- }
-
- public String getTitle() {
- return this.title;
- }
-
- public int getUrlListindex() {
- return this.urlListindex;
- }
-
-}
diff --git a/source/org/openzim/RandomAcessFileZIMInputStream.java b/source/org/openzim/RandomAcessFileZIMInputStream.java
index 006dd4498..cb6cdb093 100644
--- a/source/org/openzim/RandomAcessFileZIMInputStream.java
+++ b/source/org/openzim/RandomAcessFileZIMInputStream.java
@@ -28,6 +28,8 @@ import java.io.RandomAccessFile;
* implementation, can be improved.
*
* @author Arunesh Mathur
+ * @author Michael Christen
+ * bugfix to long parsing (return value was int)
*/
public class RandomAcessFileZIMInputStream extends InputStream {
@@ -41,7 +43,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
}
// TODO: Remove the parameter buffer
- public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException {
+ public int readTwoLittleEndianBytesInt(final byte[] buffer) throws IOException {
if (buffer.length < 2) {
throw new OutOfMemoryError("buffer too small");
} else {
@@ -51,7 +53,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
}
// TODO: Remove the parameter buffer
- public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException {
+ public int readFourLittleEndianBytesInt(final byte[] buffer) throws IOException {
if (buffer.length < 4) {
throw new OutOfMemoryError("buffer too small");
} else {
@@ -61,30 +63,30 @@ public class RandomAcessFileZIMInputStream extends InputStream {
}
// TODO: Remove the parameter buffer
- public int readEightLittleEndianBytesValue(final byte[] buffer)
+ public long readEightLittleEndianBytesLong(final byte[] buffer)
throws IOException {
if (buffer.length < 8) {
throw new OutOfMemoryError("buffer too small");
} else {
this.mRAFReader.read(buffer, 0, 8);
- return Utilities.toEightLittleEndianInteger(buffer);
+ return Utilities.toEightLittleEndianLong(buffer);
}
}
// TODO: Remove the parameter buffer
- public int readSixteenLittleEndianBytesValue(final byte[] buffer)
+ public long readSixteenLittleEndianBytesLong(final byte[] buffer)
throws IOException {
if (buffer.length < 16) {
throw new OutOfMemoryError("buffer too small");
} else {
this.mRAFReader.read(buffer, 0, 16);
- return Utilities.toSixteenLittleEndianInteger(buffer);
+ return Utilities.toSixteenLittleEndianLong(buffer);
}
}
// Reads characters from the current position into a String and stops when a
// '\0' is encountered
- public String readString() throws IOException {
+ public String readZeroTerminatedString() throws IOException {
final StringBuffer sb = new StringBuffer();
/*
* int i; byte[] buffer = new byte[100]; while (true) {
@@ -92,8 +94,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
* (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i
* != buffer.length) break; } return sb.toString();
*/
- int b;
- b = this.mRAFReader.read();
+ int b = this.mRAFReader.read();
while (b != '\0') {
sb.append((char) b);
b = this.mRAFReader.read();
diff --git a/source/org/openzim/RedirectEntry.java b/source/org/openzim/RedirectEntry.java
deleted file mode 100644
index fdbe3fba1..000000000
--- a/source/org/openzim/RedirectEntry.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2011 Arunesh Mathur
- *
- * This file is a part of zimreader-java.
- *
- * zimreader-java is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License version 3.0 as
- * published by the Free Software Foundation.
- *
- * zimreader-java is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
- */
-
-package org.openzim;
-
-public class RedirectEntry extends DirectoryEntry {
-
- int redirectIndex;
-
- public RedirectEntry(final int mimeType, final char namespace, final int revision,
- final int redirectIndex, final String url, final String title, final int urlListindex) {
-
- super(mimeType, namespace, revision, url, title, urlListindex);
-
- this.redirectIndex = redirectIndex;
- }
-
- public int getRedirectIndex() {
- return this.redirectIndex;
- }
-
-}
diff --git a/source/org/openzim/Utilities.java b/source/org/openzim/Utilities.java
index 0de337c9c..28572839b 100644
--- a/source/org/openzim/Utilities.java
+++ b/source/org/openzim/Utilities.java
@@ -22,18 +22,21 @@ package org.openzim;
import java.io.IOException;
import java.io.InputStream;
+/**
+ * @author Arunesh Mathur
+ * Static helpers that decode little-endian byte sequences into int/long values, plus a helper to skip bytes of a stream
+ *
+ * @author Michael Christen
+ * int/long bugfix (did reading of long values with int variables, causing negative offsets)
+ */
public class Utilities {
- // TODO: Write a binary search algorithm
- public static int binarySearch() {
- return -1;
- }
-
public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
if (buffer.length < 2) {
throw new OutOfMemoryError("buffer too small");
} else {
- final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8));
+ final int result =
+ ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8));
return result;
}
}
@@ -42,39 +45,28 @@ public class Utilities {
if (buffer.length < 4) {
throw new OutOfMemoryError("buffer too small");
} else {
- final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
+ final int result =
+ ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24));
return result;
}
}
- public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException {
+ public static long toEightLittleEndianLong(final byte[] buffer) throws IOException {
if (buffer.length < 8) {
throw new OutOfMemoryError("buffer too small");
} else {
- final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
- | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)
- | ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40)
- | ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56));
+ final long result = // cast to long required otherwise this is again an integer
+ ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8)
+ | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24)
+ | ((long)(buffer[4] & 0xFF) << 32) | ((long)(buffer[5] & 0xFF) << 40)
+ | ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56));
return result;
}
}
- public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException {
- if (buffer.length < 16) {
- throw new OutOfMemoryError("buffer too small");
- } else {
- final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
- | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)
- | ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40)
- | ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56)
- | ((buffer[8] & 0xFF) << 64) | ((buffer[9] & 0xFF) << 72)
- | ((buffer[10] & 0xFF) << 80) | ((buffer[11] & 0xFF) << 88)
- | ((buffer[12] & 0xFF) << 96)
- | ((buffer[13] & 0xFF) << 104)
- | ((buffer[14] & 0xFF) << 112) | ((buffer[15] & 0xFF) << 120));
- return result;
- }
+ /**
+ * Decodes the first eight bytes of the buffer as a little-endian long by
+ * delegating to toEightLittleEndianLong; bytes beyond the eighth are not
+ * looked at, so a 16-byte field is reduced to its first eight bytes.
+ *
+ * @param buffer the bytes to decode, least significant byte first
+ * @return the value of the first eight bytes as computed by toEightLittleEndianLong
+ * @throws IOException declared for signature compatibility with the sibling decoders
+ */
+ public static long toSixteenLittleEndianLong(final byte[] buffer) throws IOException {
+ return toEightLittleEndianLong(buffer); // there are no sixteen bytes long values
}
public static void skipFully(final InputStream stream, final long bytes) throws IOException {
diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java
index c86119be1..56e84ad17 100644
--- a/source/org/openzim/ZIMFile.java
+++ b/source/org/openzim/ZIMFile.java
@@ -20,46 +20,47 @@ package org.openzim;
import java.io.File;
import java.io.FileNotFoundException;
+import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;
/**
* @author Arunesh Mathur
- *
* A ZIM file implementation that stores the Header and the MIMETypeList
*
+ * @author Michael Christen
+ * Proof-Reading, unclustering, refactoring,
+ * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
+ * change of Exception handling,
+ * extension to more attributes as defined in spec (bugfix for mime type loading)
+ * int/long bugfix (did reading of long values with int variables, causing negative offsets)
*/
public class ZIMFile extends File {
- /**
- *
- */
private static final long serialVersionUID = 1L;
- private Header mHeader;
-
- private List mMIMETypeList; // Can be removed if not needed
-
- public ZIMFile(final String path) {
+ // Header values
+ public final int header_magicNumber;
+ public final int header_majorVersion;
+ public final int header_minorVersion;
+ public final long header_uuid;
+ public final int header_entryCount;
+ public final int header_clusterCount;
+ public final long header_urlPtrPos;
+ public final long header_titlePtrPos;
+ public final long header_clusterPtrPos;
+ public final long header_mimeListPos;
+ public final int header_mainPage;
+ public final int header_layoutPage;
+ public final long header_checksumPos;
+
+ // content cache
+ public final List mimeList;
+
+ public ZIMFile(final String path) throws IOException {
super(path);
- try {
- readHeader();
- } catch (final FileNotFoundException e) {
- e.printStackTrace();
- }
- }
-
- private void readHeader() throws FileNotFoundException {
-
- // Helpers
- int len = 0;
- StringBuffer mimeBuffer = null;
-
- // The byte[] that will help us in reading bytes out of the file
- final byte[] buffer = new byte[16];
-
// Check whether the file exists
if (!(this.exists())) {
throw new FileNotFoundException(
@@ -67,132 +68,45 @@ public class ZIMFile extends File {
}
// The reader that will be used to read contents from the file
-
- final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(
- new RandomAccessFile(this, "r"));
-
- // The ZIM file header
- this.mHeader = new Header();
+ final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this, "r"));
+ final byte[] buffer = new byte[16];
// Read the contents of the header
- try {
- this.mHeader.magicNumber = reader.readFourLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.magicNumber);
-
- this.mHeader.version = reader.readFourLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.version);
-
- this.mHeader.uuid = reader.readSixteenLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.uuid); reader.read(buffer, 0, 4);
-
- this.mHeader.articleCount = reader
- .readFourLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.articleCount);
-
- this.mHeader.clusterCount = reader
- .readFourLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.clusterCount);
-
- this.mHeader.urlPtrPos = reader.readEightLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.urlPtrPos);
-
- this.mHeader.titlePtrPos = reader
- .readEightLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.titlePtrPos);
-
- this.mHeader.clusterPtrPos = reader
- .readEightLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.clusterPtrPos);
-
- this.mHeader.mimeListPos = reader
- .readEightLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.mimeListPos);
-
- this.mHeader.mainPage = reader.readFourLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.mainPage);
-
- this.mHeader.layoutPage = reader.readFourLittleEndianBytesValue(buffer);
- // System.out.println(mHeader.layoutPage);
-
- // Initialise the MIMETypeList
- this.mMIMETypeList = new ArrayList<>();
- while (true) {
+ this.header_magicNumber = reader.readFourLittleEndianBytesInt(buffer); // 4
+ this.header_majorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2
+ this.header_minorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2
+ this.header_uuid = reader.readSixteenLittleEndianBytesLong(buffer); // 16
+ this.header_entryCount = reader.readFourLittleEndianBytesInt(buffer); // 4
+ this.header_clusterCount = reader.readFourLittleEndianBytesInt(buffer); // 4
+ this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
+ this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
+ this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
+ this.header_mimeListPos = reader.readEightLittleEndianBytesLong(buffer); // 8
+ this.header_mainPage = reader.readFourLittleEndianBytesInt(buffer); // 4
+ this.header_layoutPage = reader.readFourLittleEndianBytesInt(buffer); // 4
+ this.header_checksumPos = reader.readEightLittleEndianBytesLong(buffer); // 8 [FIX!]
+
+ // Initialise the MIMETypeList
+ int len = 0;
+ StringBuffer mimeBuffer = null;
+ this.mimeList = new ArrayList<>();
+ while (true) {
+ reader.read(buffer, 0, 1); // read only one byte to check if this is a zero
+ len = 0;
+ mimeBuffer = new StringBuffer();
+ while (buffer[0] != '\0') {
+ mimeBuffer.append((char) buffer[0]);
reader.read(buffer, 0, 1);
- len = 0;
- mimeBuffer = new StringBuffer();
- while (buffer[0] != '\0') {
- mimeBuffer.append((char) buffer[0]);
- reader.read(buffer, 0, 1);
- len++;
- }
- if (len == 0) {
- break;
- }
- this.mMIMETypeList.add(mimeBuffer.toString());
- // System.out.println(mimeBuffer);
+ len++;
}
-
- } catch (final Exception e) {
- e.printStackTrace();
+ if (len == 0) {
+ break;
+ }
+ String mimeType = mimeBuffer.toString();
+ System.out.println(mimeType);
+ this.mimeList.add(mimeType);
}
- }
-
- public int getVersion() {
- return this.mHeader.version;
- }
-
- public int getUuid() {
- return this.mHeader.uuid;
- }
-
- public int getArticleCount() {
- return this.mHeader.articleCount;
- }
-
- public int getClusterCount() {
- return this.mHeader.clusterCount;
- }
-
- public int getUrlPtrPos() {
- return this.mHeader.urlPtrPos;
- }
-
- public int getTitlePtrPos() {
- return this.mHeader.titlePtrPos;
- }
-
- public int getClusterPtrPos() {
- return this.mHeader.clusterPtrPos;
- }
-
- public String getMIMEType(final int mimeNumber) {
- return this.mMIMETypeList.get(mimeNumber);
- }
-
- public int getHeaderSize() {
- return this.mHeader.mimeListPos;
- }
-
- public int getMainPage() {
- return this.mHeader.mainPage;
- }
-
- public int getLayoutPage() {
- return this.mHeader.layoutPage;
- }
- public class Header {
- int magicNumber;
- int version;
- int uuid;
- int articleCount;
- int clusterCount;
- int urlPtrPos;
- int titlePtrPos;
- int clusterPtrPos;
- int mimeListPos;
- int mainPage;
- int layoutPage;
}
}
diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java
index affd6ea6d..49d25c50c 100644
--- a/source/org/openzim/ZIMReader.java
+++ b/source/org/openzim/ZIMReader.java
@@ -29,28 +29,91 @@ import org.tukaani.xz.SingleXZInputStream;
/**
* @author Arunesh Mathur
- *
* A ZIMReader that reads data from the ZIMFile
*
+ * @author Michael Christen
+ * Proof-Reading, unclustering, refactoring,
+ * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
+ * change of Exception handling,
+ * extension to more attributes as defined in spec (bugfix for mime type loading)
+ * bugfix to long parsing (prevented reading of large files)
*/
public class ZIMReader {
private final ZIMFile mFile;
private RandomAcessFileZIMInputStream mReader;
+ /**
+ * Immutable value holder for the attributes that every directory entry
+ * has in common. All fields are public and final and are set once by the
+ * constructor; there are no accessor methods.
+ */
+ public static abstract class DirectoryEntry {
+
+ // entry type as passed by the creator; the reader passes the first two bytes of the entry here
+ public final int mimetype;
+ // namespace character of the entry
+ public final char namespace;
+ // cluster number; RedirectEntry passes 0 here
+ public final int cluster_number;
+ // url string of the entry
+ public final String url;
+ // title string of the entry
+ public final String title;
+ // index value supplied by the creator - presumably the entry's slot in the URL pointer list, TODO confirm
+ public final long urlListindex;
+
+ /**
+ * Stores the given values unchanged in the corresponding fields.
+ *
+ * @param mimeType stored in {@code mimetype}
+ * @param namespace stored in {@code namespace}
+ * @param cluster_number stored in {@code cluster_number}
+ * @param url stored in {@code url}
+ * @param title stored in {@code title}
+ * @param index stored in {@code urlListindex}
+ */
+ public DirectoryEntry(
+ final int mimeType, final char namespace,
+ final int cluster_number,
+ final String url, final String title,
+ final long index) {
+ this.mimetype = mimeType;
+ this.namespace = namespace;
+ this.cluster_number = cluster_number;
+ this.url = url;
+ this.title = title;
+ this.urlListindex = index;
+ }
+
+ }
+
+ /**
+ * Directory entry that refers to content: a blob inside a cluster.
+ * The cluster number lives in the inherited {@code cluster_number} field of
+ * DirectoryEntry, so {@code article.cluster_number} keeps working for callers.
+ */
+ public static class ArticleEntry extends DirectoryEntry {
+
+ // cluster_number is deliberately not redeclared here: a second field of the
+ // same name would only hide the inherited one while holding the same value
+
+ // number of the blob inside the cluster
+ public final int blob_number;
+
+ /**
+ * @param mimeType entry type, forwarded to DirectoryEntry
+ * @param namespace namespace character, forwarded to DirectoryEntry
+ * @param cluster_number cluster number, forwarded to DirectoryEntry
+ * @param blob_number stored in {@code blob_number}
+ * @param url url string, forwarded to DirectoryEntry
+ * @param title title string, forwarded to DirectoryEntry
+ * @param urlListindex index value, forwarded to DirectoryEntry
+ */
+ public ArticleEntry(
+ final int mimeType, final char namespace,
+ final int cluster_number, final int blob_number,
+ final String url, final String title,
+ final long urlListindex) {
+ super(mimeType, namespace, cluster_number, url, title, urlListindex);
+ this.blob_number = blob_number;
+ }
+
+ }
+
+ /**
+ * Directory entry that carries a redirect index instead of a cluster/blob
+ * location. The inherited {@code cluster_number} is always set to 0.
+ */
+ public static class RedirectEntry extends DirectoryEntry {
+
+ // redirect index as passed by the creator - presumably the index of the target entry, TODO confirm
+ public final long redirect_index;
+
+ /**
+ * @param mimeType entry type, forwarded to DirectoryEntry
+ * @param namespace namespace character, forwarded to DirectoryEntry
+ * @param redirect_index stored in {@code redirect_index}
+ * @param url url string, forwarded to DirectoryEntry
+ * @param title title string, forwarded to DirectoryEntry
+ * @param urlListindex index value, forwarded to DirectoryEntry
+ */
+ public RedirectEntry(final int mimeType, final char namespace,
+ final long redirect_index, final String url, final String title,
+ final long urlListindex) {
+ // a redirect has no cluster of its own, hence the fixed 0
+ super(mimeType, namespace, 0, url, title, urlListindex);
+ this.redirect_index = redirect_index;
+ }
+
+ }
+
public ZIMReader(final ZIMFile file) {
this.mFile = file;
try {
- this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(
- this.mFile, "r"));
+ this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this.mFile, "r"));
} catch (final FileNotFoundException e) {
e.printStackTrace();
}
}
+ /**
+ * @return the ZIMFile that was handed to the constructor of this reader
+ */
+ public ZIMFile getZIMFile() {
+ return this.mFile;
+ }
+
+ // get a URL list that is sorted by the urls
public List getURLListByURL() throws IOException {
- int i = 0, pos, mimeType;
+ int i = 0, mimeType;
final byte[] buffer = new byte[8];
@@ -58,12 +121,12 @@ public class ZIMReader {
final ArrayList returnList = new ArrayList<>();
// Move to the spot where URL's are listed
- this.mReader.seek(this.mFile.getUrlPtrPos());
+ this.mReader.seek(this.mFile.header_urlPtrPos);
- for (i = 0; i < this.mFile.getArticleCount(); i++) {
+ for (i = 0; i < this.mFile.header_entryCount; i++) {
// The position of URL i
- pos = this.mReader.readEightLittleEndianBytesValue(buffer);
+ long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
// Mark the current position that we need to return to
this.mReader.mark();
@@ -72,14 +135,14 @@ public class ZIMReader {
this.mReader.seek(pos);
// Article or Redirect entry?
- mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
+ mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
if (mimeType == 65535) {
this.mReader.seek(pos + 12);
- returnList.add(this.mReader.readString());
+ returnList.add(this.mReader.readZeroTerminatedString());
} else {
this.mReader.seek(pos + 16);
- returnList.add(this.mReader.readString());
+ returnList.add(this.mReader.readZeroTerminatedString());
}
this.mReader.reset();
@@ -88,9 +151,10 @@ public class ZIMReader {
return returnList;
}
+ // get a URL list that is sorted by the entry titles
public List getURLListByTitle() throws IOException {
- int i = 0, pos, mimeType, articleNumber, urlPtrPos;
+ int i = 0, mimeType, articleNumber;
final byte[] buffer = new byte[8];
@@ -98,35 +162,35 @@ public class ZIMReader {
final ArrayList returnList = new ArrayList<>();
// Get the UrlPtrPos or one time storage
- urlPtrPos = this.mFile.getUrlPtrPos();
+ long urlPtrPos = this.mFile.header_urlPtrPos;
// Move to the spot where URL's are listed
- this.mReader.seek(this.mFile.getTitlePtrPos());
+ this.mReader.seek(this.mFile.header_titlePtrPos);
- for (i = 0; i < this.mFile.getArticleCount(); i++) {
+ for (i = 0; i < this.mFile.header_entryCount; i++) {
// The articleNumber of the position of URL i
- articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
+ articleNumber = this.mReader.readFourLittleEndianBytesInt(buffer);
// Mark the current position that we need to return to
this.mReader.mark();
- this.mReader.seek(urlPtrPos + (8 * (articleNumber)));
+ this.mReader.seek(urlPtrPos + (8L * (articleNumber)));
// The position of URL i
- pos = this.mReader.readEightLittleEndianBytesValue(buffer);
+ long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
this.mReader.seek(pos);
// Article or Redirect entry?
- mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
+ mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
if (mimeType == 65535) {
this.mReader.seek(pos + 12);
- final String url = this.mReader.readString();
+ final String url = this.mReader.readZeroTerminatedString();
returnList.add(url);
} else {
this.mReader.seek(pos + 16);
- final String url = this.mReader.readString();
+ final String url = this.mReader.readZeroTerminatedString();
returnList.add(url);
}
@@ -137,14 +201,69 @@ public class ZIMReader {
return returnList;
}
+ /**
+ * Reads the directory entry that one slot of the title pointer list refers to.
+ * The slot holds a 4-byte index into the URL pointer list, and that list holds
+ * the 8-byte file offset of the directory entry itself.
+ *
+ * @param position seek position of the slot inside the title pointer list
+ * @return a RedirectEntry when the entry type is 0xffff, an ArticleEntry otherwise
+ * @throws IOException if reading from the underlying file fails
+ */
+ private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException {
+
+ // Scratch buffer, sized for the widest (8-byte) value read below
+ final byte[] buffer = new byte[8];
+
+ // Go to the slot in the title pointer list
+ this.mReader.seek(position);
+
+ // The slot content is the index of the entry within the URL pointer list
+ final int urlListIndex = this.mReader.readFourLittleEndianBytesInt(buffer);
+
+ // Move to that slot of the URL pointer list; 8L forces long arithmetic,
+ // a plain int multiplication would overflow for large indexes
+ this.mReader.seek(this.mFile.header_urlPtrPos + 8L * urlListIndex);
+
+ // The URL pointer list slot holds the file offset of the directory entry
+ final long directoryEntryPos = this.mReader.readEightLittleEndianBytesLong(buffer);
+
+ // Go to the location of the directory entry
+ this.mReader.seek(directoryEntryPos);
+
+ // Read the part that redirect and article entries have in common
+ final int type = this.mReader.readTwoLittleEndianBytesInt(buffer); // 2, 0xffff for redirect
+ this.mReader.read(); // 1, ignore, parameter length not used
+ final char namespace = (char) this.mReader.read(); // 1
+ this.mReader.readFourLittleEndianBytesInt(buffer); // 4, ignore, revision not used
+
+ // Article or Redirect entry
+ if (type == 65535) {
+ final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
+ final String url = this.mReader.readZeroTerminatedString(); // zero terminated
+ String title = this.mReader.readZeroTerminatedString(); // zero terminated
+ title = title.equals("") ? url : title; // fall back to the url when the title is empty
+
+ // The index read from the title pointer list is passed on directly; position lies in
+ // the title pointer list, so it must not be converted with header_urlPtrPos and 8
+ return new RedirectEntry(type, namespace, redirectIndex, url, title, urlListIndex);
+ } else {
+ final int cluster_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
+ final int blob_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
+ final String url = this.mReader.readZeroTerminatedString(); // zero terminated
+ String title = this.mReader.readZeroTerminatedString(); // zero terminated
+ title = title.equals("") ? url : title; // fall back to the url when the title is empty
+
+ // Same index handling as in the redirect branch
+ return new ArticleEntry(
+ type, namespace,
+ cluster_number, blob_number,
+ url, title, urlListIndex);
+ }
+
+ }
+
+ /**
+ * Returns the directory entry that the given slot of the title pointer list refers to.
+ *
+ * @param entryNumber zero-based slot number in the title pointer list
+ * @return the directory entry the slot refers to
+ * @throws IOException if entryNumber is negative or not below header_entryCount,
+ * or if reading from the underlying file fails
+ */
+ public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException {
+ if (entryNumber < 0) throw new IOException("entryNumber is negative");
+ if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount");
+ // every slot is 4 bytes wide; 4L forces long arithmetic so the offset cannot overflow an int
+ return getDirectoryInfoAtTitlePosition(this.mFile.header_titlePtrPos + 4L * entryNumber);
+ }
+
// Gives the minimum required information needed for the given articleName
- public DirectoryEntry getDirectoryInfo(String articleName, final char namespace)
- throws IOException {
+ // This makes a binary search on the article name entry list.
+ public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException {
DirectoryEntry entry;
String cmpStr;
- final int numberOfArticles = this.mFile.getArticleCount();
- int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid;
+ final int numberOfArticles = this.mFile.header_entryCount;
+ long beg = this.mFile.header_titlePtrPos, end = beg + (numberOfArticles * 4), mid;
articleName = namespace + "/" + articleName;
@@ -154,7 +273,7 @@ public class ZIMReader {
if (entry == null) {
return null;
}
- cmpStr = entry.getNamespace() + "/" + entry.getUrl();
+ cmpStr = entry.namespace + "/" + entry.url;
if (articleName.compareTo(cmpStr) < 0) {
end = mid - 4;
@@ -167,242 +286,130 @@ public class ZIMReader {
}
return null;
-
}
- public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException {
-
- // search in the cache first, if not found, then call getDirectoryInfo(articleName)
-
- byte[] buffer = new byte[8];
-
- final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace);
-
- if (mainEntry != null) {
-
- // Check what kind of an entry was mainEnrty
- if (mainEntry.getClass() == ArticleEntry.class) {
-
- // Cast to ArticleEntry
- final ArticleEntry article = (ArticleEntry) mainEntry;
-
- // Get the cluster and blob numbers from the article
- final int clusterNumber = article.getClusterNumber();
- final int blobNumber = article.getBlobnumber();
-
- // Move to the cluster entry in the clusterPtrPos
- this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8);
-
- // Read the location of the cluster
- final int clusterPos = this.mReader
- .readEightLittleEndianBytesValue(buffer);
-
- // Move to the cluster
- this.mReader.seek(clusterPos);
+ public ByteArrayOutputStream getArticleData(final DirectoryEntry directoryInfo) throws IOException {
- // Read the first byte, for compression information
- final int compressionType = this.mReader.read();
+ // fail fast
+ if (directoryInfo == null) return null;
+ if (directoryInfo.getClass() != ArticleEntry.class) return null;
- // Reference declaration
- SingleXZInputStream xzReader = null;
- int firstOffset, numberOfBlobs, offset1,
- offset2,
- location,
- differenceOffset;
+ // The entry is an article, so it is safe to cast to ArticleEntry
+ final ArticleEntry article = (ArticleEntry) directoryInfo;
- ByteArrayOutputStream baos;
+ // Move to the cluster entry in the clusterPtrPos
+ this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8);
- // Check the compression type that was read
- switch (compressionType) {
-
- // TODO: Read uncompressed data directly
- case 0:
- case 1:
-
- // Read the first 4 bytes to find out the number of artciles
- buffer = new byte[4];
-
- // Create a dictionary with size 40MiB, the zimlib uses this
- // size while creating
-
- // Read the first offset
- this.mReader.read(buffer);
-
- // The first four bytes are the offset of the zeroth blob
- firstOffset = Utilities
- .toFourLittleEndianInteger(buffer);
-
- // The number of blobs
- numberOfBlobs = firstOffset / 4;
-
- // The blobNumber has to be lesser than the numberOfBlobs
- assert blobNumber < numberOfBlobs;
-
-
- if (blobNumber == 0) {
- // The first offset is what we read earlier
- offset1 = firstOffset;
- } else {
-
- location = (blobNumber - 1) * 4;
- Utilities.skipFully(this.mReader, location);
- this.mReader.read(buffer);
- offset1 = Utilities.toFourLittleEndianInteger(buffer);
- }
-
- this.mReader.read(buffer);
- offset2 = Utilities.toFourLittleEndianInteger(buffer);
-
- differenceOffset = offset2 - offset1;
- buffer = new byte[differenceOffset];
-
- Utilities.skipFully(this.mReader,
- (offset1 - 4 * (blobNumber + 2)));
-
- this.mReader.read(buffer, 0, differenceOffset);
-
- baos = new ByteArrayOutputStream();
- baos.write(buffer, 0, differenceOffset);
-
- return baos;
-
- // LZMA2 compressed data
- case 4:
-
- // Read the first 4 bytes to find out the number of artciles
- buffer = new byte[4];
-
- // Create a dictionary with size 40MiB, the zimlib uses this
- // size while creating
- xzReader = new SingleXZInputStream(this.mReader, 4194304);
-
- // Read the first offset
- xzReader.read(buffer);
+ // Read the location of the cluster
+ byte[] buffer = new byte[8];
+ final long clusterPos = this.mReader.readEightLittleEndianBytesLong(buffer);
- // The first four bytes are the offset of the zeroth blob
- firstOffset = Utilities
- .toFourLittleEndianInteger(buffer);
+ // Move to the cluster
+ this.mReader.seek(clusterPos);
- // The number of blobs
- numberOfBlobs = firstOffset / 4;
+ // Read the first byte, for compression information
+ final int compressionType = this.mReader.read();
- // The blobNumber has to be lesser than the numberOfBlobs
- assert blobNumber < numberOfBlobs;
+ // Reference declaration
+ SingleXZInputStream xzReader = null;
+ int firstOffset, numberOfBlobs, offset1,
+ offset2,
+ location,
+ differenceOffset;
- if(blobNumber == 0) {
- // The first offset is what we read earlier
- offset1 = firstOffset;
- } else {
+ ByteArrayOutputStream baos;
- location = (blobNumber - 1) * 4;
- Utilities.skipFully(xzReader, location);
- xzReader.read(buffer);
- offset1 = Utilities.toFourLittleEndianInteger(buffer);
- }
+ // Check the compression type that was read
+ switch (compressionType) {
- xzReader.read(buffer);
- offset2 = Utilities.toFourLittleEndianInteger(buffer);
+ // TODO: Read uncompressed data directly
+ case 0:
+ case 1:
- differenceOffset = offset2 - offset1;
- buffer = new byte[differenceOffset];
+ // Read the first 4 bytes to find out the number of articles
+ buffer = new byte[4];
- Utilities.skipFully(xzReader,
- (offset1 - 4 * (blobNumber + 2)));
+ // Create a dictionary with size 40MiB, the zimlib uses this
+ // size while creating
- xzReader.read(buffer, 0, differenceOffset);
+ // Read the first offset
+ this.mReader.read(buffer);
- baos = new ByteArrayOutputStream();
- baos.write(buffer, 0, differenceOffset);
+ // The first four bytes are the offset of the zeroth blob
+ firstOffset = Utilities.toFourLittleEndianInteger(buffer);
- return baos;
+ // The number of blobs
+ numberOfBlobs = firstOffset / 4;
- }
+ // The blob number has to be less than numberOfBlobs
+ assert article.blob_number < numberOfBlobs;
+ if (article.blob_number == 0) {
+ // The first offset is what we read earlier
+ offset1 = firstOffset;
+ } else {
+ location = (article.blob_number - 1) * 4;
+ Utilities.skipFully(this.mReader, location);
+ this.mReader.read(buffer);
+ offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
- }
-
- return null;
-
- }
-
- public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position)
- throws IOException {
-
- // Helpers
- int pos;
- final byte[] buffer = new byte[8];
-
- // At the appropriate position in the titlePtrPos
- this.mReader.seek(position);
-
- // Get value of article at index
- pos = this.mReader.readFourLittleEndianBytesValue(buffer);
-
- // Move to the position in urlPtrPos
- this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos);
-
- // Get value of article in urlPtrPos
- pos = this.mReader.readEightLittleEndianBytesValue(buffer);
-
- // Go to the location of the directory entry
- this.mReader.seek(pos);
-
- final int type = this.mReader.readTwoLittleEndianBytesValue(buffer);
-
- // Ignore the parameter length
- this.mReader.read();
-
- final char namespace = (char) this.mReader.read();
- // System.out.println("Namepsace: " + namespace);
- final int revision = this.mReader.readFourLittleEndianBytesValue(buffer);
- // System.out.println("Revision: " + revision);
+ this.mReader.read(buffer);
+ offset2 = Utilities.toFourLittleEndianInteger(buffer);
+ differenceOffset = offset2 - offset1;
+ buffer = new byte[differenceOffset];
+ Utilities.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
+ this.mReader.read(buffer, 0, differenceOffset);
+ baos = new ByteArrayOutputStream();
+ baos.write(buffer, 0, differenceOffset);
- // TODO: Remove redundant if condition code
- // Article or Redirect entry
- if (type == 65535) {
-
- // System.out.println("MIMEType: " + type);
-
- final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer);
- // System.out.println("RedirectIndex: " + redirectIndex);
+ return baos;
- final String url = this.mReader.readString();
- // System.out.println("URL: " + url);
+ // 2 for zlib and 3 for bzip2 (removed)
- String title = this.mReader.readString();
- title = title.equals("") ? url : title;
- // System.out.println("Title: " + title);
-
- return new RedirectEntry(type, namespace, revision, redirectIndex,
- url, title, (position - this.mFile.getUrlPtrPos()) / 8);
+ // LZMA2 compressed data
+ case 4:
- } else {
+ // Read the first 4 bytes to find out the number of articles
+ buffer = new byte[4];
- // System.out.println("MIMEType: " + mFile.getMIMEType(type));
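+ // Create the XZ reader with a limit of 4194304 (4 * 1024 * 1024, i.e. 4 Mi rather than 40 MiB); zimlib is said to use this size while creating - TODO confirm the unit this constructor expects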
+ xzReader = new SingleXZInputStream(this.mReader, 4194304);
- final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
- // System.out.println("Cluster Number: " + clusterNumber);
+ // Read the first offset
+ xzReader.read(buffer);
- final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
- // System.out.println("Blob Number: " + blobNumber);
+ // The first four bytes are the offset of the zeroth blob
+ firstOffset = Utilities.toFourLittleEndianInteger(buffer);
- final String url = this.mReader.readString();
- // System.out.println("URL: " + url);
+ // The number of blobs
+ numberOfBlobs = firstOffset / 4;
- String title = this.mReader.readString();
- title = title.equals("") ? url : title;
- // System.out.println("Title: " + title);
-
- // Parameter data ignored
+ // The blob number has to be less than numberOfBlobs
+ assert article.blob_number < numberOfBlobs;
+ if (article.blob_number == 0) {
+ // The first offset is what we read earlier
+ offset1 = firstOffset;
+ } else {
+ location = (article.blob_number - 1) * 4;
+ Utilities.skipFully(xzReader, location);
+ xzReader.read(buffer);
+ offset1 = Utilities.toFourLittleEndianInteger(buffer);
+ }
- return new ArticleEntry(type, namespace, revision, clusterNumber,
- blobNumber, url, title,
- (position - this.mFile.getUrlPtrPos()) / 8);
+ xzReader.read(buffer);
+ offset2 = Utilities.toFourLittleEndianInteger(buffer);
+ differenceOffset = offset2 - offset1;
+ buffer = new byte[differenceOffset];
+ Utilities.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
+ xzReader.read(buffer, 0, differenceOffset);
+ baos = new ByteArrayOutputStream();
+ baos.write(buffer, 0, differenceOffset);
+ return baos;
+
+ // case 5: zstd compressed (missing!)
+ default:
+ return null;
}
-
}
- public ZIMFile getZIMFile() {
- return this.mFile;
- }
}
diff --git a/source/org/openzim/ZIMTest.java b/source/org/openzim/ZIMTest.java
index 6d8ed64fb..24b9cf9be 100644
--- a/source/org/openzim/ZIMTest.java
+++ b/source/org/openzim/ZIMTest.java
@@ -18,27 +18,49 @@
package org.openzim;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.openzim.ZIMReader.DirectoryEntry;
public class ZIMTest {
+
public static void main(final String[] args) {
- if(args.length!=2) {
- System.out.println("Usage: java ZIMTest ");
+ if(args.length!=1) {
+ System.out.println("Usage: java ZIMTest ");
System.exit(0);
}
- // args[0] is the Zim File's location
- final ZIMFile file = new ZIMFile(args[0]);
+ try {
+ // args[0] is the Zim File's location
+ final ZIMFile file = new ZIMFile(args[0]);
+
+ // Associate the Zim File with a Reader
+ final ZIMReader zReader = new ZIMReader(file);
- // Associate the Zim File with a Reader
- final ZIMReader zReader = new ZIMReader(file);
+ // print a list of urls and titles
+ final List urls = zReader.getURLListByURL();
+ final List titles = zReader.getURLListByTitle();
+ int c = Math.min(10, titles.size());
+ for (int i = 0; i < c; i++) {
+ System.out.println("URL by URL " + i + ": " + urls.get(i));
+ System.out.println("URL by Title " + i + ": " + titles.get(i));
+ DirectoryEntry entry = zReader.getDirectoryInfo(i);
+ System.out.println("URL by Pos " + i + ": " + entry.url);
+ System.out.println("Title by Pos " + i + ": " + entry.title);
+ System.out.println("Namespace by Pos " + i + ": " + entry.namespace);
+ }
- try {
- // args[1] is the name of the articles that is
- // to be fetched
- System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8"));
+ // print article c-1
+ DirectoryEntry directory_entry = zReader.getDirectoryInfo(c - 1);
+ ByteArrayOutputStream articleStream = zReader.getArticleData(directory_entry);
+ String article = articleStream == null ? "NULL" : articleStream.toString(StandardCharsets.UTF_8.name());
+ System.out.println(article);
} catch (final IOException e) {
e.printStackTrace();
}
}
+
}