Fixed a large number of problems in the ZIM reader.

This library was not prepared for large data because it used int instead of
long data types for pointers. I had to modify the code base in a fundamental
way:
- proofreading,
- unclustering,
- refactoring,
- adoption of the naming used in https://wiki.openzim.org/wiki/ZIM_file_format,
- change of exception handling,
- extension to more attributes as defined in the spec (bugfix for MIME type
  loading),
- bugfix for long parsing (which prevented reading of large files).
Furthermore, the code is very inefficient and requires more attention.
However, the format is very useful for YaCy, as there are numerous data
sources for ZIM files.
pull/607/head
Michael Peter Christen 1 year ago
parent 5ba5fb5d23
commit c2b6b6e7b9

@ -1,46 +0,0 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openzim;
/**
 * Directory entry that additionally carries a cluster number and a blob number.
 * Both values are stored exactly as handed to the constructor.
 */
public class ArticleEntry extends DirectoryEntry {

    /** Cluster number given at construction time. */
    int clusterNumber;

    /** Blob number given at construction time. */
    int blobnumber;

    /**
     * Creates an article entry.
     *
     * @param mimeType      forwarded to the {@code DirectoryEntry} constructor
     * @param namespace     forwarded to the {@code DirectoryEntry} constructor
     * @param revision      forwarded to the {@code DirectoryEntry} constructor
     * @param clusterNumber value returned by {@link #getClusterNumber()}
     * @param blobNumber    value returned by {@link #getBlobnumber()}
     * @param url           forwarded to the {@code DirectoryEntry} constructor
     * @param title         forwarded to the {@code DirectoryEntry} constructor
     * @param urlListindex  forwarded to the {@code DirectoryEntry} constructor
     */
    public ArticleEntry(final int mimeType, final char namespace, final int revision,
            final int clusterNumber, final int blobNumber, final String url, final String title,
            final int urlListindex) {
        super(mimeType, namespace, revision, url, title, urlListindex);
        this.blobnumber = blobNumber;
        this.clusterNumber = clusterNumber;
    }

    /** @return the cluster number of this entry */
    public int getClusterNumber() {
        return clusterNumber;
    }

    /** @return the blob number of this entry */
    public int getBlobnumber() {
        return blobnumber;
    }
}

@ -1,69 +0,0 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openzim;
/**
 * Common base of the directory entry types. It only stores the values given to
 * the constructor and hands them back through its getters.
 */
public abstract class DirectoryEntry {

    int mimeType;
    char namespace;
    int revision;
    String url;
    String title;
    int urlListindex;

    /**
     * Stores all attributes of the entry.
     *
     * @param mimeType  value returned by {@link #getMimeType()}
     * @param namespace value returned by {@link #getNamespace()}
     * @param revision  value returned by {@link #getRevision()}
     * @param url       value returned by {@link #getUrl()}
     * @param title     value returned by {@link #getTitle()}
     * @param index     value returned by {@link #getUrlListindex()}
     */
    public DirectoryEntry(final int mimeType, final char namespace, final int revision,
            final String url, final String title, final int index) {
        this.urlListindex = index;
        this.title = title;
        this.url = url;
        this.revision = revision;
        this.namespace = namespace;
        this.mimeType = mimeType;
    }

    /** @return the mime type number of this entry */
    public int getMimeType() {
        return mimeType;
    }

    /** @return the namespace character of this entry */
    public char getNamespace() {
        return namespace;
    }

    /** @return the revision of this entry */
    public int getRevision() {
        return revision;
    }

    /** @return the url of this entry */
    public String getUrl() {
        return url;
    }

    /** @return the title of this entry */
    public String getTitle() {
        return title;
    }

    /** @return the index given to the constructor */
    public int getUrlListindex() {
        return urlListindex;
    }
}

@ -28,6 +28,8 @@ import java.io.RandomAccessFile;
* implementation, can be improved. * implementation, can be improved.
* *
* @author Arunesh Mathur <aruneshmathur1990 at gmail.com> * @author Arunesh Mathur <aruneshmathur1990 at gmail.com>
* @author Michael Christen
* bugfix to long parsing (return value was int)
*/ */
public class RandomAcessFileZIMInputStream extends InputStream { public class RandomAcessFileZIMInputStream extends InputStream {
@ -41,7 +43,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
} }
// TODO: Remove the parameter buffer // TODO: Remove the parameter buffer
public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException { public int readTwoLittleEndianBytesInt(final byte[] buffer) throws IOException {
if (buffer.length < 2) { if (buffer.length < 2) {
throw new OutOfMemoryError("buffer too small"); throw new OutOfMemoryError("buffer too small");
} else { } else {
@ -51,7 +53,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
} }
// TODO: Remove the parameter buffer // TODO: Remove the parameter buffer
public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException { public int readFourLittleEndianBytesInt(final byte[] buffer) throws IOException {
if (buffer.length < 4) { if (buffer.length < 4) {
throw new OutOfMemoryError("buffer too small"); throw new OutOfMemoryError("buffer too small");
} else { } else {
@ -61,30 +63,30 @@ public class RandomAcessFileZIMInputStream extends InputStream {
} }
// TODO: Remove the parameter buffer // TODO: Remove the parameter buffer
public int readEightLittleEndianBytesValue(final byte[] buffer) public long readEightLittleEndianBytesLong(final byte[] buffer)
throws IOException { throws IOException {
if (buffer.length < 8) { if (buffer.length < 8) {
throw new OutOfMemoryError("buffer too small"); throw new OutOfMemoryError("buffer too small");
} else { } else {
this.mRAFReader.read(buffer, 0, 8); this.mRAFReader.read(buffer, 0, 8);
return Utilities.toEightLittleEndianInteger(buffer); return Utilities.toEightLittleEndianLong(buffer);
} }
} }
// TODO: Remove the parameter buffer // TODO: Remove the parameter buffer
public int readSixteenLittleEndianBytesValue(final byte[] buffer) public long readSixteenLittleEndianBytesLong(final byte[] buffer)
throws IOException { throws IOException {
if (buffer.length < 16) { if (buffer.length < 16) {
throw new OutOfMemoryError("buffer too small"); throw new OutOfMemoryError("buffer too small");
} else { } else {
this.mRAFReader.read(buffer, 0, 16); this.mRAFReader.read(buffer, 0, 16);
return Utilities.toSixteenLittleEndianInteger(buffer); return Utilities.toSixteenLittleEndianLong(buffer);
} }
} }
// Reads characters from the current position into a String and stops when a // Reads characters from the current position into a String and stops when a
// '\0' is encountered // '\0' is encountered
public String readString() throws IOException { public String readZeroTerminatedString() throws IOException {
final StringBuffer sb = new StringBuffer(); final StringBuffer sb = new StringBuffer();
/* /*
* int i; byte[] buffer = new byte[100]; while (true) { * int i; byte[] buffer = new byte[100]; while (true) {
@ -92,8 +94,7 @@ public class RandomAcessFileZIMInputStream extends InputStream {
* (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i * (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i
* != buffer.length) break; } return sb.toString(); * != buffer.length) break; } return sb.toString();
*/ */
int b; int b = this.mRAFReader.read();
b = this.mRAFReader.read();
while (b != '\0') { while (b != '\0') {
sb.append((char) b); sb.append((char) b);
b = this.mRAFReader.read(); b = this.mRAFReader.read();

@ -1,37 +0,0 @@
/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openzim;
/**
 * Directory entry that additionally carries a redirect index.
 * The index is stored exactly as handed to the constructor.
 */
public class RedirectEntry extends DirectoryEntry {

    /** Redirect index given at construction time. */
    int redirectIndex;

    /**
     * Creates a redirect entry.
     *
     * @param mimeType      forwarded to the {@code DirectoryEntry} constructor
     * @param namespace     forwarded to the {@code DirectoryEntry} constructor
     * @param revision      forwarded to the {@code DirectoryEntry} constructor
     * @param redirectIndex value returned by {@link #getRedirectIndex()}
     * @param url           forwarded to the {@code DirectoryEntry} constructor
     * @param title         forwarded to the {@code DirectoryEntry} constructor
     * @param urlListindex  forwarded to the {@code DirectoryEntry} constructor
     */
    public RedirectEntry(final int mimeType, final char namespace, final int revision,
            final int redirectIndex, final String url, final String title, final int urlListindex) {
        super(mimeType, namespace, revision, url, title, urlListindex);
        this.redirectIndex = redirectIndex;
    }

    /** @return the redirect index of this entry */
    public int getRedirectIndex() {
        return redirectIndex;
    }
}

@ -22,18 +22,21 @@ package org.openzim;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
/**
* @author Arunesh Mathur
* A ZIM file implementation that stores the Header and the MIMETypeList
*
* @author Michael Christen
* int/long bugfix (did reading of long values with int variables, causing negative offsets)
*/
public class Utilities { public class Utilities {
// TODO: Write a binary search algorithm
public static int binarySearch() {
return -1;
}
public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException { public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
if (buffer.length < 2) { if (buffer.length < 2) {
throw new OutOfMemoryError("buffer too small"); throw new OutOfMemoryError("buffer too small");
} else { } else {
final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)); final int result =
((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8));
return result; return result;
} }
} }
@ -42,39 +45,28 @@ public class Utilities {
if (buffer.length < 4) { if (buffer.length < 4) {
throw new OutOfMemoryError("buffer too small"); throw new OutOfMemoryError("buffer too small");
} else { } else {
final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) final int result =
((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)); | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24));
return result; return result;
} }
} }
public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException { public static long toEightLittleEndianLong(final byte[] buffer) throws IOException {
if (buffer.length < 8) { if (buffer.length < 8) {
throw new OutOfMemoryError("buffer too small"); throw new OutOfMemoryError("buffer too small");
} else { } else {
final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) final long result = // cast to long required otherwise this is again an integer
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24) ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8)
| ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40) | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24)
| ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56)); | ((long)(buffer[4] & 0xFF) << 32) | ((long)(buffer[5] & 0xFF) << 40)
| ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56));
return result; return result;
} }
} }
public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException { public static long toSixteenLittleEndianLong(final byte[] buffer) throws IOException {
if (buffer.length < 16) { return toEightLittleEndianLong(buffer); // there are no sixten bytes long values
throw new OutOfMemoryError("buffer too small");
} else {
final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)
| ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40)
| ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56)
| ((buffer[8] & 0xFF) << 64) | ((buffer[9] & 0xFF) << 72)
| ((buffer[10] & 0xFF) << 80) | ((buffer[11] & 0xFF) << 88)
| ((buffer[12] & 0xFF) << 96)
| ((buffer[13] & 0xFF) << 104)
| ((buffer[14] & 0xFF) << 112) | ((buffer[15] & 0xFF) << 120));
return result;
}
} }
public static void skipFully(final InputStream stream, final long bytes) throws IOException { public static void skipFully(final InputStream stream, final long bytes) throws IOException {

@ -20,46 +20,47 @@ package org.openzim;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* @author Arunesh Mathur * @author Arunesh Mathur
*
* A ZIM file implementation that stores the Header and the MIMETypeList * A ZIM file implementation that stores the Header and the MIMETypeList
* *
* @author Michael Christen
* Proof-Reading, unclustering, refactoring,
* naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
* change of Exception handling,
* extension to more attributes as defined in spec (bugfix for mime type loading)
* int/long bugfix (did reading of long values with int variables, causing negative offsets)
*/ */
public class ZIMFile extends File { public class ZIMFile extends File {
/**
*
*/
private static final long serialVersionUID = 1L; private static final long serialVersionUID = 1L;
private Header mHeader; // Header values
public final int header_magicNumber;
private List<String> mMIMETypeList; // Can be removed if not needed public final int header_majorVersion;
public final int header_minorVersion;
public ZIMFile(final String path) { public final long header_uuid;
public final int header_entryCount;
public final int header_clusterCount;
public final long header_urlPtrPos;
public final long header_titlePtrPos;
public final long header_clusterPtrPos;
public final long header_mimeListPos;
public final int header_mainPage;
public final int header_layoutPage;
public final long header_checksumPos;
// content cache
public final List<String> mimeList;
public ZIMFile(final String path) throws IOException {
super(path); super(path);
try {
readHeader();
} catch (final FileNotFoundException e) {
e.printStackTrace();
}
}
private void readHeader() throws FileNotFoundException {
// Helpers
int len = 0;
StringBuffer mimeBuffer = null;
// The byte[] that will help us in reading bytes out of the file
final byte[] buffer = new byte[16];
// Check whether the file exists // Check whether the file exists
if (!(this.exists())) { if (!(this.exists())) {
throw new FileNotFoundException( throw new FileNotFoundException(
@ -67,132 +68,45 @@ public class ZIMFile extends File {
} }
// The reader that will be used to read contents from the file // The reader that will be used to read contents from the file
final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this, "r"));
final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream( final byte[] buffer = new byte[16];
new RandomAccessFile(this, "r"));
// The ZIM file header
this.mHeader = new Header();
// Read the contents of the header // Read the contents of the header
try { this.header_magicNumber = reader.readFourLittleEndianBytesInt(buffer); // 4
this.mHeader.magicNumber = reader.readFourLittleEndianBytesValue(buffer); this.header_majorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2
// System.out.println(mHeader.magicNumber); this.header_minorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 4
this.header_uuid = reader.readSixteenLittleEndianBytesLong(buffer); // 16
this.mHeader.version = reader.readFourLittleEndianBytesValue(buffer); this.header_entryCount = reader.readFourLittleEndianBytesInt(buffer); // 4
// System.out.println(mHeader.version); this.header_clusterCount = reader.readFourLittleEndianBytesInt(buffer); // 4
this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.mHeader.uuid = reader.readSixteenLittleEndianBytesValue(buffer); this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
// System.out.println(mHeader.uuid); reader.read(buffer, 0, 4); this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.header_mimeListPos = reader.readEightLittleEndianBytesLong(buffer); // 8
this.mHeader.articleCount = reader this.header_mainPage = reader.readFourLittleEndianBytesInt(buffer); // 4
.readFourLittleEndianBytesValue(buffer); this.header_layoutPage = reader.readFourLittleEndianBytesInt(buffer); // 4
// System.out.println(mHeader.articleCount); this.header_checksumPos = reader.readEightLittleEndianBytesLong(buffer); // 8 [FIX!]
this.mHeader.clusterCount = reader // Initialise the MIMETypeList
.readFourLittleEndianBytesValue(buffer); int len = 0;
// System.out.println(mHeader.clusterCount); StringBuffer mimeBuffer = null;
this.mimeList = new ArrayList<>();
this.mHeader.urlPtrPos = reader.readEightLittleEndianBytesValue(buffer); while (true) {
// System.out.println(mHeader.urlPtrPos); reader.read(buffer, 0, 1); // read only one byte to check if this is a zero
len = 0;
this.mHeader.titlePtrPos = reader mimeBuffer = new StringBuffer();
.readEightLittleEndianBytesValue(buffer); while (buffer[0] != '\0') {
// System.out.println(mHeader.titlePtrPos); mimeBuffer.append((char) buffer[0]);
this.mHeader.clusterPtrPos = reader
.readEightLittleEndianBytesValue(buffer);
// System.out.println(mHeader.clusterPtrPos);
this.mHeader.mimeListPos = reader
.readEightLittleEndianBytesValue(buffer);
// System.out.println(mHeader.mimeListPos);
this.mHeader.mainPage = reader.readFourLittleEndianBytesValue(buffer);
// System.out.println(mHeader.mainPage);
this.mHeader.layoutPage = reader.readFourLittleEndianBytesValue(buffer);
// System.out.println(mHeader.layoutPage);
// Initialise the MIMETypeList
this.mMIMETypeList = new ArrayList<>();
while (true) {
reader.read(buffer, 0, 1); reader.read(buffer, 0, 1);
len = 0; len++;
mimeBuffer = new StringBuffer();
while (buffer[0] != '\0') {
mimeBuffer.append((char) buffer[0]);
reader.read(buffer, 0, 1);
len++;
}
if (len == 0) {
break;
}
this.mMIMETypeList.add(mimeBuffer.toString());
// System.out.println(mimeBuffer);
} }
if (len == 0) {
} catch (final Exception e) { break;
e.printStackTrace(); }
String mimeType = mimeBuffer.toString();
System.out.println(mimeType);
this.mimeList.add(mimeType);
} }
}
public int getVersion() {
return this.mHeader.version;
}
public int getUuid() {
return this.mHeader.uuid;
}
public int getArticleCount() {
return this.mHeader.articleCount;
}
public int getClusterCount() {
return this.mHeader.clusterCount;
}
public int getUrlPtrPos() {
return this.mHeader.urlPtrPos;
}
public int getTitlePtrPos() {
return this.mHeader.titlePtrPos;
}
public int getClusterPtrPos() {
return this.mHeader.clusterPtrPos;
}
public String getMIMEType(final int mimeNumber) {
return this.mMIMETypeList.get(mimeNumber);
}
public int getHeaderSize() {
return this.mHeader.mimeListPos;
}
public int getMainPage() {
return this.mHeader.mainPage;
}
public int getLayoutPage() {
return this.mHeader.layoutPage;
}
public class Header {
int magicNumber;
int version;
int uuid;
int articleCount;
int clusterCount;
int urlPtrPos;
int titlePtrPos;
int clusterPtrPos;
int mimeListPos;
int mainPage;
int layoutPage;
} }
} }

@ -29,28 +29,91 @@ import org.tukaani.xz.SingleXZInputStream;
/** /**
* @author Arunesh Mathur * @author Arunesh Mathur
*
* A ZIMReader that reads data from the ZIMFile * A ZIMReader that reads data from the ZIMFile
* *
* @author Michael Christen
* Proof-Reading, unclustering, refactoring,
* naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
* change of Exception handling,
* extension to more attributes as defined in spec (bugfix for mime type loading)
* bugfix to long parsing (prevented reading of large files)
*/ */
public class ZIMReader { public class ZIMReader {
private final ZIMFile mFile; private final ZIMFile mFile;
private RandomAcessFileZIMInputStream mReader; private RandomAcessFileZIMInputStream mReader;
/**
 * Immutable base of the directory entry types. All attributes are public
 * final fields that are assigned once in the constructor.
 */
public static abstract class DirectoryEntry {

    public final int mimetype;
    public final char namespace;
    public final int cluster_number;
    public final String url;
    public final String title;
    public final long urlListindex;

    /**
     * Stores all attributes of the entry.
     *
     * @param mimeType       stored in {@link #mimetype}
     * @param namespace      stored in {@link #namespace}
     * @param cluster_number stored in {@link #cluster_number}
     * @param url            stored in {@link #url}
     * @param title          stored in {@link #title}
     * @param index          stored in {@link #urlListindex}
     */
    public DirectoryEntry(
            final int mimeType, final char namespace,
            final int cluster_number,
            final String url, final String title,
            final long index) {
        this.urlListindex = index;
        this.title = title;
        this.url = url;
        this.cluster_number = cluster_number;
        this.namespace = namespace;
        this.mimetype = mimeType;
    }
}
/**
 * Directory entry that refers to a blob inside a cluster.
 *
 * The cluster number is kept only in the inherited
 * {@code DirectoryEntry.cluster_number} field. This class used to declare a second
 * field with the same name that hid the inherited one while holding the same value.
 * Accesses such as {@code article.cluster_number} keep working because they now
 * resolve to the inherited field.
 */
public static class ArticleEntry extends DirectoryEntry {

    /** Blob number given at construction time. */
    public final int blob_number;

    /**
     * Creates an article entry.
     *
     * @param mimeType       forwarded to the {@code DirectoryEntry} constructor
     * @param namespace      forwarded to the {@code DirectoryEntry} constructor
     * @param cluster_number forwarded to the {@code DirectoryEntry} constructor,
     *                       readable afterwards as {@code cluster_number}
     * @param blob_number    stored in {@link #blob_number}
     * @param url            forwarded to the {@code DirectoryEntry} constructor
     * @param title          forwarded to the {@code DirectoryEntry} constructor
     * @param urlListindex   forwarded to the {@code DirectoryEntry} constructor
     */
    public ArticleEntry(
            final int mimeType, final char namespace,
            final int cluster_number, final int blob_number,
            final String url, final String title,
            final long urlListindex) {
        super(mimeType, namespace, cluster_number, url, title, urlListindex);
        this.blob_number = blob_number;
    }
}
/**
 * Directory entry that carries a redirect index instead of a cluster/blob pair.
 * The base class is always given a cluster number of 0.
 */
public static class RedirectEntry extends DirectoryEntry {

    /** Redirect index given at construction time. */
    public final long redirect_index;

    /**
     * Creates a redirect entry.
     *
     * @param mimeType       forwarded to the {@code DirectoryEntry} constructor
     * @param namespace      forwarded to the {@code DirectoryEntry} constructor
     * @param redirect_index stored in {@link #redirect_index}
     * @param url            forwarded to the {@code DirectoryEntry} constructor
     * @param title          forwarded to the {@code DirectoryEntry} constructor
     * @param urlListindex   forwarded to the {@code DirectoryEntry} constructor
     */
    public RedirectEntry(final int mimeType, final char namespace,
            final long redirect_index, final String url, final String title,
            final long urlListindex) {
        // the cluster number slot of the base class is filled with 0
        super(mimeType, namespace, 0, url, title, urlListindex);
        this.redirect_index = redirect_index;
    }
}
public ZIMReader(final ZIMFile file) { public ZIMReader(final ZIMFile file) {
this.mFile = file; this.mFile = file;
try { try {
this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile( this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this.mFile, "r"));
this.mFile, "r"));
} catch (final FileNotFoundException e) { } catch (final FileNotFoundException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
/**
 * Gives access to the ZIM file this reader was created for.
 *
 * @return the ZIMFile instance handed to the constructor
 */
public ZIMFile getZIMFile() {
    return mFile;
}
// get a URL list that is sorted by the urls
public List<String> getURLListByURL() throws IOException { public List<String> getURLListByURL() throws IOException {
int i = 0, pos, mimeType; int i = 0, mimeType;
final byte[] buffer = new byte[8]; final byte[] buffer = new byte[8];
@ -58,12 +121,12 @@ public class ZIMReader {
final ArrayList<String> returnList = new ArrayList<>(); final ArrayList<String> returnList = new ArrayList<>();
// Move to the spot where URL's are listed // Move to the spot where URL's are listed
this.mReader.seek(this.mFile.getUrlPtrPos()); this.mReader.seek(this.mFile.header_urlPtrPos);
for (i = 0; i < this.mFile.getArticleCount(); i++) { for (i = 0; i < this.mFile.header_entryCount; i++) {
// The position of URL i // The position of URL i
pos = this.mReader.readEightLittleEndianBytesValue(buffer); long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
// Mark the current position that we need to return to // Mark the current position that we need to return to
this.mReader.mark(); this.mReader.mark();
@ -72,14 +135,14 @@ public class ZIMReader {
this.mReader.seek(pos); this.mReader.seek(pos);
// Article or Redirect entry? // Article or Redirect entry?
mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer); mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
if (mimeType == 65535) { if (mimeType == 65535) {
this.mReader.seek(pos + 12); this.mReader.seek(pos + 12);
returnList.add(this.mReader.readString()); returnList.add(this.mReader.readZeroTerminatedString());
} else { } else {
this.mReader.seek(pos + 16); this.mReader.seek(pos + 16);
returnList.add(this.mReader.readString()); returnList.add(this.mReader.readZeroTerminatedString());
} }
this.mReader.reset(); this.mReader.reset();
@ -88,9 +151,10 @@ public class ZIMReader {
return returnList; return returnList;
} }
// get a URL list that is sorted by the entry titles
public List<String> getURLListByTitle() throws IOException { public List<String> getURLListByTitle() throws IOException {
int i = 0, pos, mimeType, articleNumber, urlPtrPos; int i = 0, mimeType, articleNumber;
final byte[] buffer = new byte[8]; final byte[] buffer = new byte[8];
@ -98,35 +162,35 @@ public class ZIMReader {
final ArrayList<String> returnList = new ArrayList<>(); final ArrayList<String> returnList = new ArrayList<>();
// Get the UrlPtrPos or one time storage // Get the UrlPtrPos or one time storage
urlPtrPos = this.mFile.getUrlPtrPos(); long urlPtrPos = this.mFile.header_urlPtrPos;
// Move to the spot where URL's are listed // Move to the spot where URL's are listed
this.mReader.seek(this.mFile.getTitlePtrPos()); this.mReader.seek(this.mFile.header_titlePtrPos);
for (i = 0; i < this.mFile.getArticleCount(); i++) { for (i = 0; i < this.mFile.header_entryCount; i++) {
// The articleNumber of the position of URL i // The articleNumber of the position of URL i
articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer); articleNumber = this.mReader.readFourLittleEndianBytesInt(buffer);
// Mark the current position that we need to return to // Mark the current position that we need to return to
this.mReader.mark(); this.mReader.mark();
this.mReader.seek(urlPtrPos + (8 * (articleNumber))); this.mReader.seek(urlPtrPos + (8L * (articleNumber)));
// The position of URL i // The position of URL i
pos = this.mReader.readEightLittleEndianBytesValue(buffer); long pos = this.mReader.readEightLittleEndianBytesLong(buffer);
this.mReader.seek(pos); this.mReader.seek(pos);
// Article or Redirect entry? // Article or Redirect entry?
mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer); mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer);
if (mimeType == 65535) { if (mimeType == 65535) {
this.mReader.seek(pos + 12); this.mReader.seek(pos + 12);
final String url = this.mReader.readString(); final String url = this.mReader.readZeroTerminatedString();
returnList.add(url); returnList.add(url);
} else { } else {
this.mReader.seek(pos + 16); this.mReader.seek(pos + 16);
final String url = this.mReader.readString(); final String url = this.mReader.readZeroTerminatedString();
returnList.add(url); returnList.add(url);
} }
@ -137,14 +201,69 @@ public class ZIMReader {
return returnList; return returnList;
} }
/**
 * Reads the directory entry that one slot of the title pointer list refers to.
 *
 * The slot holds a 4-byte index into the URL pointer list. The URL pointer list
 * slot in turn holds the 8-byte file offset of the directory entry itself.
 *
 * @param position seek position of the title's slot in the title pointer list
 * @return a RedirectEntry if the entry's type field is 0xffff, otherwise an ArticleEntry;
 *         in both cases an empty title is replaced by the url
 * @throws IOException if seeking or reading fails
 */
private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException {

    // Scratch buffer, large enough for the widest value read here (8 bytes)
    final byte[] buffer = new byte[8];

    // At the appropriate position in the titlePtrPos
    this.mReader.seek(position);

    // Get value of article at index
    final int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt(buffer);

    // Move to the position in urlPtrPos. The multiplication must happen in long
    // arithmetic: 8 * int would overflow for large indexes before the addition.
    this.mReader.seek(this.mFile.header_urlPtrPos + 8L * pointer_to_the_URL_pointer);

    // Get value of article in urlPtrPos
    final long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong(buffer);

    // Go to the location of the directory entry
    this.mReader.seek(pointer_to_the_directory_entry);

    // read the Content Entry
    final int type = this.mReader.readTwoLittleEndianBytesInt(buffer); // 2, 0xffff for redirect
    this.mReader.read(); // 1, ignore, parameter length not used
    final char namespace = (char) this.mReader.read(); // 1
    this.mReader.readFourLittleEndianBytesInt(buffer); // 4, ignore, revision not used

    // NOTE(review): position is a seek position inside the title pointer list, yet the
    // index is computed relative to header_urlPtrPos; pointer_to_the_URL_pointer may be
    // the intended value -- confirm before changing. Kept as in the original.
    final long urlListindex = (position - this.mFile.header_urlPtrPos) / 8;

    // Article or Redirect entry
    if (type == 65535) {
        final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
        final String url = this.mReader.readZeroTerminatedString(); // zero terminated
        final String title = this.mReader.readZeroTerminatedString(); // zero terminated
        return new RedirectEntry(type, namespace, redirectIndex,
                url, title.equals("") ? url : title, urlListindex);
    }

    final int cluster_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
    final int blob_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4
    final String url = this.mReader.readZeroTerminatedString(); // zero terminated
    final String title = this.mReader.readZeroTerminatedString(); // zero terminated
    return new ArticleEntry(
            type, namespace,
            cluster_number, blob_number,
            url, title.equals("") ? url : title, urlListindex);
}
/**
 * Reads the directory entry for the given slot number of the title pointer list.
 *
 * @param entryNumber zero-based slot number; must be non-negative and smaller than
 *                    the entry count of the file header
 * @return the directory entry referenced by that slot
 * @throws IOException if entryNumber is out of range or reading fails
 */
public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException {
    // a negative number would address bytes in front of the title pointer list
    if (entryNumber < 0) throw new IOException("entryNumber " + entryNumber + " is negative");
    if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount");
    // title pointer list slots are 4 bytes wide; multiply in long arithmetic so that
    // large entry numbers cannot overflow the int range before the addition
    return getDirectoryInfoAtTitlePosition(this.mFile.header_titlePtrPos + 4L * entryNumber);
}
// Gives the minimum required information needed for the given articleName // Gives the minimum required information needed for the given articleName
public DirectoryEntry getDirectoryInfo(String articleName, final char namespace) // This makes a binary search on the article name entry list.
throws IOException { public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException {
DirectoryEntry entry; DirectoryEntry entry;
String cmpStr; String cmpStr;
final int numberOfArticles = this.mFile.getArticleCount(); final int numberOfArticles = this.mFile.header_entryCount;
int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid; long beg = this.mFile.header_titlePtrPos, end = beg + (numberOfArticles * 4), mid;
articleName = namespace + "/" + articleName; articleName = namespace + "/" + articleName;
@ -154,7 +273,7 @@ public class ZIMReader {
if (entry == null) { if (entry == null) {
return null; return null;
} }
cmpStr = entry.getNamespace() + "/" + entry.getUrl(); cmpStr = entry.namespace + "/" + entry.url;
if (articleName.compareTo(cmpStr) < 0) { if (articleName.compareTo(cmpStr) < 0) {
end = mid - 4; end = mid - 4;
@ -167,242 +286,130 @@ public class ZIMReader {
} }
return null; return null;
} }
public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException { public ByteArrayOutputStream getArticleData(final DirectoryEntry directoryInfo) throws IOException {
// search in the cache first, if not found, then call getDirectoryInfo(articleName)
byte[] buffer = new byte[8];
final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace);
if (mainEntry != null) {
// Check what kind of an entry was mainEnrty
if (mainEntry.getClass() == ArticleEntry.class) {
// Cast to ArticleEntry
final ArticleEntry article = (ArticleEntry) mainEntry;
// Get the cluster and blob numbers from the article
final int clusterNumber = article.getClusterNumber();
final int blobNumber = article.getBlobnumber();
// Move to the cluster entry in the clusterPtrPos
this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8);
// Read the location of the cluster
final int clusterPos = this.mReader
.readEightLittleEndianBytesValue(buffer);
// Move to the cluster
this.mReader.seek(clusterPos);
// Read the first byte, for compression information // fail fast
final int compressionType = this.mReader.read(); if (directoryInfo == null) return null;
if (directoryInfo.getClass() != ArticleEntry.class) return null;
// Reference declaration // This is now an article, so thus we can cast to ArticleEntry
SingleXZInputStream xzReader = null; final ArticleEntry article = (ArticleEntry) directoryInfo;
int firstOffset, numberOfBlobs, offset1,
offset2,
location,
differenceOffset;
ByteArrayOutputStream baos; // Move to the cluster entry in the clusterPtrPos
this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8);
// Check the compression type that was read // Read the location of the cluster
switch (compressionType) { byte[] buffer = new byte[8];
final long clusterPos = this.mReader.readEightLittleEndianBytesLong(buffer);
// TODO: Read uncompressed data directly
case 0:
case 1:
// Read the first 4 bytes to find out the number of artciles
buffer = new byte[4];
// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
// Read the first offset
this.mReader.read(buffer);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);
// The number of blobs
numberOfBlobs = firstOffset / 4;
// The blobNumber has to be lesser than the numberOfBlobs
assert blobNumber < numberOfBlobs;
if (blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (blobNumber - 1) * 4;
Utilities.skipFully(this.mReader, location);
this.mReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
this.mReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(this.mReader,
(offset1 - 4 * (blobNumber + 2)));
this.mReader.read(buffer, 0, differenceOffset);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
return baos;
// LZMA2 compressed data
case 4:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
// NOTE(review): the second constructor argument is the decoder's memory usage
// limit in KiB (4194304 KiB = 4 GiB), not a 40MiB dictionary size
xzReader = new SingleXZInputStream(this.mReader, 4194304);
// Read the first offset
xzReader.read(buffer);
// The first four bytes are the offset of the zeroth blob // Move to the cluster
firstOffset = Utilities this.mReader.seek(clusterPos);
.toFourLittleEndianInteger(buffer);
// The number of blobs // Read the first byte, for compression information
numberOfBlobs = firstOffset / 4; final int compressionType = this.mReader.read();
// The blobNumber has to be lesser than the numberOfBlobs // Reference declaration
assert blobNumber < numberOfBlobs; SingleXZInputStream xzReader = null;
int firstOffset, numberOfBlobs, offset1,
offset2,
location,
differenceOffset;
if(blobNumber == 0) { ByteArrayOutputStream baos;
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (blobNumber - 1) * 4; // Check the compression type that was read
Utilities.skipFully(xzReader, location); switch (compressionType) {
xzReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
xzReader.read(buffer); // TODO: Read uncompressed data directly
offset2 = Utilities.toFourLittleEndianInteger(buffer); case 0:
case 1:
differenceOffset = offset2 - offset1; // Read the first 4 bytes to find out the number of articles
buffer = new byte[differenceOffset]; buffer = new byte[4];
Utilities.skipFully(xzReader, // Create a dictionary with size 40MiB, the zimlib uses this
(offset1 - 4 * (blobNumber + 2))); // size while creating
xzReader.read(buffer, 0, differenceOffset); // Read the first offset
this.mReader.read(buffer);
baos = new ByteArrayOutputStream(); // The first four bytes are the offset of the zeroth blob
baos.write(buffer, 0, differenceOffset); firstOffset = Utilities.toFourLittleEndianInteger(buffer);
return baos; // The number of blobs
numberOfBlobs = firstOffset / 4;
} // The blobNumber has to be lesser than the numberOfBlobs
assert article.blob_number < numberOfBlobs;
if (article.blob_number == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (article.blob_number - 1) * 4;
Utilities.skipFully(this.mReader, location);
this.mReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
} }
}
return null;
}
public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position)
throws IOException {
// Helpers
int pos;
final byte[] buffer = new byte[8];
// At the appropriate position in the titlePtrPos
this.mReader.seek(position);
// Get value of article at index
pos = this.mReader.readFourLittleEndianBytesValue(buffer);
// Move to the position in urlPtrPos
this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos);
// Get value of article in urlPtrPos
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
// Go to the location of the directory entry
this.mReader.seek(pos);
final int type = this.mReader.readTwoLittleEndianBytesValue(buffer);
// Ignore the parameter length
this.mReader.read();
final char namespace = (char) this.mReader.read();
// System.out.println("Namespace: " + namespace);
final int revision = this.mReader.readFourLittleEndianBytesValue(buffer); this.mReader.read(buffer);
// System.out.println("Revision: " + revision); offset2 = Utilities.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
this.mReader.read(buffer, 0, differenceOffset);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
// TODO: Remove redundant if condition code return baos;
// Article or Redirect entry
if (type == 65535) {
// System.out.println("MIMEType: " + type);
final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer);
// System.out.println("RedirectIndex: " + redirectIndex);
final String url = this.mReader.readString(); // 2 for zlib and 3 for bzip2 (removed)
// System.out.println("URL: " + url);
String title = this.mReader.readString(); // LZMA2 compressed data
title = title.equals("") ? url : title; case 4:
// System.out.println("Title: " + title);
return new RedirectEntry(type, namespace, revision, redirectIndex,
url, title, (position - this.mFile.getUrlPtrPos()) / 8);
} else { // Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
// System.out.println("MIMEType: " + mFile.getMIMEType(type)); // NOTE(review): the second constructor argument is the decoder's memory usage limit in KiB (4194304 KiB = 4 GiB), not a 40MiB dictionary size
xzReader = new SingleXZInputStream(this.mReader, 4194304);
final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer); // Read the first offset
// System.out.println("Cluster Number: " + clusterNumber); xzReader.read(buffer);
final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer); // The first four bytes are the offset of the zeroth blob
// System.out.println("Blob Number: " + blobNumber); firstOffset = Utilities.toFourLittleEndianInteger(buffer);
final String url = this.mReader.readString(); // The number of blobs
// System.out.println("URL: " + url); numberOfBlobs = firstOffset / 4;
String title = this.mReader.readString(); // The blobNumber has to be lesser than the numberOfBlobs
title = title.equals("") ? url : title; assert article.blob_number < numberOfBlobs;
// System.out.println("Title: " + title); if (article.blob_number == 0) {
// The first offset is what we read earlier
// Parameter data ignored offset1 = firstOffset;
} else {
location = (article.blob_number - 1) * 4;
Utilities.skipFully(xzReader, location);
xzReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
return new ArticleEntry(type, namespace, revision, clusterNumber, xzReader.read(buffer);
blobNumber, url, title, offset2 = Utilities.toFourLittleEndianInteger(buffer);
(position - this.mFile.getUrlPtrPos()) / 8); differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
xzReader.read(buffer, 0, differenceOffset);
baos = new ByteArrayOutputStream();
baos.write(buffer, 0, differenceOffset);
return baos;
// case 5: zstd compressed (missing!)
default:
return null;
} }
} }
/**
 * Returns the {@code ZIMFile} stored in this reader's {@code mFile} field.
 *
 * @return the value of {@code this.mFile}
 */
public ZIMFile getZIMFile() {
return this.mFile;
}
} }

@ -18,27 +18,49 @@
package org.openzim; package org.openzim;
import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.openzim.ZIMReader.DirectoryEntry;
public class ZIMTest { public class ZIMTest {
public static void main(final String[] args) { public static void main(final String[] args) {
if(args.length!=2) { if(args.length!=1) {
System.out.println("Usage: java ZIMTest <ZIM_FILE> <ARTICLE_NAME>"); System.out.println("Usage: java ZIMTest <ZIM_FILE>");
System.exit(0); System.exit(0);
} }
// args[0] is the Zim File's location try {
final ZIMFile file = new ZIMFile(args[0]); // args[0] is the Zim File's location
final ZIMFile file = new ZIMFile(args[0]);
// Associate the Zim File with a Reader
final ZIMReader zReader = new ZIMReader(file);
// Associate the Zim File with a Reader // print a list of urls and titles
final ZIMReader zReader = new ZIMReader(file); final List<String> urls = zReader.getURLListByURL();
final List<String> titles = zReader.getURLListByTitle();
int c = Math.min(10, titles.size());
for (int i = 0; i < c; i++) {
System.out.println("URL by URL " + i + ": " + urls.get(i));
System.out.println("URL by Title " + i + ": " + titles.get(i));
DirectoryEntry entry = zReader.getDirectoryInfo(i);
System.out.println("URL by Pos " + i + ": " + entry.url);
System.out.println("Title by Pos " + i + ": " + entry.title);
System.out.println("Namespace by Pos " + i + ": " + entry.namespace);
}
try { // print article c-1
// args[1] is the name of the articles that is DirectoryEntry directory_entry = zReader.getDirectoryInfo(c - 1);
// to be fetched ByteArrayOutputStream articleStream = zReader.getArticleData(directory_entry);
System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8")); String article = articleStream == null ? "NULL" : articleStream.toString(StandardCharsets.UTF_8.name());
System.out.println(article);
} catch (final IOException e) { } catch (final IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
} }

Loading…
Cancel
Save