From c2b6b6e7b99431cad4dc2fd1a9d0331c80f4619b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 27 Oct 2023 15:49:23 +0200 Subject: [PATCH] Fixed a large number of problems in the ZIM reader. This library was not prepared for large data because it was missing long data types for pointers. I had to modify the code-base in a fundamental way: - Proof-Reading, - unclustering, - refactoring, - naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format, - change of Exception handling, - extension to more attributes as defined in spec (bugfix for mime type loading) - bugfix to long parsing (prevented reading of large files) The code is furthermore very inefficient and requires more attention. However the format is very useful for YaCy as there are numerous data sources for ZIM-Files. --- source/org/openzim/ArticleEntry.java | 46 -- source/org/openzim/DirectoryEntry.java | 69 --- .../RandomAcessFileZIMInputStream.java | 19 +- source/org/openzim/RedirectEntry.java | 37 -- source/org/openzim/Utilities.java | 46 +- source/org/openzim/ZIMFile.java | 204 +++----- source/org/openzim/ZIMReader.java | 467 +++++++++--------- source/org/openzim/ZIMTest.java | 42 +- 8 files changed, 357 insertions(+), 573 deletions(-) delete mode 100644 source/org/openzim/ArticleEntry.java delete mode 100644 source/org/openzim/DirectoryEntry.java delete mode 100644 source/org/openzim/RedirectEntry.java diff --git a/source/org/openzim/ArticleEntry.java b/source/org/openzim/ArticleEntry.java deleted file mode 100644 index 7eeae2e06..000000000 --- a/source/org/openzim/ArticleEntry.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (C) 2011 Arunesh Mathur - * - * This file is a part of zimreader-java. - * - * zimreader-java is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License version 3.0 as - * published by the Free Software Foundation. 
- * - * zimreader-java is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with zimreader-java. If not, see . - */ - - -package org.openzim; - -public class ArticleEntry extends DirectoryEntry { - - int clusterNumber; - - int blobnumber; - - public ArticleEntry(final int mimeType, final char namespace, final int revision, - final int clusterNumber, final int blobNumber, final String url, final String title, - final int urlListindex) { - - super(mimeType, namespace, revision, url, title, urlListindex); - - this.clusterNumber = clusterNumber; - this.blobnumber = blobNumber; - } - - public int getClusterNumber() { - return this.clusterNumber; - } - - public int getBlobnumber() { - return this.blobnumber; - } - -} diff --git a/source/org/openzim/DirectoryEntry.java b/source/org/openzim/DirectoryEntry.java deleted file mode 100644 index 92c52de41..000000000 --- a/source/org/openzim/DirectoryEntry.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2011 Arunesh Mathur - * - * This file is a part of zimreader-java. - * - * zimreader-java is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License version 3.0 as - * published by the Free Software Foundation. - * - * zimreader-java is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with zimreader-java. If not, see . 
- */ - -package org.openzim; - -public abstract class DirectoryEntry { - - int mimeType; - - char namespace; - - int revision; - - String url; - - String title; - - int urlListindex; - - public DirectoryEntry(final int mimeType, final char namespace, final int revision, - final String url, final String title, final int index) { - this.mimeType = mimeType; - this.namespace = namespace; - this.revision = revision; - this.url = url; - this.title = title; - this.urlListindex = index; - } - - public int getMimeType() { - return this.mimeType; - } - - public char getNamespace() { - return this.namespace; - } - - public int getRevision() { - return this.revision; - } - - public String getUrl() { - return this.url; - } - - public String getTitle() { - return this.title; - } - - public int getUrlListindex() { - return this.urlListindex; - } - -} diff --git a/source/org/openzim/RandomAcessFileZIMInputStream.java b/source/org/openzim/RandomAcessFileZIMInputStream.java index 006dd4498..cb6cdb093 100644 --- a/source/org/openzim/RandomAcessFileZIMInputStream.java +++ b/source/org/openzim/RandomAcessFileZIMInputStream.java @@ -28,6 +28,8 @@ import java.io.RandomAccessFile; * implementation, can be improved. 
* * @author Arunesh Mathur + * @author Michael Christen + * bugfix to long parsing (return value was int) */ public class RandomAcessFileZIMInputStream extends InputStream { @@ -41,7 +43,7 @@ public class RandomAcessFileZIMInputStream extends InputStream { } // TODO: Remove the parameter buffer - public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException { + public int readTwoLittleEndianBytesInt(final byte[] buffer) throws IOException { if (buffer.length < 2) { throw new OutOfMemoryError("buffer too small"); } else { @@ -51,7 +53,7 @@ public class RandomAcessFileZIMInputStream extends InputStream { } // TODO: Remove the parameter buffer - public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException { + public int readFourLittleEndianBytesInt(final byte[] buffer) throws IOException { if (buffer.length < 4) { throw new OutOfMemoryError("buffer too small"); } else { @@ -61,30 +63,30 @@ public class RandomAcessFileZIMInputStream extends InputStream { } // TODO: Remove the parameter buffer - public int readEightLittleEndianBytesValue(final byte[] buffer) + public long readEightLittleEndianBytesLong(final byte[] buffer) throws IOException { if (buffer.length < 8) { throw new OutOfMemoryError("buffer too small"); } else { this.mRAFReader.read(buffer, 0, 8); - return Utilities.toEightLittleEndianInteger(buffer); + return Utilities.toEightLittleEndianLong(buffer); } } // TODO: Remove the parameter buffer - public int readSixteenLittleEndianBytesValue(final byte[] buffer) + public long readSixteenLittleEndianBytesLong(final byte[] buffer) throws IOException { if (buffer.length < 16) { throw new OutOfMemoryError("buffer too small"); } else { this.mRAFReader.read(buffer, 0, 16); - return Utilities.toSixteenLittleEndianInteger(buffer); + return Utilities.toSixteenLittleEndianLong(buffer); } } // Reads characters from the current position into a String and stops when a // '\0' is encountered - public String readString() throws 
IOException { + public String readZeroTerminatedString() throws IOException { final StringBuffer sb = new StringBuffer(); /* * int i; byte[] buffer = new byte[100]; while (true) { @@ -92,8 +94,7 @@ public class RandomAcessFileZIMInputStream extends InputStream { * (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i * != buffer.length) break; } return sb.toString(); */ - int b; - b = this.mRAFReader.read(); + int b = this.mRAFReader.read(); while (b != '\0') { sb.append((char) b); b = this.mRAFReader.read(); diff --git a/source/org/openzim/RedirectEntry.java b/source/org/openzim/RedirectEntry.java deleted file mode 100644 index fdbe3fba1..000000000 --- a/source/org/openzim/RedirectEntry.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (C) 2011 Arunesh Mathur - * - * This file is a part of zimreader-java. - * - * zimreader-java is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License version 3.0 as - * published by the Free Software Foundation. - * - * zimreader-java is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with zimreader-java. If not, see . 
- */ - -package org.openzim; - -public class RedirectEntry extends DirectoryEntry { - - int redirectIndex; - - public RedirectEntry(final int mimeType, final char namespace, final int revision, - final int redirectIndex, final String url, final String title, final int urlListindex) { - - super(mimeType, namespace, revision, url, title, urlListindex); - - this.redirectIndex = redirectIndex; - } - - public int getRedirectIndex() { - return this.redirectIndex; - } - -} diff --git a/source/org/openzim/Utilities.java b/source/org/openzim/Utilities.java index 0de337c9c..28572839b 100644 --- a/source/org/openzim/Utilities.java +++ b/source/org/openzim/Utilities.java @@ -22,18 +22,21 @@ package org.openzim; import java.io.IOException; import java.io.InputStream; +/** + * @author Arunesh Mathur + * A ZIM file implementation that stores the Header and the MIMETypeList + * + * @author Michael Christen + * int/long bugfix (did reading of long values with int variables, causing negative offsets) + */ public class Utilities { - // TODO: Write a binary search algorithm - public static int binarySearch() { - return -1; - } - public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException { if (buffer.length < 2) { throw new OutOfMemoryError("buffer too small"); } else { - final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)); + final int result = + ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)); return result; } } @@ -42,39 +45,28 @@ public class Utilities { if (buffer.length < 4) { throw new OutOfMemoryError("buffer too small"); } else { - final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) + final int result = + ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)); return result; } } - public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException { + public static long toEightLittleEndianLong(final byte[] buffer) throws IOException { if (buffer.length < 
8) { throw new OutOfMemoryError("buffer too small"); } else { - final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) - | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24) - | ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40) - | ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56)); + final long result = // cast to long required otherwise this is again an integer + ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8) + | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24) + | ((long)(buffer[4] & 0xFF) << 32) | ((long)(buffer[5] & 0xFF) << 40) + | ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56)); return result; } } - public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException { - if (buffer.length < 16) { - throw new OutOfMemoryError("buffer too small"); - } else { - final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) - | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24) - | ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40) - | ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56) - | ((buffer[8] & 0xFF) << 64) | ((buffer[9] & 0xFF) << 72) - | ((buffer[10] & 0xFF) << 80) | ((buffer[11] & 0xFF) << 88) - | ((buffer[12] & 0xFF) << 96) - | ((buffer[13] & 0xFF) << 104) - | ((buffer[14] & 0xFF) << 112) | ((buffer[15] & 0xFF) << 120)); - return result; - } + public static long toSixteenLittleEndianLong(final byte[] buffer) throws IOException { + return toEightLittleEndianLong(buffer); // there are no sixteen bytes long values } public static void skipFully(final InputStream stream, final long bytes) throws IOException { diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index c86119be1..56e84ad17 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -20,46 +20,47 @@ package org.openzim; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import 
java.io.RandomAccessFile; import java.util.ArrayList; import java.util.List; /** * @author Arunesh Mathur - * * A ZIM file implementation that stores the Header and the MIMETypeList * + * @author Michael Christen + * Proof-Reading, unclustering, refactoring, + * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format, + * change of Exception handling, + * extension to more attributes as defined in spec (bugfix for mime type loading) + * int/long bugfix (did reading of long values with int variables, causing negative offsets) */ public class ZIMFile extends File { - /** - * - */ private static final long serialVersionUID = 1L; - private Header mHeader; - - private List mMIMETypeList; // Can be removed if not needed - - public ZIMFile(final String path) { + // Header values + public final int header_magicNumber; + public final int header_majorVersion; + public final int header_minorVersion; + public final long header_uuid; + public final int header_entryCount; + public final int header_clusterCount; + public final long header_urlPtrPos; + public final long header_titlePtrPos; + public final long header_clusterPtrPos; + public final long header_mimeListPos; + public final int header_mainPage; + public final int header_layoutPage; + public final long header_checksumPos; + + // content cache + public final List mimeList; + + public ZIMFile(final String path) throws IOException { super(path); - try { - readHeader(); - } catch (final FileNotFoundException e) { - e.printStackTrace(); - } - } - - private void readHeader() throws FileNotFoundException { - - // Helpers - int len = 0; - StringBuffer mimeBuffer = null; - - // The byte[] that will help us in reading bytes out of the file - final byte[] buffer = new byte[16]; - // Check whether the file exists if (!(this.exists())) { throw new FileNotFoundException( @@ -67,132 +68,45 @@ public class ZIMFile extends File { } // The reader that will be used to read contents from the file - - final 
RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream( - new RandomAccessFile(this, "r")); - - // The ZIM file header - this.mHeader = new Header(); + final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this, "r")); + final byte[] buffer = new byte[16]; // Read the contents of the header - try { - this.mHeader.magicNumber = reader.readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.magicNumber); - - this.mHeader.version = reader.readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.version); - - this.mHeader.uuid = reader.readSixteenLittleEndianBytesValue(buffer); - // System.out.println(mHeader.uuid); reader.read(buffer, 0, 4); - - this.mHeader.articleCount = reader - .readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.articleCount); - - this.mHeader.clusterCount = reader - .readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.clusterCount); - - this.mHeader.urlPtrPos = reader.readEightLittleEndianBytesValue(buffer); - // System.out.println(mHeader.urlPtrPos); - - this.mHeader.titlePtrPos = reader - .readEightLittleEndianBytesValue(buffer); - // System.out.println(mHeader.titlePtrPos); - - this.mHeader.clusterPtrPos = reader - .readEightLittleEndianBytesValue(buffer); - // System.out.println(mHeader.clusterPtrPos); - - this.mHeader.mimeListPos = reader - .readEightLittleEndianBytesValue(buffer); - // System.out.println(mHeader.mimeListPos); - - this.mHeader.mainPage = reader.readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.mainPage); - - this.mHeader.layoutPage = reader.readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.layoutPage); - - // Initialise the MIMETypeList - this.mMIMETypeList = new ArrayList<>(); - while (true) { + this.header_magicNumber = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_majorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2 
+ this.header_minorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2 + this.header_uuid = reader.readSixteenLittleEndianBytesLong(buffer); // 16 + this.header_entryCount = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_clusterCount = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 + this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 + this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 + this.header_mimeListPos = reader.readEightLittleEndianBytesLong(buffer); // 8 + this.header_mainPage = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_layoutPage = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_checksumPos = reader.readEightLittleEndianBytesLong(buffer); // 8 [FIX!] + + // Initialise the MIMETypeList + int len = 0; + StringBuffer mimeBuffer = null; + this.mimeList = new ArrayList<>(); + while (true) { + reader.read(buffer, 0, 1); // read only one byte to check if this is a zero + len = 0; + mimeBuffer = new StringBuffer(); + while (buffer[0] != '\0') { + mimeBuffer.append((char) buffer[0]); reader.read(buffer, 0, 1); - len = 0; - mimeBuffer = new StringBuffer(); - while (buffer[0] != '\0') { - mimeBuffer.append((char) buffer[0]); - reader.read(buffer, 0, 1); - len++; - } - if (len == 0) { - break; - } - this.mMIMETypeList.add(mimeBuffer.toString()); - // System.out.println(mimeBuffer); + len++; } - - } catch (final Exception e) { - e.printStackTrace(); + if (len == 0) { + break; + } + String mimeType = mimeBuffer.toString(); + System.out.println(mimeType); + this.mimeList.add(mimeType); } - } - - public int getVersion() { - return this.mHeader.version; - } - - public int getUuid() { - return this.mHeader.uuid; - } - - public int getArticleCount() { - return this.mHeader.articleCount; - } - - public int getClusterCount() { - return this.mHeader.clusterCount; - } - - public 
int getUrlPtrPos() { - return this.mHeader.urlPtrPos; - } - - public int getTitlePtrPos() { - return this.mHeader.titlePtrPos; - } - - public int getClusterPtrPos() { - return this.mHeader.clusterPtrPos; - } - - public String getMIMEType(final int mimeNumber) { - return this.mMIMETypeList.get(mimeNumber); - } - - public int getHeaderSize() { - return this.mHeader.mimeListPos; - } - - public int getMainPage() { - return this.mHeader.mainPage; - } - - public int getLayoutPage() { - return this.mHeader.layoutPage; - } - public class Header { - int magicNumber; - int version; - int uuid; - int articleCount; - int clusterCount; - int urlPtrPos; - int titlePtrPos; - int clusterPtrPos; - int mimeListPos; - int mainPage; - int layoutPage; } } diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index affd6ea6d..49d25c50c 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -29,28 +29,91 @@ import org.tukaani.xz.SingleXZInputStream; /** * @author Arunesh Mathur - * * A ZIMReader that reads data from the ZIMFile * + * @author Michael Christen + * Proof-Reading, unclustering, refactoring, + * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format, + * change of Exception handling, + * extension to more attributes as defined in spec (bugfix for mime type loading) + * bugfix to long parsing (prevented reading of large files) */ public class ZIMReader { private final ZIMFile mFile; private RandomAcessFileZIMInputStream mReader; + public static abstract class DirectoryEntry { + + public final int mimetype; + public final char namespace; + public final int cluster_number; + public final String url; + public final String title; + public final long urlListindex; + + public DirectoryEntry( + final int mimeType, final char namespace, + final int cluster_number, + final String url, final String title, + final long index) { + this.mimetype = mimeType; + this.namespace = namespace; + this.cluster_number = 
cluster_number; + this.url = url; + this.title = title; + this.urlListindex = index; + } + + } + + public static class ArticleEntry extends DirectoryEntry { + + public final int cluster_number; + public final int blob_number; + + public ArticleEntry( + final int mimeType, final char namespace, + final int cluster_number, final int blob_number, + final String url, final String title, + final long urlListindex) { + super(mimeType, namespace, cluster_number, url, title, urlListindex); + this.cluster_number = cluster_number; + this.blob_number = blob_number; + } + + } + + public static class RedirectEntry extends DirectoryEntry { + + public final long redirect_index; + + public RedirectEntry(final int mimeType, final char namespace, + final long redirect_index, final String url, final String title, + final long urlListindex) { + super(mimeType, namespace, 0, url, title, urlListindex); + this.redirect_index = redirect_index; + } + + } + public ZIMReader(final ZIMFile file) { this.mFile = file; try { - this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile( - this.mFile, "r")); + this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this.mFile, "r")); } catch (final FileNotFoundException e) { e.printStackTrace(); } } + public ZIMFile getZIMFile() { + return this.mFile; + } + + // get a URL list that is sorted by the urls public List getURLListByURL() throws IOException { - int i = 0, pos, mimeType; + int i = 0, mimeType; final byte[] buffer = new byte[8]; @@ -58,12 +121,12 @@ public class ZIMReader { final ArrayList returnList = new ArrayList<>(); // Move to the spot where URL's are listed - this.mReader.seek(this.mFile.getUrlPtrPos()); + this.mReader.seek(this.mFile.header_urlPtrPos); - for (i = 0; i < this.mFile.getArticleCount(); i++) { + for (i = 0; i < this.mFile.header_entryCount; i++) { // The position of URL i - pos = this.mReader.readEightLittleEndianBytesValue(buffer); + long pos = 
this.mReader.readEightLittleEndianBytesLong(buffer); // Mark the current position that we need to return to this.mReader.mark(); @@ -72,14 +135,14 @@ public class ZIMReader { this.mReader.seek(pos); // Article or Redirect entry? - mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer); + mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer); if (mimeType == 65535) { this.mReader.seek(pos + 12); - returnList.add(this.mReader.readString()); + returnList.add(this.mReader.readZeroTerminatedString()); } else { this.mReader.seek(pos + 16); - returnList.add(this.mReader.readString()); + returnList.add(this.mReader.readZeroTerminatedString()); } this.mReader.reset(); @@ -88,9 +151,10 @@ public class ZIMReader { return returnList; } + // get a URL list that is sorted by the entry titles public List getURLListByTitle() throws IOException { - int i = 0, pos, mimeType, articleNumber, urlPtrPos; + int i = 0, mimeType, articleNumber; final byte[] buffer = new byte[8]; @@ -98,35 +162,35 @@ public class ZIMReader { final ArrayList returnList = new ArrayList<>(); // Get the UrlPtrPos or one time storage - urlPtrPos = this.mFile.getUrlPtrPos(); + long urlPtrPos = this.mFile.header_urlPtrPos; // Move to the spot where URL's are listed - this.mReader.seek(this.mFile.getTitlePtrPos()); + this.mReader.seek(this.mFile.header_titlePtrPos); - for (i = 0; i < this.mFile.getArticleCount(); i++) { + for (i = 0; i < this.mFile.header_entryCount; i++) { // The articleNumber of the position of URL i - articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer); + articleNumber = this.mReader.readFourLittleEndianBytesInt(buffer); // Mark the current position that we need to return to this.mReader.mark(); - this.mReader.seek(urlPtrPos + (8 * (articleNumber))); + this.mReader.seek(urlPtrPos + (8L * (articleNumber))); // The position of URL i - pos = this.mReader.readEightLittleEndianBytesValue(buffer); + long pos = this.mReader.readEightLittleEndianBytesLong(buffer); 
this.mReader.seek(pos); // Article or Redirect entry? - mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer); + mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer); if (mimeType == 65535) { this.mReader.seek(pos + 12); - final String url = this.mReader.readString(); + final String url = this.mReader.readZeroTerminatedString(); returnList.add(url); } else { this.mReader.seek(pos + 16); - final String url = this.mReader.readString(); + final String url = this.mReader.readZeroTerminatedString(); returnList.add(url); } @@ -137,14 +201,69 @@ public class ZIMReader { return returnList; } + // position must be the seek position for the title in the Title Pointer List + private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException { + + // Helpers + final byte[] buffer = new byte[8]; + + // At the appropriate position in the titlePtrPos + this.mReader.seek(position); + + // Get value of article at index + int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt(buffer); + + // Move to the position in urlPtrPos + this.mReader.seek(this.mFile.header_urlPtrPos + 8 * pointer_to_the_URL_pointer); + + // Get value of article in urlPtrPos + long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong(buffer); + + // Go to the location of the directory entry + this.mReader.seek(pointer_to_the_directory_entry); + + // read the Content Entry + final int type = this.mReader.readTwoLittleEndianBytesInt(buffer); // 2, 0xffff for redirect + this.mReader.read(); // 1, ignore, parameter length not used + final char namespace = (char) this.mReader.read(); // 1 + this.mReader.readFourLittleEndianBytesInt(buffer); // 4, ignore, revision not used + + // Article or Redirect entry + if (type == 65535) { + final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(buffer); + final String url = this.mReader.readZeroTerminatedString(); + String title = this.mReader.readZeroTerminatedString(); + title 
= title.equals("") ? url : title; + return new RedirectEntry(type, namespace, redirectIndex, + url, title, (position - this.mFile.header_urlPtrPos) / 8); + } else { + final int cluster_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4 + final int blob_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4 + final String url = this.mReader.readZeroTerminatedString(); // zero terminated + String title = this.mReader.readZeroTerminatedString(); // zero terminated + title = title.equals("") ? url : title; + + return new ArticleEntry( + type, namespace, + cluster_number, blob_number, + url, title, (position - this.mFile.header_urlPtrPos) / 8); + } + + } + + public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException { + if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount"); + return getDirectoryInfoAtTitlePosition(this.mFile.header_titlePtrPos + 4 * entryNumber); + } + // Gives the minimum required information needed for the given articleName - public DirectoryEntry getDirectoryInfo(String articleName, final char namespace) - throws IOException { + // This makes a binary search on the article name entry list. 
+ public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException { DirectoryEntry entry; String cmpStr; - final int numberOfArticles = this.mFile.getArticleCount(); - int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid; + final int numberOfArticles = this.mFile.header_entryCount; + long beg = this.mFile.header_titlePtrPos, end = beg + (numberOfArticles * 4), mid; articleName = namespace + "/" + articleName; @@ -154,7 +273,7 @@ public class ZIMReader { if (entry == null) { return null; } - cmpStr = entry.getNamespace() + "/" + entry.getUrl(); + cmpStr = entry.namespace + "/" + entry.url; if (articleName.compareTo(cmpStr) < 0) { end = mid - 4; @@ -167,242 +286,130 @@ public class ZIMReader { } return null; - } - public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException { - - // search in the cache first, if not found, then call getDirectoryInfo(articleName) - - byte[] buffer = new byte[8]; - - final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace); - - if (mainEntry != null) { - - // Check what kind of an entry was mainEnrty - if (mainEntry.getClass() == ArticleEntry.class) { - - // Cast to ArticleEntry - final ArticleEntry article = (ArticleEntry) mainEntry; - - // Get the cluster and blob numbers from the article - final int clusterNumber = article.getClusterNumber(); - final int blobNumber = article.getBlobnumber(); - - // Move to the cluster entry in the clusterPtrPos - this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8); - - // Read the location of the cluster - final int clusterPos = this.mReader - .readEightLittleEndianBytesValue(buffer); - - // Move to the cluster - this.mReader.seek(clusterPos); + public ByteArrayOutputStream getArticleData(final DirectoryEntry directoryInfo) throws IOException { - // Read the first byte, for compression information - final int compressionType = this.mReader.read(); + // 
fail fast + if (directoryInfo == null) return null; + if (directoryInfo.getClass() != ArticleEntry.class) return null; - // Reference declaration - SingleXZInputStream xzReader = null; - int firstOffset, numberOfBlobs, offset1, - offset2, - location, - differenceOffset; + // This is now an article, so thus we can cast to ArticleEntry + final ArticleEntry article = (ArticleEntry) directoryInfo; - ByteArrayOutputStream baos; + // Move to the cluster entry in the clusterPtrPos + this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8); - // Check the compression type that was read - switch (compressionType) { - - // TODO: Read uncompressed data directly - case 0: - case 1: - - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - - // Read the first offset - this.mReader.read(buffer); - - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; - - - if (blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - - location = (blobNumber - 1) * 4; - Utilities.skipFully(this.mReader, location); - this.mReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } - - this.mReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); - - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - - Utilities.skipFully(this.mReader, - (offset1 - 4 * (blobNumber + 2))); - - this.mReader.read(buffer, 0, differenceOffset); - - baos = new ByteArrayOutputStream(); - baos.write(buffer, 0, differenceOffset); - - return baos; - - // LZMA2 compressed data - case 4: - - // Read the first 4 bytes to find out the number 
of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - xzReader = new SingleXZInputStream(this.mReader, 4194304); - - // Read the first offset - xzReader.read(buffer); + // Read the location of the cluster + byte[] buffer = new byte[8]; + final long clusterPos = this.mReader.readEightLittleEndianBytesLong(buffer); - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); + // Move to the cluster + this.mReader.seek(clusterPos); - // The number of blobs - numberOfBlobs = firstOffset / 4; + // Read the first byte, for compression information + final int compressionType = this.mReader.read(); - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; + // Reference declaration + SingleXZInputStream xzReader = null; + int firstOffset, numberOfBlobs, offset1, + offset2, + location, + differenceOffset; - if(blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { + ByteArrayOutputStream baos; - location = (blobNumber - 1) * 4; - Utilities.skipFully(xzReader, location); - xzReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } + // Check the compression type that was read + switch (compressionType) { - xzReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); + // TODO: Read uncompressed data directly + case 0: + case 1: - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; + // Read the first 4 bytes to find out the number of artciles + buffer = new byte[4]; - Utilities.skipFully(xzReader, - (offset1 - 4 * (blobNumber + 2))); + // Create a dictionary with size 40MiB, the zimlib uses this + // size while creating - xzReader.read(buffer, 0, differenceOffset); + // Read the first offset + this.mReader.read(buffer); - baos = new ByteArrayOutputStream(); - baos.write(buffer, 0, 
differenceOffset); + // The first four bytes are the offset of the zeroth blob + firstOffset = Utilities.toFourLittleEndianInteger(buffer); - return baos; + // The number of blobs + numberOfBlobs = firstOffset / 4; - } + // The blobNumber has to be lesser than the numberOfBlobs + assert article.blob_number < numberOfBlobs; + if (article.blob_number == 0) { + // The first offset is what we read earlier + offset1 = firstOffset; + } else { + location = (article.blob_number - 1) * 4; + Utilities.skipFully(this.mReader, location); + this.mReader.read(buffer); + offset1 = Utilities.toFourLittleEndianInteger(buffer); } - } - - return null; - - } - - public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position) - throws IOException { - - // Helpers - int pos; - final byte[] buffer = new byte[8]; - - // At the appropriate position in the titlePtrPos - this.mReader.seek(position); - - // Get value of article at index - pos = this.mReader.readFourLittleEndianBytesValue(buffer); - - // Move to the position in urlPtrPos - this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos); - - // Get value of article in urlPtrPos - pos = this.mReader.readEightLittleEndianBytesValue(buffer); - - // Go to the location of the directory entry - this.mReader.seek(pos); - - final int type = this.mReader.readTwoLittleEndianBytesValue(buffer); - - // Ignore the parameter length - this.mReader.read(); - - final char namespace = (char) this.mReader.read(); - // System.out.println("Namepsace: " + namespace); - final int revision = this.mReader.readFourLittleEndianBytesValue(buffer); - // System.out.println("Revision: " + revision); + this.mReader.read(buffer); + offset2 = Utilities.toFourLittleEndianInteger(buffer); + differenceOffset = offset2 - offset1; + buffer = new byte[differenceOffset]; + Utilities.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2))); + this.mReader.read(buffer, 0, differenceOffset); + baos = new ByteArrayOutputStream(); + baos.write(buffer, 0, 
differenceOffset); - // TODO: Remove redundant if condition code - // Article or Redirect entry - if (type == 65535) { - - // System.out.println("MIMEType: " + type); - - final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer); - // System.out.println("RedirectIndex: " + redirectIndex); + return baos; - final String url = this.mReader.readString(); - // System.out.println("URL: " + url); + // 2 for zlib and 3 for bzip2 (removed) - String title = this.mReader.readString(); - title = title.equals("") ? url : title; - // System.out.println("Title: " + title); - - return new RedirectEntry(type, namespace, revision, redirectIndex, - url, title, (position - this.mFile.getUrlPtrPos()) / 8); + // LZMA2 compressed data + case 4: - } else { + // Read the first 4 bytes to find out the number of artciles + buffer = new byte[4]; - // System.out.println("MIMEType: " + mFile.getMIMEType(type)); + // Create a dictionary with size 40MiB, the zimlib uses this size while creating + xzReader = new SingleXZInputStream(this.mReader, 4194304); - final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer); - // System.out.println("Cluster Number: " + clusterNumber); + // Read the first offset + xzReader.read(buffer); - final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer); - // System.out.println("Blob Number: " + blobNumber); + // The first four bytes are the offset of the zeroth blob + firstOffset = Utilities.toFourLittleEndianInteger(buffer); - final String url = this.mReader.readString(); - // System.out.println("URL: " + url); + // The number of blobs + numberOfBlobs = firstOffset / 4; - String title = this.mReader.readString(); - title = title.equals("") ? 
url : title; - // System.out.println("Title: " + title); - - // Parameter data ignored + // The blobNumber has to be lesser than the numberOfBlobs + assert article.blob_number < numberOfBlobs; + if (article.blob_number == 0) { + // The first offset is what we read earlier + offset1 = firstOffset; + } else { + location = (article.blob_number - 1) * 4; + Utilities.skipFully(xzReader, location); + xzReader.read(buffer); + offset1 = Utilities.toFourLittleEndianInteger(buffer); + } - return new ArticleEntry(type, namespace, revision, clusterNumber, - blobNumber, url, title, - (position - this.mFile.getUrlPtrPos()) / 8); + xzReader.read(buffer); + offset2 = Utilities.toFourLittleEndianInteger(buffer); + differenceOffset = offset2 - offset1; + buffer = new byte[differenceOffset]; + Utilities.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2))); + xzReader.read(buffer, 0, differenceOffset); + baos = new ByteArrayOutputStream(); + baos.write(buffer, 0, differenceOffset); + return baos; + + // case 5: zstd compressed (missing!) 
+ default: + return null; } - } - public ZIMFile getZIMFile() { - return this.mFile; - } } diff --git a/source/org/openzim/ZIMTest.java b/source/org/openzim/ZIMTest.java index 6d8ed64fb..24b9cf9be 100644 --- a/source/org/openzim/ZIMTest.java +++ b/source/org/openzim/ZIMTest.java @@ -18,27 +18,49 @@ package org.openzim; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.openzim.ZIMReader.DirectoryEntry; public class ZIMTest { + public static void main(final String[] args) { - if(args.length!=2) { - System.out.println("Usage: java ZIMTest "); + if(args.length!=1) { + System.out.println("Usage: java ZIMTest "); System.exit(0); } - // args[0] is the Zim File's location - final ZIMFile file = new ZIMFile(args[0]); + try { + // args[0] is the Zim File's location + final ZIMFile file = new ZIMFile(args[0]); + + // Associate the Zim File with a Reader + final ZIMReader zReader = new ZIMReader(file); - // Associate the Zim File with a Reader - final ZIMReader zReader = new ZIMReader(file); + // print a list of urls and titles + final List urls = zReader.getURLListByURL(); + final List titles = zReader.getURLListByTitle(); + int c = Math.min(10, titles.size()); + for (int i = 0; i < c; i++) { + System.out.println("URL by URL " + i + ": " + urls.get(i)); + System.out.println("URL by Title " + i + ": " + titles.get(i)); + DirectoryEntry entry = zReader.getDirectoryInfo(i); + System.out.println("URL by Pos " + i + ": " + entry.url); + System.out.println("Title by Pos " + i + ": " + entry.title); + System.out.println("Namespace by Pos " + i + ": " + entry.namespace); + } - try { - // args[1] is the name of the articles that is - // to be fetched - System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8")); + // print article c-1 + DirectoryEntry directory_entry = zReader.getDirectoryInfo(c - 1); + ByteArrayOutputStream articleStream = 
zReader.getArticleData(directory_entry); + String article = articleStream == null ? "NULL" : articleStream.toString(StandardCharsets.UTF_8.name()); + System.out.println(article); } catch (final IOException e) { e.printStackTrace(); } } + }