diff --git a/source/org/openzim/ArticleEntry.java b/source/org/openzim/ArticleEntry.java deleted file mode 100644 index 7eeae2e06..000000000 --- a/source/org/openzim/ArticleEntry.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (C) 2011 Arunesh Mathur - * - * This file is a part of zimreader-java. - * - * zimreader-java is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License version 3.0 as - * published by the Free Software Foundation. - * - * zimreader-java is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with zimreader-java. If not, see . - */ - - -package org.openzim; - -public class ArticleEntry extends DirectoryEntry { - - int clusterNumber; - - int blobnumber; - - public ArticleEntry(final int mimeType, final char namespace, final int revision, - final int clusterNumber, final int blobNumber, final String url, final String title, - final int urlListindex) { - - super(mimeType, namespace, revision, url, title, urlListindex); - - this.clusterNumber = clusterNumber; - this.blobnumber = blobNumber; - } - - public int getClusterNumber() { - return this.clusterNumber; - } - - public int getBlobnumber() { - return this.blobnumber; - } - -} diff --git a/source/org/openzim/DirectoryEntry.java b/source/org/openzim/DirectoryEntry.java deleted file mode 100644 index 92c52de41..000000000 --- a/source/org/openzim/DirectoryEntry.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2011 Arunesh Mathur - * - * This file is a part of zimreader-java. - * - * zimreader-java is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License version 3.0 as - * published by the Free Software Foundation. - * - * zimreader-java is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with zimreader-java. If not, see . - */ - -package org.openzim; - -public abstract class DirectoryEntry { - - int mimeType; - - char namespace; - - int revision; - - String url; - - String title; - - int urlListindex; - - public DirectoryEntry(final int mimeType, final char namespace, final int revision, - final String url, final String title, final int index) { - this.mimeType = mimeType; - this.namespace = namespace; - this.revision = revision; - this.url = url; - this.title = title; - this.urlListindex = index; - } - - public int getMimeType() { - return this.mimeType; - } - - public char getNamespace() { - return this.namespace; - } - - public int getRevision() { - return this.revision; - } - - public String getUrl() { - return this.url; - } - - public String getTitle() { - return this.title; - } - - public int getUrlListindex() { - return this.urlListindex; - } - -} diff --git a/source/org/openzim/RandomAcessFileZIMInputStream.java b/source/org/openzim/RandomAcessFileZIMInputStream.java index 006dd4498..cb6cdb093 100644 --- a/source/org/openzim/RandomAcessFileZIMInputStream.java +++ b/source/org/openzim/RandomAcessFileZIMInputStream.java @@ -28,6 +28,8 @@ import java.io.RandomAccessFile; * implementation, can be improved. * * @author Arunesh Mathur + * @author Michael Christen + * bugfix to long parsing (return value was int) */ public class RandomAcessFileZIMInputStream extends InputStream { @@ -41,7 +43,7 @@ public class RandomAcessFileZIMInputStream extends InputStream { } // TODO: Remove the parameter buffer - public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException { + public int readTwoLittleEndianBytesInt(final byte[] buffer) throws IOException { if (buffer.length < 2) { throw new OutOfMemoryError("buffer too small"); } else { @@ -51,7 +53,7 @@ public class RandomAcessFileZIMInputStream extends InputStream { } // TODO: Remove the parameter buffer - public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException { + public int readFourLittleEndianBytesInt(final byte[] buffer) throws IOException { if (buffer.length < 4) { throw new OutOfMemoryError("buffer too small"); } else { @@ -61,30 +63,30 @@ public class RandomAcessFileZIMInputStream extends InputStream { } // TODO: Remove the parameter buffer - public int readEightLittleEndianBytesValue(final byte[] buffer) + public long readEightLittleEndianBytesLong(final byte[] buffer) throws IOException { if (buffer.length < 8) { throw new OutOfMemoryError("buffer too small"); } else { this.mRAFReader.read(buffer, 0, 8); - return Utilities.toEightLittleEndianInteger(buffer); + return Utilities.toEightLittleEndianLong(buffer); } } // TODO: Remove the parameter buffer - public int readSixteenLittleEndianBytesValue(final byte[] buffer) + public long readSixteenLittleEndianBytesLong(final byte[] buffer) throws IOException { if (buffer.length < 16) { throw new OutOfMemoryError("buffer too small"); } else { this.mRAFReader.read(buffer, 0, 16); - return Utilities.toSixteenLittleEndianInteger(buffer); + return Utilities.toSixteenLittleEndianLong(buffer); } } // Reads characters from the current position into a String and stops when a // '\0' is encountered - public String readString() throws IOException { + public String readZeroTerminatedString() throws IOException { final StringBuffer sb = new StringBuffer(); /* * int i; byte[] buffer = new byte[100]; while (true) { @@ -92,8 +94,7 @@ public class RandomAcessFileZIMInputStream extends InputStream { * (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i * != buffer.length) break; } return sb.toString(); */ - int b; - b = this.mRAFReader.read(); + int b = this.mRAFReader.read(); while (b != '\0') { sb.append((char) b); b = this.mRAFReader.read(); diff --git a/source/org/openzim/RedirectEntry.java b/source/org/openzim/RedirectEntry.java deleted file mode 100644 index fdbe3fba1..000000000 --- a/source/org/openzim/RedirectEntry.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (C) 2011 Arunesh Mathur - * - * This file is a part of zimreader-java. - * - * zimreader-java is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License version 3.0 as - * published by the Free Software Foundation. - * - * zimreader-java is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with zimreader-java. If not, see . - */ - -package org.openzim; - -public class RedirectEntry extends DirectoryEntry { - - int redirectIndex; - - public RedirectEntry(final int mimeType, final char namespace, final int revision, - final int redirectIndex, final String url, final String title, final int urlListindex) { - - super(mimeType, namespace, revision, url, title, urlListindex); - - this.redirectIndex = redirectIndex; - } - - public int getRedirectIndex() { - return this.redirectIndex; - } - -} diff --git a/source/org/openzim/Utilities.java b/source/org/openzim/Utilities.java index 0de337c9c..28572839b 100644 --- a/source/org/openzim/Utilities.java +++ b/source/org/openzim/Utilities.java @@ -22,18 +22,21 @@ package org.openzim; import java.io.IOException; import java.io.InputStream; +/** + * @author Arunesh Mathur + * A ZIM file implementation that stores the Header and the MIMETypeList + * + * @author Michael Christen + * int/long bugfix (did reading of long values with int variables, causing negative offsets) + */ public class Utilities { - // TODO: Write a binary search algorithm - public static int binarySearch() { - return -1; - } - public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException { if (buffer.length < 2) { throw new OutOfMemoryError("buffer too small"); } else { - final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)); + final int result = + ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8)); return result; } } @@ -42,39 +45,28 @@ public class Utilities { if (buffer.length < 4) { throw new OutOfMemoryError("buffer too small"); } else { - final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) + final int result = + ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)); return result; } } - public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException { + public static long toEightLittleEndianLong(final byte[] buffer) throws IOException { if (buffer.length < 8) { throw new OutOfMemoryError("buffer too small"); } else { - final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) - | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24) - | ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40) - | ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56)); + final long result = // cast to long required otherwise this is again an integer + ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8) + | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24) + | ((long)(buffer[4] & 0xFF) << 32) | ((long)(buffer[5] & 0xFF) << 40) + | ((long)(buffer[6] & 0xFF) << 48) | ((long)(buffer[7] & 0xFF) << 56)); return result; } } - public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException { - if (buffer.length < 16) { - throw new OutOfMemoryError("buffer too small"); - } else { - final int result = ((buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8) - | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24) - | ((buffer[4] & 0xFF) << 32) | ((buffer[5] & 0xFF) << 40) - | ((buffer[6] & 0xFF) << 48) | ((buffer[7] & 0xFF) << 56) - | ((buffer[8] & 0xFF) << 64) | ((buffer[9] & 0xFF) << 72) - | ((buffer[10] & 0xFF) << 80) | ((buffer[11] & 0xFF) << 88) - | ((buffer[12] & 0xFF) << 96) - | ((buffer[13] & 0xFF) << 104) - | ((buffer[14] & 0xFF) << 112) | ((buffer[15] & 0xFF) << 120)); - return result; - } + public static long toSixteenLittleEndianLong(final byte[] buffer) throws IOException { + return toEightLittleEndianLong(buffer); // there are no sixten bytes long values } public static void skipFully(final InputStream stream, final long bytes) throws IOException { diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index c86119be1..56e84ad17 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -20,46 +20,47 @@ package org.openzim; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.List; /** * @author Arunesh Mathur - * * A ZIM file implementation that stores the Header and the MIMETypeList * + * @author Michael Christen + * Proof-Reading, unclustering, refactoring, + * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format, + * change of Exception handling, + * extension to more attributes as defined in spec (bugfix for mime type loading) + * int/long bugfix (did reading of long values with int variables, causing negative offsets) */ public class ZIMFile extends File { - /** - * - */ private static final long serialVersionUID = 1L; - private Header mHeader; - - private List mMIMETypeList; // Can be removed if not needed - - public ZIMFile(final String path) { + // Header values + public final int header_magicNumber; + public final int header_majorVersion; + public final int header_minorVersion; + public final long header_uuid; + public final int header_entryCount; + public final int header_clusterCount; + public final long header_urlPtrPos; + public final long header_titlePtrPos; + public final long header_clusterPtrPos; + public final long header_mimeListPos; + public final int header_mainPage; + public final int header_layoutPage; + public final long header_checksumPos; + + // content cache + public final List mimeList; + + public ZIMFile(final String path) throws IOException { super(path); - try { - readHeader(); - } catch (final FileNotFoundException e) { - e.printStackTrace(); - } - } - - private void readHeader() throws FileNotFoundException { - - // Helpers - int len = 0; - StringBuffer mimeBuffer = null; - - // The byte[] that will help us in reading bytes out of the file - final byte[] buffer = new byte[16]; - // Check whether the file exists if (!(this.exists())) { throw new FileNotFoundException( @@ -67,132 +68,45 @@ public class ZIMFile extends File { } // The reader that will be used to read contents from the file - - final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream( - new RandomAccessFile(this, "r")); - - // The ZIM file header - this.mHeader = new Header(); + final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this, "r")); + final byte[] buffer = new byte[16]; // Read the contents of the header - try { - this.mHeader.magicNumber = reader.readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.magicNumber); - - this.mHeader.version = reader.readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.version); - - this.mHeader.uuid = reader.readSixteenLittleEndianBytesValue(buffer); - // System.out.println(mHeader.uuid); reader.read(buffer, 0, 4); - - this.mHeader.articleCount = reader - .readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.articleCount); - - this.mHeader.clusterCount = reader - .readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.clusterCount); - - this.mHeader.urlPtrPos = reader.readEightLittleEndianBytesValue(buffer); - // System.out.println(mHeader.urlPtrPos); - - this.mHeader.titlePtrPos = reader - .readEightLittleEndianBytesValue(buffer); - // System.out.println(mHeader.titlePtrPos); - - this.mHeader.clusterPtrPos = reader - .readEightLittleEndianBytesValue(buffer); - // System.out.println(mHeader.clusterPtrPos); - - this.mHeader.mimeListPos = reader - .readEightLittleEndianBytesValue(buffer); - // System.out.println(mHeader.mimeListPos); - - this.mHeader.mainPage = reader.readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.mainPage); - - this.mHeader.layoutPage = reader.readFourLittleEndianBytesValue(buffer); - // System.out.println(mHeader.layoutPage); - - // Initialise the MIMETypeList - this.mMIMETypeList = new ArrayList<>(); - while (true) { + this.header_magicNumber = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_majorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 2 + this.header_minorVersion = reader.readTwoLittleEndianBytesInt(buffer); // 4 + this.header_uuid = reader.readSixteenLittleEndianBytesLong(buffer); // 16 + this.header_entryCount = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_clusterCount = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_urlPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 + this.header_titlePtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 + this.header_clusterPtrPos = reader.readEightLittleEndianBytesLong(buffer); // 8 + this.header_mimeListPos = reader.readEightLittleEndianBytesLong(buffer); // 8 + this.header_mainPage = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_layoutPage = reader.readFourLittleEndianBytesInt(buffer); // 4 + this.header_checksumPos = reader.readEightLittleEndianBytesLong(buffer); // 8 [FIX!] + + // Initialise the MIMETypeList + int len = 0; + StringBuffer mimeBuffer = null; + this.mimeList = new ArrayList<>(); + while (true) { + reader.read(buffer, 0, 1); // read only one byte to check if this is a zero + len = 0; + mimeBuffer = new StringBuffer(); + while (buffer[0] != '\0') { + mimeBuffer.append((char) buffer[0]); reader.read(buffer, 0, 1); - len = 0; - mimeBuffer = new StringBuffer(); - while (buffer[0] != '\0') { - mimeBuffer.append((char) buffer[0]); - reader.read(buffer, 0, 1); - len++; - } - if (len == 0) { - break; - } - this.mMIMETypeList.add(mimeBuffer.toString()); - // System.out.println(mimeBuffer); + len++; } - - } catch (final Exception e) { - e.printStackTrace(); + if (len == 0) { + break; + } + String mimeType = mimeBuffer.toString(); + System.out.println(mimeType); + this.mimeList.add(mimeType); } - } - - public int getVersion() { - return this.mHeader.version; - } - - public int getUuid() { - return this.mHeader.uuid; - } - - public int getArticleCount() { - return this.mHeader.articleCount; - } - - public int getClusterCount() { - return this.mHeader.clusterCount; - } - - public int getUrlPtrPos() { - return this.mHeader.urlPtrPos; - } - - public int getTitlePtrPos() { - return this.mHeader.titlePtrPos; - } - - public int getClusterPtrPos() { - return this.mHeader.clusterPtrPos; - } - - public String getMIMEType(final int mimeNumber) { - return this.mMIMETypeList.get(mimeNumber); - } - - public int getHeaderSize() { - return this.mHeader.mimeListPos; - } - - public int getMainPage() { - return this.mHeader.mainPage; - } - - public int getLayoutPage() { - return this.mHeader.layoutPage; - } - public class Header { - int magicNumber; - int version; - int uuid; - int articleCount; - int clusterCount; - int urlPtrPos; - int titlePtrPos; - int clusterPtrPos; - int mimeListPos; - int mainPage; - int layoutPage; } } diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index affd6ea6d..49d25c50c 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -29,28 +29,91 @@ import org.tukaani.xz.SingleXZInputStream; /** * @author Arunesh Mathur - * * A ZIMReader that reads data from the ZIMFile * + * @author Michael Christen + * Proof-Reading, unclustering, refactoring, + * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format, + * change of Exception handling, + * extension to more attributes as defined in spec (bugfix for mime type loading) + * bugfix to long parsing (prevented reading of large files) */ public class ZIMReader { private final ZIMFile mFile; private RandomAcessFileZIMInputStream mReader; + public static abstract class DirectoryEntry { + + public final int mimetype; + public final char namespace; + public final int cluster_number; + public final String url; + public final String title; + public final long urlListindex; + + public DirectoryEntry( + final int mimeType, final char namespace, + final int cluster_number, + final String url, final String title, + final long index) { + this.mimetype = mimeType; + this.namespace = namespace; + this.cluster_number = cluster_number; + this.url = url; + this.title = title; + this.urlListindex = index; + } + + } + + public static class ArticleEntry extends DirectoryEntry { + + public final int cluster_number; + public final int blob_number; + + public ArticleEntry( + final int mimeType, final char namespace, + final int cluster_number, final int blob_number, + final String url, final String title, + final long urlListindex) { + super(mimeType, namespace, cluster_number, url, title, urlListindex); + this.cluster_number = cluster_number; + this.blob_number = blob_number; + } + + } + + public static class RedirectEntry extends DirectoryEntry { + + public final long redirect_index; + + public RedirectEntry(final int mimeType, final char namespace, + final long redirect_index, final String url, final String title, + final long urlListindex) { + super(mimeType, namespace, 0, url, title, urlListindex); + this.redirect_index = redirect_index; + } + + } + public ZIMReader(final ZIMFile file) { this.mFile = file; try { - this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile( - this.mFile, "r")); + this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(this.mFile, "r")); } catch (final FileNotFoundException e) { e.printStackTrace(); } } + public ZIMFile getZIMFile() { + return this.mFile; + } + + // get a URL list that is sorted by the urls public List getURLListByURL() throws IOException { - int i = 0, pos, mimeType; + int i = 0, mimeType; final byte[] buffer = new byte[8]; @@ -58,12 +121,12 @@ public class ZIMReader { final ArrayList returnList = new ArrayList<>(); // Move to the spot where URL's are listed - this.mReader.seek(this.mFile.getUrlPtrPos()); + this.mReader.seek(this.mFile.header_urlPtrPos); - for (i = 0; i < this.mFile.getArticleCount(); i++) { + for (i = 0; i < this.mFile.header_entryCount; i++) { // The position of URL i - pos = this.mReader.readEightLittleEndianBytesValue(buffer); + long pos = this.mReader.readEightLittleEndianBytesLong(buffer); // Mark the current position that we need to return to this.mReader.mark(); @@ -72,14 +135,14 @@ public class ZIMReader { this.mReader.seek(pos); // Article or Redirect entry? - mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer); + mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer); if (mimeType == 65535) { this.mReader.seek(pos + 12); - returnList.add(this.mReader.readString()); + returnList.add(this.mReader.readZeroTerminatedString()); } else { this.mReader.seek(pos + 16); - returnList.add(this.mReader.readString()); + returnList.add(this.mReader.readZeroTerminatedString()); } this.mReader.reset(); @@ -88,9 +151,10 @@ public class ZIMReader { return returnList; } + // get a URL list that is sorted by the entry titles public List getURLListByTitle() throws IOException { - int i = 0, pos, mimeType, articleNumber, urlPtrPos; + int i = 0, mimeType, articleNumber; final byte[] buffer = new byte[8]; @@ -98,35 +162,35 @@ public class ZIMReader { final ArrayList returnList = new ArrayList<>(); // Get the UrlPtrPos or one time storage - urlPtrPos = this.mFile.getUrlPtrPos(); + long urlPtrPos = this.mFile.header_urlPtrPos; // Move to the spot where URL's are listed - this.mReader.seek(this.mFile.getTitlePtrPos()); + this.mReader.seek(this.mFile.header_titlePtrPos); - for (i = 0; i < this.mFile.getArticleCount(); i++) { + for (i = 0; i < this.mFile.header_entryCount; i++) { // The articleNumber of the position of URL i - articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer); + articleNumber = this.mReader.readFourLittleEndianBytesInt(buffer); // Mark the current position that we need to return to this.mReader.mark(); - this.mReader.seek(urlPtrPos + (8 * (articleNumber))); + this.mReader.seek(urlPtrPos + (8L * (articleNumber))); // The position of URL i - pos = this.mReader.readEightLittleEndianBytesValue(buffer); + long pos = this.mReader.readEightLittleEndianBytesLong(buffer); this.mReader.seek(pos); // Article or Redirect entry? - mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer); + mimeType = this.mReader.readTwoLittleEndianBytesInt(buffer); if (mimeType == 65535) { this.mReader.seek(pos + 12); - final String url = this.mReader.readString(); + final String url = this.mReader.readZeroTerminatedString(); returnList.add(url); } else { this.mReader.seek(pos + 16); - final String url = this.mReader.readString(); + final String url = this.mReader.readZeroTerminatedString(); returnList.add(url); } @@ -137,14 +201,69 @@ public class ZIMReader { return returnList; } + // position must be the seek position for the title in the Title Pointer List + private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException { + + // Helpers + final byte[] buffer = new byte[8]; + + // At the appropriate position in the titlePtrPos + this.mReader.seek(position); + + // Get value of article at index + int pointer_to_the_URL_pointer = this.mReader.readFourLittleEndianBytesInt(buffer); + + // Move to the position in urlPtrPos + this.mReader.seek(this.mFile.header_urlPtrPos + 8 * pointer_to_the_URL_pointer); + + // Get value of article in urlPtrPos + long pointer_to_the_directory_entry = this.mReader.readEightLittleEndianBytesLong(buffer); + + // Go to the location of the directory entry + this.mReader.seek(pointer_to_the_directory_entry); + + // read the Content Entry + final int type = this.mReader.readTwoLittleEndianBytesInt(buffer); // 2, 0xffff for redirect + this.mReader.read(); // 1, ignore, parameter length not used + final char namespace = (char) this.mReader.read(); // 1 + this.mReader.readFourLittleEndianBytesInt(buffer); // 4, ignore, revision not used + + // Article or Redirect entry + if (type == 65535) { + final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(buffer); + final String url = this.mReader.readZeroTerminatedString(); + String title = this.mReader.readZeroTerminatedString(); + title = title.equals("") ? url : title; + return new RedirectEntry(type, namespace, redirectIndex, + url, title, (position - this.mFile.header_urlPtrPos) / 8); + } else { + final int cluster_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4 + final int blob_number = this.mReader.readFourLittleEndianBytesInt(buffer); // 4 + final String url = this.mReader.readZeroTerminatedString(); // zero terminated + String title = this.mReader.readZeroTerminatedString(); // zero terminated + title = title.equals("") ? url : title; + + return new ArticleEntry( + type, namespace, + cluster_number, blob_number, + url, title, (position - this.mFile.header_urlPtrPos) / 8); + } + + } + + public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException { + if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount"); + return getDirectoryInfoAtTitlePosition(this.mFile.header_titlePtrPos + 4 * entryNumber); + } + // Gives the minimum required information needed for the given articleName - public DirectoryEntry getDirectoryInfo(String articleName, final char namespace) - throws IOException { + // This makes a binary search on the article name entry list. + public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException { DirectoryEntry entry; String cmpStr; - final int numberOfArticles = this.mFile.getArticleCount(); - int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid; + final int numberOfArticles = this.mFile.header_entryCount; + long beg = this.mFile.header_titlePtrPos, end = beg + (numberOfArticles * 4), mid; articleName = namespace + "/" + articleName; @@ -154,7 +273,7 @@ public class ZIMReader { if (entry == null) { return null; } - cmpStr = entry.getNamespace() + "/" + entry.getUrl(); + cmpStr = entry.namespace + "/" + entry.url; if (articleName.compareTo(cmpStr) < 0) { end = mid - 4; @@ -167,242 +286,130 @@ public class ZIMReader { } return null; - } - public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException { - - // search in the cache first, if not found, then call getDirectoryInfo(articleName) - - byte[] buffer = new byte[8]; - - final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace); - - if (mainEntry != null) { - - // Check what kind of an entry was mainEnrty - if (mainEntry.getClass() == ArticleEntry.class) { - - // Cast to ArticleEntry - final ArticleEntry article = (ArticleEntry) mainEntry; - - // Get the cluster and blob numbers from the article - final int clusterNumber = article.getClusterNumber(); - final int blobNumber = article.getBlobnumber(); - - // Move to the cluster entry in the clusterPtrPos - this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8); - - // Read the location of the cluster - final int clusterPos = this.mReader - .readEightLittleEndianBytesValue(buffer); - - // Move to the cluster - this.mReader.seek(clusterPos); + public ByteArrayOutputStream getArticleData(final DirectoryEntry directoryInfo) throws IOException { - // Read the first byte, for compression information - final int compressionType = this.mReader.read(); + // fail fast + if (directoryInfo == null) return null; + if (directoryInfo.getClass() != ArticleEntry.class) return null; - // Reference declaration - SingleXZInputStream xzReader = null; - int firstOffset, numberOfBlobs, offset1, - offset2, - location, - differenceOffset; + // This is now an article, so thus we can cast to ArticleEntry + final ArticleEntry article = (ArticleEntry) directoryInfo; - ByteArrayOutputStream baos; + // Move to the cluster entry in the clusterPtrPos + this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8); - // Check the compression type that was read - switch (compressionType) { - - // TODO: Read uncompressed data directly - case 0: - case 1: - - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - - // Read the first offset - this.mReader.read(buffer); - - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; - - - if (blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - - location = (blobNumber - 1) * 4; - Utilities.skipFully(this.mReader, location); - this.mReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } - - this.mReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); - - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - - Utilities.skipFully(this.mReader, - (offset1 - 4 * (blobNumber + 2))); - - this.mReader.read(buffer, 0, differenceOffset); - - baos = new ByteArrayOutputStream(); - baos.write(buffer, 0, differenceOffset); - - return baos; - - // LZMA2 compressed data - case 4: - - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - xzReader = new SingleXZInputStream(this.mReader, 4194304); - - // Read the first offset - xzReader.read(buffer); + // Read the location of the cluster + byte[] buffer = new byte[8]; + final long clusterPos = this.mReader.readEightLittleEndianBytesLong(buffer); - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); + // Move to the cluster + this.mReader.seek(clusterPos); - // The number of blobs - numberOfBlobs = firstOffset / 4; + // Read the first byte, for compression information + final int compressionType = this.mReader.read(); - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; + // Reference declaration + SingleXZInputStream xzReader = null; + int firstOffset, numberOfBlobs, offset1, + offset2, + location, + differenceOffset; - if(blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { + ByteArrayOutputStream baos; - location = (blobNumber - 1) * 4; - Utilities.skipFully(xzReader, location); - xzReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } + // Check the compression type that was read + switch (compressionType) { - xzReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); + // TODO: Read uncompressed data directly + case 0: + case 1: - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; + // Read the first 4 bytes to find out the number of artciles + buffer = new byte[4]; - Utilities.skipFully(xzReader, - (offset1 - 4 * (blobNumber + 2))); + // Create a dictionary with size 40MiB, the zimlib uses this + // size while creating - xzReader.read(buffer, 0, differenceOffset); + // Read the first offset + this.mReader.read(buffer); - baos = new ByteArrayOutputStream(); - baos.write(buffer, 0, differenceOffset); + // The first four bytes are the offset of the zeroth blob + firstOffset = Utilities.toFourLittleEndianInteger(buffer); - return baos; + // The number of blobs + numberOfBlobs = firstOffset / 4; - } + // The blobNumber has to be lesser than the numberOfBlobs + assert article.blob_number < numberOfBlobs; + if (article.blob_number == 0) { + // The first offset is what we read earlier + offset1 = firstOffset; + } else { + location = (article.blob_number - 1) * 4; + Utilities.skipFully(this.mReader, location); + this.mReader.read(buffer); + offset1 = Utilities.toFourLittleEndianInteger(buffer); } - } - - return null; - - } - - public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position) - throws IOException { - - // Helpers - int pos; - final byte[] buffer = new byte[8]; - - // At the appropriate position in the titlePtrPos - this.mReader.seek(position); - - // Get value of article at index - pos = this.mReader.readFourLittleEndianBytesValue(buffer); - - // Move to the position in urlPtrPos - this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos); - - // Get value of article in urlPtrPos - pos = this.mReader.readEightLittleEndianBytesValue(buffer); - - // Go to the location of the directory entry - this.mReader.seek(pos); - - final int type = this.mReader.readTwoLittleEndianBytesValue(buffer); - - // Ignore the parameter length - this.mReader.read(); - - final char namespace = (char) this.mReader.read(); - // System.out.println("Namepsace: " + namespace); - final int revision = this.mReader.readFourLittleEndianBytesValue(buffer); - // System.out.println("Revision: " + revision); + this.mReader.read(buffer); + offset2 = Utilities.toFourLittleEndianInteger(buffer); + differenceOffset = offset2 - offset1; + buffer = new byte[differenceOffset]; + Utilities.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2))); + this.mReader.read(buffer, 0, differenceOffset); + baos = new ByteArrayOutputStream(); + baos.write(buffer, 0, differenceOffset); - // TODO: Remove redundant if condition code - // Article or Redirect entry - if (type == 65535) { - - // System.out.println("MIMEType: " + type); - - final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer); - // System.out.println("RedirectIndex: " + redirectIndex); + return baos; - final String url = this.mReader.readString(); - // System.out.println("URL: " + url); + // 2 for zlib and 3 for bzip2 (removed) - String title = this.mReader.readString(); - title = title.equals("") ? url : title; - // System.out.println("Title: " + title); - - return new RedirectEntry(type, namespace, revision, redirectIndex, - url, title, (position - this.mFile.getUrlPtrPos()) / 8); + // LZMA2 compressed data + case 4: - } else { + // Read the first 4 bytes to find out the number of artciles + buffer = new byte[4]; - // System.out.println("MIMEType: " + mFile.getMIMEType(type)); + // Create a dictionary with size 40MiB, the zimlib uses this size while creating + xzReader = new SingleXZInputStream(this.mReader, 4194304); - final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer); - // System.out.println("Cluster Number: " + clusterNumber); + // Read the first offset + xzReader.read(buffer); - final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer); - // System.out.println("Blob Number: " + blobNumber); + // The first four bytes are the offset of the zeroth blob + firstOffset = Utilities.toFourLittleEndianInteger(buffer); - final String url = this.mReader.readString(); - // System.out.println("URL: " + url); + // The number of blobs + numberOfBlobs = firstOffset / 4; - String title = this.mReader.readString(); - title = title.equals("") ? url : title; - // System.out.println("Title: " + title); - - // Parameter data ignored + // The blobNumber has to be lesser than the numberOfBlobs + assert article.blob_number < numberOfBlobs; + if (article.blob_number == 0) { + // The first offset is what we read earlier + offset1 = firstOffset; + } else { + location = (article.blob_number - 1) * 4; + Utilities.skipFully(xzReader, location); + xzReader.read(buffer); + offset1 = Utilities.toFourLittleEndianInteger(buffer); + } - return new ArticleEntry(type, namespace, revision, clusterNumber, - blobNumber, url, title, - (position - this.mFile.getUrlPtrPos()) / 8); + xzReader.read(buffer); + offset2 = Utilities.toFourLittleEndianInteger(buffer); + differenceOffset = offset2 - offset1; + buffer = new byte[differenceOffset]; + Utilities.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2))); + xzReader.read(buffer, 0, differenceOffset); + baos = new ByteArrayOutputStream(); + baos.write(buffer, 0, differenceOffset); + return baos; + + // case 5: zstd compressed (missing!) + default: + return null; } - } - public ZIMFile getZIMFile() { - return this.mFile; - } } diff --git a/source/org/openzim/ZIMTest.java b/source/org/openzim/ZIMTest.java index 6d8ed64fb..24b9cf9be 100644 --- a/source/org/openzim/ZIMTest.java +++ b/source/org/openzim/ZIMTest.java @@ -18,27 +18,49 @@ package org.openzim; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.openzim.ZIMReader.DirectoryEntry; public class ZIMTest { + public static void main(final String[] args) { - if(args.length!=2) { - System.out.println("Usage: java ZIMTest "); + if(args.length!=1) { + System.out.println("Usage: java ZIMTest "); System.exit(0); } - // args[0] is the Zim File's location - final ZIMFile file = new ZIMFile(args[0]); + try { + // args[0] is the Zim File's location + final ZIMFile file = new ZIMFile(args[0]); + + // Associate the Zim File with a Reader + final ZIMReader zReader = new ZIMReader(file); - // Associate the Zim File with a Reader - final ZIMReader zReader = new ZIMReader(file); + // print a list of urls and titles + final List urls = zReader.getURLListByURL(); + final List titles = zReader.getURLListByTitle(); + int c = Math.min(10, titles.size()); + for (int i = 0; i < c; i++) { + System.out.println("URL by URL " + i + ": " + urls.get(i)); + System.out.println("URL by Title " + i + ": " + titles.get(i)); + DirectoryEntry entry = zReader.getDirectoryInfo(i); + System.out.println("URL by Pos " + i + ": " + entry.url); + System.out.println("Title by Pos " + i + ": " + entry.title); + System.out.println("Namespace by Pos " + i + ": " + entry.namespace); + } - try { - // args[1] is the name of the articles that is - // to be fetched - System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8")); + // print article c-1 + DirectoryEntry directory_entry = zReader.getDirectoryInfo(c - 1); + ByteArrayOutputStream articleStream = zReader.getArticleData(directory_entry); + String article = articleStream == null ? "NULL" : articleStream.toString(StandardCharsets.UTF_8.name()); + System.out.println(article); } catch (final IOException e) { e.printStackTrace(); } } + }