/*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
 */

package org.openzim;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.tukaani.xz.SingleXZInputStream;

import com.github.luben.zstd.ZstdInputStream;

/**
 * @author Arunesh Mathur
 *         A ZIMReader that reads data from the ZIMFile
 *
 * @author Michael Christen
 *         Proof-reading, unclustering, refactoring,
 *         naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
 *         change of Exception handling,
 *         extension to more attributes as defined in spec (bugfix for mime type loading),
 *         bugfix to long parsing (prevented reading of large files),
 *         added extended cluster size parsing,
 *         added ZStandard compression parsing (cluster type 5),
 *         added cluster index
 */
public class ZIMReader {

    private final ZIMFile mFile;

    public class DirectoryEntry {

        private final int mimetype;
        public final char namespace;
        public final String url;
        public final String title;
        public final long urlListindex;

        public DirectoryEntry(
                final long urlListindex,
                final char namespace,
                final String url,
                final String title,
                final int mimeType) {
            assert url != null;
            assert title != null;
            this.mimetype = mimeType;
            this.namespace = namespace;
            this.url = url;
            this.title = title;
            this.urlListindex = urlListindex;
        }

        public String getMimeType() {
            return mFile.getMimeType(this.mimetype);
        }
    }

    public class ArticleEntry extends DirectoryEntry {

        public final int cluster_number;
        public final int blob_number;

        public ArticleEntry(
                final long urlListindex,
                final char namespace,
                final String url,
                final String title,
                final int mimeType,
                final int cluster_number,
                final int blob_number) {
            super(urlListindex, namespace, url, title, mimeType);
            this.cluster_number = cluster_number;
            this.blob_number = blob_number;
        }
    }

    public class RedirectEntry extends DirectoryEntry {

        public final long redirect_index;

        public RedirectEntry(
                final long urlListindex,
                final char namespace,
                final String url,
                final String title,
                final int mimeType,
                final long redirect_index) {
            super(urlListindex, namespace, url, title, mimeType);
            this.redirect_index = redirect_index;
        }
    }

    public class ArticleBlobEntry {

        public final ArticleEntry article;
        public final byte[] blob;

        public ArticleBlobEntry(final ArticleEntry article, final byte[] blob) {
            assert article != null;
            assert blob != null;
            this.article = article;
            this.blob = blob;
        }
    }

    public ZIMReader(final ZIMFile file) {
        this.mFile = file;
    }

    public ZIMFile getZIMFile() {
        return this.mFile;
    }

    public List<ArticleEntry> getAllArticles() throws IOException {
        List<ArticleEntry> list = new ArrayList<>();
        for (int i = 0; i < this.mFile.header_entryCount; i++) {
            DirectoryEntry de = getDirectoryInfo(i);
            if (de instanceof ArticleEntry) list.add((ArticleEntry) de);
        }
        return list;
    }

    public Map<Integer, Map<Integer, ArticleEntry>> getIndexedArticles(List<ArticleEntry> list) {
        Map<Integer, Map<Integer, ArticleEntry>> index = new HashMap<>();
        for (ArticleEntry entry : list) {
            Map<Integer, ArticleEntry> cluster = index.get(entry.cluster_number);
            if (cluster == null) {
                cluster = new HashMap<>();
                index.put(entry.cluster_number, cluster);
            }
            cluster.put(entry.blob_number, entry);
        }
        return index;
    }
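    /*
     * Usage sketch (illustrative; the file path is a placeholder and ZIMFile is
     * assumed to offer a constructor taking a path): build the reverse index once,
     * then resolve metadata for any (cluster, blob) pair without re-scanning.
     *
     *   ZIMReader reader = new ZIMReader(new ZIMFile("/path/to/content.zim"));
     *   Map<Integer, Map<Integer, ZIMReader.ArticleEntry>> index =
     *           reader.getIndexedArticles(reader.getAllArticles());
     *   ZIMReader.ArticleEntry entry = index.get(0).get(0); // entry in cluster 0, blob 0
     */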
    public class ClusterIterator implements Iterator<ArticleBlobEntry> {

        private Map<Integer, Map<Integer, ArticleEntry>> index;
        private Cluster cluster;
        private int clusterCounter;
        private int blobCounter;

        public ClusterIterator() throws IOException {
            List<ArticleEntry> list = getAllArticles();
            this.index = getIndexedArticles(list);
            this.clusterCounter = 0;
            this.blobCounter = 0;
            this.cluster = null; // not loaded
        }

        private final void loadCluster() {
            if (this.cluster == null) {
                // load cluster
                try {
                    this.cluster = new Cluster(this.clusterCounter);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        @Override
        public boolean hasNext() {
            if (this.clusterCounter >= mFile.header_clusterCount) return false;
            loadCluster(); // ensure cluster is loaded
            if (this.cluster == null) return false; // loading failed with an IOException
            return this.blobCounter < this.cluster.blobs.size();
        }

        @Override
        public ArticleBlobEntry next() {
            Map<Integer, ArticleEntry> clusterMap = this.index.get(this.clusterCounter);
            ArticleEntry ae = clusterMap.get(this.blobCounter);
            loadCluster(); // ensure cluster is loaded
            ArticleBlobEntry abe = new ArticleBlobEntry(ae, this.cluster.blobs.get(this.blobCounter));

            // increase the counter(s)
            this.blobCounter++;
            if (this.blobCounter >= this.cluster.blobs.size()) {
                this.clusterCounter++;
                this.cluster = null; // unload cluster
                this.blobCounter = 0;
            }
            return abe;
        }
    }
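    /*
     * Iteration sketch (illustrative): stream every article blob in cluster order,
     * decompressing each cluster exactly once. ClusterIterator is an inner class,
     * so it is instantiated from an existing reader instance.
     *
     *   ZIMReader.ClusterIterator it = reader.new ClusterIterator();
     *   while (it.hasNext()) {
     *       ZIMReader.ArticleBlobEntry e = it.next();
     *       System.out.println(e.article.url + " (" + e.blob.length + " bytes)");
     *   }
     */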
    public String getURLByURLOrder(final int entryNumber) throws IOException {
        // The position of URL i
        long pos = this.mFile.getURLPtr(entryNumber);
        this.mFile.mReader.seek(pos);

        // Article or Redirect entry?
        int mimeType = this.mFile.mReader.readTwoLittleEndianBytesInt();
        if (mimeType == 65535) {
            // redirect entries have a 12-byte header before the URL
            this.mFile.mReader.seek(pos + 12);
            return this.mFile.mReader.readZeroTerminatedString();
        } else {
            // article entries have a 16-byte header before the URL
            this.mFile.mReader.seek(pos + 16);
            return this.mFile.mReader.readZeroTerminatedString();
        }
    }

    public String getURLByTitleOrder(final int entryNumber) throws IOException {
        // The articleNumber of the position of URL i
        int articleNumber = this.mFile.getTitlePtr(entryNumber);
        return getURLByURLOrder(articleNumber);
    }

    public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException {
        // Get value of article at index
        int pointer_to_the_URL_pointer = this.mFile.getTitlePtr(entryNumber);

        // Get value of article in urlPtrPos
        long pointer_to_the_directory_entry = this.mFile.getURLPtr(pointer_to_the_URL_pointer);

        // Go to the location of the directory entry
        this.mFile.mReader.seek(pointer_to_the_directory_entry);

        // read the Content Entry
        final int type = this.mFile.mReader.readTwoLittleEndianBytesInt(); // 2, 0xffff for redirect
        this.mFile.mReader.read();                                         // 1, ignore, parameter length not used
        final char namespace = (char) this.mFile.mReader.read();           // 1
        this.mFile.mReader.readFourLittleEndianBytesInt();                 // 4, ignore, revision not used

        // Article or Redirect entry
        if (type == 65535) {
            final int redirectIndex = this.mFile.mReader.readFourLittleEndianBytesInt();
            final String url = this.mFile.mReader.readZeroTerminatedString();
            String title = this.mFile.mReader.readZeroTerminatedString();
            title = title.equals("") ? url : title;
            return new RedirectEntry(entryNumber, namespace, url, title, type, redirectIndex);
        } else {
            final int cluster_number = this.mFile.mReader.readFourLittleEndianBytesInt(); // 4
            final int blob_number = this.mFile.mReader.readFourLittleEndianBytesInt();    // 4
            final String url = this.mFile.mReader.readZeroTerminatedString();             // zero terminated
            String title = this.mFile.mReader.readZeroTerminatedString();                 // zero terminated
            title = title.equals("") ? url : title;
            return new ArticleEntry(entryNumber, namespace, url, title, type, cluster_number, blob_number);
        }
    }

    // Gives the minimum required information needed for the given articleName.
    // This performs a binary search on the (title-ordered) article entry list.
    public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException {
        DirectoryEntry entry;
        String cmpStr;
        final int numberOfArticles = this.mFile.header_entryCount;
        int beg = 0, end = numberOfArticles - 1, mid; // inclusive upper bound, hence count - 1
        articleName = namespace + "/" + articleName;
        while (beg <= end) {
            mid = beg + ((end - beg) / 2);
            entry = getDirectoryInfo(mid);
            if (entry == null) {
                return null;
            }
            cmpStr = entry.namespace + "/" + entry.title;
            if (articleName.compareTo(cmpStr) < 0) {
                end = mid - 1;
            } else if (articleName.compareTo(cmpStr) > 0) {
                beg = mid + 1;
            } else {
                return entry;
            }
        }
        return null;
    }
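    /*
     * Lookup sketch (illustrative; namespace and article name are placeholders):
     * resolve a directory entry by name and fetch its content. getArticleData
     * returns null for redirects and for names that are not found.
     *
     *   ZIMReader.DirectoryEntry de = reader.getDirectoryInfo('A', "Main_Page");
     *   byte[] content = reader.getArticleData(de);
     */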
    /**
     * The Cluster class is required to read a whole cluster with all documents inside at once.
     * This is a good thing because reading single documents from a cluster requires that the
     * cluster is decompressed every time again and again. Reading whole clusters with all
     * documents at once makes the decompression much more efficient because it is done only once.
     * This can of course only be done if:
     * - we want to iterate through all documents of a ZIM file
     * - we have reverse-indexed all directory entries to be able to assign metadata to cluster documents
     */
    private class Cluster {

        private List<byte[]> blobs;
        private boolean extended;

        public Cluster(int cluster_number) throws IOException {
            // open the cluster and wrap an InputStream with the proper decompression type
            final long clusterPos = mFile.geClusterPtr(cluster_number);
            mFile.mReader.seek(clusterPos);
            final int compressionType = mFile.mReader.read();
            InputStream is = null;
            if (compressionType <= 1 || compressionType == 8 || compressionType == 9) {
                extended = compressionType > 1;
                is = mFile.mReader;
            }
            if (compressionType == 4 || compressionType == 12) {
                extended = compressionType == 12;
                is = new SingleXZInputStream(mFile.mReader, 41943040);
            }
            if (compressionType == 5 || compressionType == 13) {
                extended = compressionType == 13;
                is = new ZstdInputStream(mFile.mReader);
            }
            if (is == null) throw new IOException("compression type unknown: " + compressionType);

            // read the offset list
            List<Long> offsets = new ArrayList<>();
            byte[] buffer = new byte[extended ? 8 : 4];
            is.read(buffer);
            long end_offset = extended
                    ? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer)
                    : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
            offsets.add(end_offset);
            // the first offset points directly behind the offset list, which gives the number of entries
            int offset_count = (int) (end_offset / (extended ? 8 : 4));
            for (int i = 1; i < offset_count; i++) {
                is.read(buffer); // each further offset entry must be read from the stream
                long l = extended
                        ? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer)
                        : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
                offsets.add(l);
            }

            // now all document sizes are known because they are defined by the offset deltas
            // the seek position should now be at the beginning of the first document
            this.blobs = new ArrayList<>();
            for (int i = 0; i < offsets.size() - 1; i++) {
                // loop until size - 1 because the last offset is the end of the last document
                int length = (int) (offsets.get(i + 1) - offsets.get(i)); // yes, the maximum document length is 2GB, for now
                byte[] b = new byte[length];
                RandomAccessFileZIMInputStream.readFully(is, b);
                this.blobs.add(b);
            }
        }
    }
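    /*
     * Worked example for the Cluster offset list above (illustrative numbers,
     * non-extended cluster, 4-byte entries): suppose the decompressed cluster
     * body starts with the offsets {16, 20, 30, 42}. The first offset (16)
     * points directly behind the offset list, so there are 16 / 4 = 4 entries
     * and therefore 3 blobs. Their sizes are the deltas 20-16=4, 30-20=10 and
     * 42-30=12 bytes, read sequentially once the offset list is consumed.
     */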
    public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {

        // fail fast
        if (directoryInfo == null) return null;
        if (directoryInfo.getClass() != ArticleEntry.class) return null;

        // This is an article, so we can cast to ArticleEntry
        final ArticleEntry article = (ArticleEntry) directoryInfo;

        // Read the location of the cluster
        final long clusterPos = this.mFile.geClusterPtr(article.cluster_number);

        // Move to the cluster
        this.mFile.mReader.seek(clusterPos);

        // Read the first byte, for compression information
        final int compressionType = this.mFile.mReader.read();

        // Check the compression type that was read
        // type = 1 uncompressed
        if (compressionType <= 1 || compressionType == 8 || compressionType == 9) {
            boolean extended = compressionType > 1;
            return readClusterEntry(this.mFile.mReader, article.blob_number, extended);
        }
        // 2 for zlib and 3 for bzip2 (removed)

        // LZMA2 compressed data
        if (compressionType == 4 || compressionType == 12) {
            boolean extended = compressionType == 12;
            // Create a dictionary with size 40MiB, the zimlib uses this size while creating
            SingleXZInputStream xzReader = new SingleXZInputStream(this.mFile.mReader, 41943040);
            return readClusterEntry(xzReader, article.blob_number, extended);
        }

        // Zstandard compressed data
        if (compressionType == 5 || compressionType == 13) {
            boolean extended = compressionType == 13;
            ZstdInputStream zReader = new ZstdInputStream(this.mFile.mReader);
            return readClusterEntry(zReader, article.blob_number, extended);
        }

        return null;
    }

    private static byte[] readClusterEntry(InputStream is, int blob_number, boolean extended) throws IOException {

        // Read the first 4 (8) bytes to find out the number of articles
        byte[] buffer = new byte[extended ? 8 : 4];

        // The first four (eight) bytes are the offset of the zeroth blob
        is.read(buffer);
        long firstOffset = extended
                ? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer)
                : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);

        // The number of blobs can be computed from the offset;
        // the actual number is one less because there is one more offset entry than blobs,
        // to identify the end of the last blob.
        long numberOfBlobs1 = extended ? firstOffset / 8 : firstOffset / 4;

        // The blob_number has to be less than numberOfBlobs1 - 1;
        // the blob numbers start with 0 even if the documentation states it is "the first blob".
        assert blob_number < numberOfBlobs1 - 1;
        long offset1;
        if (blob_number == 0) {
            // The first offset is what we read earlier
            offset1 = firstOffset;
        } else {
            // skip one less than required to get to the offset entry because the first entry is already read
            RandomAccessFileZIMInputStream.skipFully(is, (blob_number - 1) * (extended ? 8 : 4));
            is.read(buffer);
            offset1 = extended
                    ? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer)
                    : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
        }
        is.read(buffer);
        long offset2 = extended
                ? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer)
                : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
        long blob_size = offset2 - offset1;
        byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT

        // we must do two skip steps: first to the end of the offset list and second to the start of the blob
        // (entry size is 4 bytes, or 8 for extended clusters):
        // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset
        // - the number of offset entries that we already read is blob_number + 2 (in any case at least 2)
        // - the remaining number of offset entries to skip is therefore numberOfBlobs1 - (blob_number + 2)
        // - the additional skip to the start of the entry is offset1 - firstOffset, with firstOffset = 4 * numberOfBlobs1
        // - the full skip length is 4 * (numberOfBlobs1 - (blob_number + 2)) + offset1 - 4 * numberOfBlobs1
        //   = offset1 - 4 * (blob_number + 2)
        RandomAccessFileZIMInputStream.skipFully(is, (offset1 - (extended ? 8 : 4) * (blob_number + 2)));
        RandomAccessFileZIMInputStream.readFully(is, entry);

        return entry;
    }
}
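/*
 * Worked example for readClusterEntry above (illustrative numbers, non-extended
 * cluster with the offset list {16, 20, 30, 42}, i.e. numberOfBlobs1 = 16 / 4 = 4,
 * three blobs): for blob_number = 1 we skip 0 entries, read offset1 = 20 and
 * offset2 = 30, so blob_size = 10. The remaining skip is
 * offset1 - 4 * (blob_number + 2) = 20 - 12 = 8 bytes: one unread offset entry
 * (4 bytes) plus the 4 bytes of blob 0. The next 10 bytes on the stream are the blob.
 */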