yacy_search_server/source/org/openzim/ZIMReader.java

/*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.openzim;

import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;

import org.tukaani.xz.SingleXZInputStream;

/**
 * @author Arunesh Mathur
 *         A ZIMReader that reads data from the ZIMFile
 *
 * @author Michael Christen
 *         Proof-Reading, unclustering, refactoring,
 *         naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
 *         change of Exception handling, 
 *         extension to more attributes as defined in spec (bugfix for mime type loading)
 *         bugfix to long parsing (prevented reading of large files)
 */
public class ZIMReader {

    private final ZIMFile mFile;
    private RandomAccessFileZIMInputStream mReader;

    /**
     * Immutable base type holding the attributes shared by all directory entries
     * that this reader produces.
     */
    public abstract static class DirectoryEntry {

        /** value of the two-byte type field of the entry (65535 marks a redirect) */
        public final int mimetype;

        /** namespace character of the entry */
        public final char namespace;

        /** number of the cluster that holds the entry content (redirect entries pass 0) */
        public final int cluster_number;

        /** url of the entry */
        public final String url;

        /** title of the entry */
        public final String title;

        /** index value handed in by the creator of the entry */
        public final long urlListindex;

        /**
         * Stores all given attributes unchanged.
         *
         * @param mimeType       value of the type field
         * @param namespace      namespace character
         * @param cluster_number cluster number of the content
         * @param url            url of the entry
         * @param title          title of the entry
         * @param index          value stored as {@link #urlListindex}
         */
        public DirectoryEntry(
                final int mimeType,
                final char namespace,
                final int cluster_number,
                final String url,
                final String title,
                final long index) {
            this.urlListindex = index;
            this.title = title;
            this.url = url;
            this.cluster_number = cluster_number;
            this.namespace = namespace;
            this.mimetype = mimeType;
        }

    }

    /**
     * Directory entry that points to actual content: a blob inside a cluster.
     * The cluster number is kept in the inherited {@code cluster_number} field;
     * it is not declared a second time here to avoid hiding the superclass field.
     */
    public static class ArticleEntry extends DirectoryEntry {

        /** number of the blob inside the cluster that holds the content */
        public final int blob_number;

        /**
         * @param mimeType       value of the type field of the entry
         * @param namespace      namespace character
         * @param cluster_number number of the cluster holding the content
         * @param blob_number    number of the blob inside that cluster
         * @param url            url of the entry
         * @param title          title of the entry
         * @param urlListindex   index value passed on to {@link DirectoryEntry}
         */
        public ArticleEntry(
                final int mimeType, final char namespace,
                final int cluster_number, final int blob_number,
                final String url, final String title,
                final long urlListindex) {
            super(mimeType, namespace, cluster_number, url, title, urlListindex);
            this.blob_number = blob_number;
        }

    }

    /**
     * Directory entry that carries no content of its own but refers to another
     * entry by index. The inherited cluster number is always set to 0.
     */
    public static class RedirectEntry extends DirectoryEntry {

        /** index of the entry this redirect refers to */
        public final long redirect_index;

        /**
         * @param mimeType       value of the type field of the entry
         * @param namespace      namespace character
         * @param redirect_index index of the redirect target
         * @param url            url of the entry
         * @param title          title of the entry
         * @param urlListindex   index value passed on to {@link DirectoryEntry}
         */
        public RedirectEntry(
                final int mimeType,
                final char namespace,
                final long redirect_index,
                final String url,
                final String title,
                final long urlListindex) {
            // a redirect has no cluster, the super class gets cluster number 0
            super(mimeType, namespace, 0, url, title, urlListindex);
            this.redirect_index = redirect_index;
        }

    }

    public ZIMReader(final ZIMFile file) {
        this.mFile = file;
        try {
            this.mReader = new RandomAccessFileZIMInputStream(new RandomAccessFile(this.mFile, "r"));
        } catch (final FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * @return the ZIM file object that was handed to the constructor of this reader
     */
    public ZIMFile getZIMFile() {
        return mFile;
    }

    /**
     * Reads the url of every directory entry, following the order of the
     * URL pointer list (a URL list that is sorted by the urls).
     *
     * @return the urls of all entries, one per entry of the URL pointer list
     * @throws IOException if reading from the ZIM file fails
     */
    public List<String> getURLListByURL() throws IOException {

        final List<String> urls = new ArrayList<>();

        // start at the beginning of the URL pointer list
        this.mReader.seek(this.mFile.header_urlPtrPos);

        int entry = 0;
        while (entry < this.mFile.header_entryCount) {

            // every slot of the pointer list holds the position of one directory entry
            final long entryPos = this.mReader.readEightLittleEndianBytesLong();

            // remember the place in the pointer list before jumping to the entry
            this.mReader.mark();
            this.mReader.seek(entryPos);

            // the type field decides where the url starts:
            // 12 bytes into a redirect entry (type 65535), 16 bytes into any other entry
            final int type = this.mReader.readTwoLittleEndianBytesInt();
            final long urlOffset = (type == 65535) ? 12 : 16;

            this.mReader.seek(entryPos + urlOffset);
            urls.add(this.mReader.readZeroTerminatedString());

            // continue with the next slot of the pointer list
            this.mReader.reset();
            entry++;
        }

        return urls;
    }

    /**
     * Reads the url of every directory entry, following the order of the
     * title pointer list (a URL list that is sorted by the entry titles).
     *
     * @return the urls of all entries, one per entry of the title pointer list
     * @throws IOException if reading from the ZIM file fails
     */
    public List<String> getURLListByTitle() throws IOException {

        final List<String> urls = new ArrayList<>();

        // fetch the start of the URL pointer list once, it is needed for every entry
        final long urlListStart = this.mFile.header_urlPtrPos;

        // start at the beginning of the title pointer list
        this.mReader.seek(this.mFile.header_titlePtrPos);

        int entry = 0;
        while (entry < this.mFile.header_entryCount) {

            // a title pointer is the index of a slot in the URL pointer list
            final int urlListIndex = this.mReader.readFourLittleEndianBytesInt();

            // remember the place in the title pointer list before jumping away
            this.mReader.mark();

            // the slot in the URL pointer list holds the position of the directory entry
            this.mReader.seek(urlListStart + (8L * urlListIndex));
            final long entryPos = this.mReader.readEightLittleEndianBytesLong();
            this.mReader.seek(entryPos);

            // the type field decides where the url starts:
            // 12 bytes into a redirect entry (type 65535), 16 bytes into any other entry
            final int type = this.mReader.readTwoLittleEndianBytesInt();
            final long urlOffset = (type == 65535) ? 12 : 16;

            this.mReader.seek(entryPos + urlOffset);
            urls.add(this.mReader.readZeroTerminatedString());

            // continue with the next title pointer
            this.mReader.reset();
            entry++;
        }

        return urls;
    }

    /**
     * Loads the directory entry that a slot of the title pointer list refers to.
     * The slot holds an index into the URL pointer list, which in turn holds the
     * file position of the directory entry.
     *
     * @param position seek position of the slot inside the title pointer list
     * @return a {@link RedirectEntry} if the type field is 65535, otherwise an
     *         {@link ArticleEntry}; the entry's {@code urlListindex} is the index
     *         into the URL pointer list that was read from the slot
     * @throws IOException if reading from the ZIM file fails
     */
    private DirectoryEntry getDirectoryInfoAtTitlePosition(final long position) throws IOException {

        // the title pointer is an index into the URL pointer list
        this.mReader.seek(position);
        final int urlListIndex = this.mReader.readFourLittleEndianBytesInt();

        // each slot of the URL pointer list has 8 bytes; 8L forces long arithmetic
        // so that a large index cannot overflow the int range
        this.mReader.seek(this.mFile.header_urlPtrPos + 8L * urlListIndex);
        final long entryPos = this.mReader.readEightLittleEndianBytesLong();

        // go to the directory entry and read the part common to all entry kinds
        this.mReader.seek(entryPos);
        final int type = this.mReader.readTwoLittleEndianBytesInt(); // 2, 0xffff for redirect
        this.mReader.read();                                         // 1, ignore, parameter length not used
        final char namespace = (char) this.mReader.read();           // 1
        this.mReader.readFourLittleEndianBytesInt();                 // 4, ignore, revision not used

        if (type == 65535) {
            // redirect entry
            final int redirectIndex = this.mReader.readFourLittleEndianBytesInt(); // 4
            final String url = this.mReader.readZeroTerminatedString();            // zero terminated
            final String title = this.mReader.readZeroTerminatedString();          // zero terminated
            // an empty title means: use the url as title
            return new RedirectEntry(
                    type, namespace, redirectIndex,
                    url, title.isEmpty() ? url : title, urlListIndex);
        }

        // article entry
        final int cluster_number = this.mReader.readFourLittleEndianBytesInt(); // 4
        final int blob_number = this.mReader.readFourLittleEndianBytesInt();    // 4
        final String url = this.mReader.readZeroTerminatedString();             // zero terminated
        final String title = this.mReader.readZeroTerminatedString();           // zero terminated
        // an empty title means: use the url as title
        return new ArticleEntry(
                type, namespace,
                cluster_number, blob_number,
                url, title.isEmpty() ? url : title, urlListIndex);
    }

    /**
     * Loads the directory entry referenced by the given slot of the title pointer list.
     *
     * @param entryNumber index of the slot in the title pointer list,
     *                    must be in the range 0 .. entryCount - 1
     * @return the directory entry for that slot
     * @throws IOException if entryNumber is out of range or reading from the ZIM file fails
     */
    public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException {
        if (entryNumber < 0) throw new IOException("entryNumber must not be negative: " + entryNumber);
        if (entryNumber >= this.mFile.header_entryCount) throw new IOException("entryNumber exceeds entryCount");
        // each title pointer has 4 bytes; 4L forces long arithmetic to avoid an int overflow
        return getDirectoryInfoAtTitlePosition(this.mFile.header_titlePtrPos + 4L * entryNumber);
    }

    /**
     * Gives the minimum required information needed for the given articleName.
     * This makes a binary search over the slots of the title pointer list and
     * compares {@code namespace + "/" + articleName} with the namespace and url
     * of the entry found at each probed slot.
     *
     * NOTE(review): the probed list is the title pointer list while the comparison
     * uses the entry url; this presumably only finds entries reliably where title
     * order and url order agree - confirm against the callers before changing it.
     *
     * @param namespace   namespace character of the wanted entry
     * @param articleName name of the wanted entry, without namespace prefix
     * @return the matching directory entry or {@code null} if no entry matches
     * @throws IOException if reading from the ZIM file fails
     */
    public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException {

        final int numberOfArticles = this.mFile.header_entryCount;
        final String wanted = namespace + "/" + articleName;

        // beg and end are the seek positions of the first and the LAST slot (inclusive);
        // with an empty list end lies before beg and the loop is skipped.
        // 4L keeps the multiplication in long arithmetic.
        long beg = this.mFile.header_titlePtrPos;
        long end = beg + 4L * (numberOfArticles - 1);

        while (beg <= end) {
            // middle slot, aligned to the 4-byte slot size
            final long mid = beg + 4 * (((end - beg) / 4) / 2);
            final DirectoryEntry entry = getDirectoryInfoAtTitlePosition(mid);

            final int cmp = wanted.compareTo(entry.namespace + "/" + entry.url);
            if (cmp < 0) {
                end = mid - 4;
            } else if (cmp > 0) {
                beg = mid + 4;
            } else {
                return entry;
            }
        }

        return null;
    }

    /**
     * Loads the content (the blob) that an article entry points to.
     * Supported cluster compression types are 1 (uncompressed) and 4 (LZMA2/XZ).
     *
     * @param directoryInfo the entry to load the content for
     * @return the bytes of the blob, or {@code null} if directoryInfo is {@code null},
     *         is not an {@link ArticleEntry}, or the cluster uses a compression type
     *         that is not supported here
     * @throws IOException if reading fails, the cluster data ends prematurely or
     *         the blob number does not fit the offset table of the cluster
     */
    public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {

        // fail fast: only article entries carry a blob (this also covers null)
        if (!(directoryInfo instanceof ArticleEntry)) return null;
        final ArticleEntry article = (ArticleEntry) directoryInfo;

        // Move to the slot of the cluster in the cluster pointer list (8 bytes per slot);
        // 8L forces long arithmetic so that a large cluster number cannot overflow
        this.mReader.seek(this.mFile.header_clusterPtrPos + 8L * article.cluster_number);

        // Read the location of the cluster and move there
        final long clusterPos = this.mReader.readEightLittleEndianBytesLong();
        this.mReader.seek(clusterPos);

        // The first byte of the cluster holds the compression information
        final int compressionType = this.mReader.read();

        // uncompressed: offset table and blobs are read directly from the file
        if (compressionType == 1) {
            return readBlob(this.mReader, article.blob_number);
        }
        // 2 for zlib and 3 for bzip2 (removed)

        // LZMA2 compressed data: offset table and blobs are both inside the compressed
        // stream, so everything after the compression byte must be read through the decoder
        if (compressionType == 4) {
            // The second argument is the memory limit of the decoder (XZ for Java documents
            // it in KiB - verify). The decoder is intentionally not closed because closing
            // it would also close mReader, which is needed for further reads.
            final SingleXZInputStream xzReader = new SingleXZInputStream(this.mReader, 4194304);
            return readBlob(xzReader, article.blob_number);
        }

        // case 5: zstd compressed (missing!)
        return null;
    }

    /**
     * Reads one blob from cluster data. The stream must be positioned at the first
     * byte of the offset table, i.e. directly behind the compression byte.
     *
     * @param in         uncompressed view on the cluster data
     * @param blobNumber number of the blob to extract
     * @return the bytes of the blob
     * @throws IOException if the blob number is not covered by the offset table,
     *         the offsets are inconsistent or the stream ends prematurely
     */
    private static byte[] readBlob(final InputStream in, final int blobNumber) throws IOException {

        // The first four bytes are the offset of the zeroth blob. Because the blobs start
        // right behind the offset table this also tells the number of 4-byte offsets.
        final int firstOffset = readLittleEndianInt(in);
        final int numberOfOffsets = firstOffset / 4;

        // a blob needs its own offset and the following one (its end)
        if (blobNumber < 0 || blobNumber >= numberOfOffsets - 1) {
            throw new IOException("blob number " + blobNumber
                    + " is not available in a cluster with " + numberOfOffsets + " offsets");
        }

        final int blobStart;
        if (blobNumber == 0) {
            // The first offset is what we read earlier
            blobStart = firstOffset;
        } else {
            // skip the offsets of the blobs 1 .. blobNumber - 1
            RandomAccessFileZIMInputStream.skipFully(in, (blobNumber - 1) * 4);
            blobStart = readLittleEndianInt(in);
        }

        final int blobEnd = readLittleEndianInt(in);
        final int blobLength = blobEnd - blobStart;
        if (blobLength < 0) {
            throw new IOException("inconsistent blob offsets: start " + blobStart + ", end " + blobEnd);
        }

        // so far 4 * (blobNumber + 2) bytes of the cluster data have been consumed;
        // skip the remaining distance to the start of the blob
        RandomAccessFileZIMInputStream.skipFully(in, (blobStart - 4 * (blobNumber + 2)));

        final byte[] blob = new byte[blobLength];
        readFully(in, blob);
        return blob;
    }

    /**
     * Reads four bytes from the stream and decodes them as little endian integer.
     */
    private static int readLittleEndianInt(final InputStream in) throws IOException {
        final byte[] buffer = new byte[4];
        readFully(in, buffer);
        return RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
    }

    /**
     * Fills the target array completely; a single read call may deliver fewer bytes
     * than requested, therefore reading is repeated until the array is full.
     */
    private static void readFully(final InputStream in, final byte[] target) throws IOException {
        int filled = 0;
        while (filled < target.length) {
            final int count = in.read(target, filled, target.length - filled);
            if (count < 0) {
                throw new EOFException("unexpected end of cluster data after " + filled
                        + " of " + target.length + " bytes");
            }
            filled += count;
        }
    }

}