From 1fefae9baff5ec54f2b4a7aadb47c1300c51cc30 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 27 Oct 2023 10:59:06 +0200 Subject: [PATCH] integrated the source code of an openzim file format reader. These are the raw format reader files with no integration in YaCy yet, which will maybe follow as a next step. The zim file format is documented in https://openzim.org and the reader code was taken from the archived, non-maintained repository at https://github.com/openzim/zimreader-java --- source/org/openzim/ArticleEntry.java | 46 ++ source/org/openzim/DirectoryEntry.java | 69 +++ .../RandomAcessFileZIMInputStream.java | 135 ++++++ source/org/openzim/RedirectEntry.java | 37 ++ source/org/openzim/Utilities.java | 84 ++++ source/org/openzim/ZIMFile.java | 198 +++++++++ source/org/openzim/ZIMReader.java | 408 ++++++++++++++++++ source/org/openzim/ZIMTest.java | 44 ++ 8 files changed, 1021 insertions(+) create mode 100644 source/org/openzim/ArticleEntry.java create mode 100644 source/org/openzim/DirectoryEntry.java create mode 100644 source/org/openzim/RandomAcessFileZIMInputStream.java create mode 100644 source/org/openzim/RedirectEntry.java create mode 100644 source/org/openzim/Utilities.java create mode 100644 source/org/openzim/ZIMFile.java create mode 100644 source/org/openzim/ZIMReader.java create mode 100644 source/org/openzim/ZIMTest.java diff --git a/source/org/openzim/ArticleEntry.java b/source/org/openzim/ArticleEntry.java new file mode 100644 index 000000000..7eeae2e06 --- /dev/null +++ b/source/org/openzim/ArticleEntry.java @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2011 Arunesh Mathur + * + * This file is a part of zimreader-java. + * + * zimreader-java is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3.0 as + * published by the Free Software Foundation.
+ * + * zimreader-java is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with zimreader-java. If not, see . + */ + + +package org.openzim; + +public class ArticleEntry extends DirectoryEntry { + + int clusterNumber; + + int blobnumber; + + public ArticleEntry(final int mimeType, final char namespace, final int revision, + final int clusterNumber, final int blobNumber, final String url, final String title, + final int urlListindex) { + + super(mimeType, namespace, revision, url, title, urlListindex); + + this.clusterNumber = clusterNumber; + this.blobnumber = blobNumber; + } + + public int getClusterNumber() { + return this.clusterNumber; + } + + public int getBlobnumber() { + return this.blobnumber; + } + +} diff --git a/source/org/openzim/DirectoryEntry.java b/source/org/openzim/DirectoryEntry.java new file mode 100644 index 000000000..92c52de41 --- /dev/null +++ b/source/org/openzim/DirectoryEntry.java @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2011 Arunesh Mathur + * + * This file is a part of zimreader-java. + * + * zimreader-java is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3.0 as + * published by the Free Software Foundation. + * + * zimreader-java is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with zimreader-java. If not, see . 
/*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
 */

// Added by the patch as source/org/openzim/DirectoryEntry.java, package org.openzim.

/**
 * Fields shared by every kind of directory entry: MIME type number,
 * namespace, revision, URL, title and the entry's index in the URL list.
 *
 * <p>Instances are immutable; all state is supplied to the constructor and
 * exposed through getters only.
 */
public abstract class DirectoryEntry {

    private final int mimeType;

    private final char namespace;

    private final int revision;

    private final String url;

    private final String title;

    private final int urlListindex;

    /**
     * Stores the common directory entry fields.
     *
     * @param mimeType  MIME type number of the entry
     * @param namespace namespace character of the entry
     * @param revision  revision number of the entry
     * @param url       URL of the entry
     * @param title     title of the entry
     * @param index     index of the entry in the URL list
     */
    public DirectoryEntry(final int mimeType, final char namespace, final int revision,
            final String url, final String title, final int index) {
        this.mimeType = mimeType;
        this.namespace = namespace;
        this.revision = revision;
        this.url = url;
        this.title = title;
        this.urlListindex = index;
    }

    /**
     * @return the MIME type number
     */
    public int getMimeType() {
        return this.mimeType;
    }

    /**
     * @return the namespace character
     */
    public char getNamespace() {
        return this.namespace;
    }

    /**
     * @return the revision number
     */
    public int getRevision() {
        return this.revision;
    }

    /**
     * @return the URL
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @return the title
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * @return the index of this entry in the URL list
     */
    public int getUrlListindex() {
        return this.urlListindex;
    }

}
/*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
 */

// Added by the patch as source/org/openzim/RandomAcessFileZIMInputStream.java,
// package org.openzim.

import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;

/**
 * Wraps a {@link RandomAccessFile} so that it can be used as an
 * {@link InputStream}, and adds the positioned and little-endian reads needed
 * for a ZIM file.
 *
 * <p>The class keeps the spelling of its original name; renaming it would
 * break callers.
 *
 * @author Arunesh Mathur
 */
public class RandomAcessFileZIMInputStream extends InputStream {

    private final RandomAccessFile mRAFReader;

    /** File pointer remembered by {@link #mark()}, or -1 when nothing is marked. */
    private long mMarked = -1;

    /**
     * @param reader the file to read from; it is used directly, not copied
     */
    public RandomAcessFileZIMInputStream(final RandomAccessFile reader) {
        this.mRAFReader = reader;
    }

    /**
     * Reads two bytes and decodes them as an unsigned little-endian value.
     *
     * @param buffer scratch space of at least 2 bytes
     * @return the decoded value
     * @throws IllegalArgumentException if {@code buffer} is too small
     * @throws EOFException if the file ends before two bytes were read
     * @throws IOException on any other read failure
     */
    public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException {
        return (int) readLittleEndian(buffer, 2, 2);
    }

    /**
     * Reads four bytes and decodes them as a little-endian {@code int}.
     *
     * @param buffer scratch space of at least 4 bytes
     * @return the decoded value (negative when the top bit is set)
     * @throws IllegalArgumentException if {@code buffer} is too small
     * @throws EOFException if the file ends before four bytes were read
     * @throws IOException on any other read failure
     */
    public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException {
        return (int) readLittleEndian(buffer, 4, 4);
    }

    /**
     * Reads eight bytes and decodes them as a little-endian value.
     *
     * <p>The return type is {@code int}, so a value that does not fit is
     * rejected instead of being silently mangled.
     *
     * @param buffer scratch space of at least 8 bytes
     * @return the decoded value, between 0 and {@link Integer#MAX_VALUE}
     * @throws IllegalArgumentException if {@code buffer} is too small
     * @throws EOFException if the file ends before eight bytes were read
     * @throws IOException if the value does not fit into an {@code int}, or on
     *         any other read failure
     */
    public int readEightLittleEndianBytesValue(final byte[] buffer)
            throws IOException {
        final long value = readLittleEndian(buffer, 8, 8);
        if (value < 0 || value > Integer.MAX_VALUE) {
            throw new IOException("8-byte value " + Long.toUnsignedString(value)
                    + " does not fit into an int");
        }
        return (int) value;
    }

    /**
     * Reads sixteen bytes and returns the lowest 32 bits of the little-endian
     * value they form; the remaining twelve bytes are consumed and discarded
     * because the return type cannot hold them.
     *
     * @param buffer scratch space of at least 16 bytes
     * @return the first four bytes decoded as a little-endian {@code int}
     * @throws IllegalArgumentException if {@code buffer} is too small
     * @throws EOFException if the file ends before sixteen bytes were read
     * @throws IOException on any other read failure
     */
    public int readSixteenLittleEndianBytesValue(final byte[] buffer)
            throws IOException {
        return (int) readLittleEndian(buffer, 16, 4);
    }

    /**
     * Reads {@code byteCount} bytes into {@code buffer} and decodes the first
     * {@code significantBytes} of them as a little-endian number.
     */
    private long readLittleEndian(final byte[] buffer, final int byteCount,
            final int significantBytes) throws IOException {
        if (buffer.length < byteCount) {
            throw new IllegalArgumentException("buffer too small");
        }
        // readFully fails with EOFException instead of leaving stale bytes in
        // the buffer when the file is shorter than expected.
        this.mRAFReader.readFully(buffer, 0, byteCount);
        long value = 0;
        // Accumulate in a long: shifting an int by 32 or more wraps the shift
        // distance and would fold high bytes into the low ones.
        for (int i = significantBytes - 1; i >= 0; i--) {
            value = (value << 8) | (buffer[i] & 0xFFL);
        }
        return value;
    }

    /**
     * Reads bytes from the current position up to (and consuming) the next
     * zero byte and decodes them as UTF-8.
     *
     * @return the decoded string, without the terminator
     * @throws EOFException if the file ends before a zero byte is found
     * @throws IOException on any other read failure
     */
    public String readString() throws IOException {
        final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        int b = this.mRAFReader.read();
        while (b != 0) {
            if (b < 0) {
                throw new EOFException("end of file inside a zero-terminated string");
            }
            bytes.write(b);
            b = this.mRAFReader.read();
        }
        return new String(bytes.toByteArray(), StandardCharsets.UTF_8);
    }

    @Override
    public int read() throws IOException {
        return this.mRAFReader.read();
    }

    /**
     * Bulk read that goes to the file directly instead of one byte at a time.
     * Like the inherited implementation it keeps reading until {@code len}
     * bytes are available or the end of the file is reached.
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        int total = 0;
        while (total < len) {
            final int n = this.mRAFReader.read(b, off + total, len - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total == 0 ? -1 : total;
    }

    /**
     * @return the wrapped file
     */
    public RandomAccessFile getRandomAccessFile() {
        return this.mRAFReader;
    }

    /**
     * Moves the file pointer.
     *
     * @param pos absolute position in bytes from the start of the file
     * @throws IOException if the underlying seek fails
     */
    public void seek(final long pos) throws IOException {
        this.mRAFReader.seek(pos);
    }

    /**
     * @return the current file pointer
     * @throws IOException if the underlying file reports an error
     */
    public long getFilePointer() throws IOException {
        return this.mRAFReader.getFilePointer();
    }

    /**
     * Remembers the current file pointer for a later {@link #reset()}.
     *
     * @throws IOException if the file pointer cannot be read
     */
    public void mark() throws IOException {
        this.mMarked = this.mRAFReader.getFilePointer();
    }

    /**
     * Returns to the position remembered by {@link #mark()} and forgets it.
     * Does nothing when no position is remembered.
     */
    @Override
    public void reset() throws IOException {
        if (this.mMarked == -1) {
            return;
        }
        this.mRAFReader.seek(this.mMarked);
        this.mMarked = -1;
    }

    /**
     * Closes the wrapped file.
     */
    @Override
    public void close() throws IOException {
        this.mRAFReader.close();
    }
}
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with zimreader-java. If not, see . + */ + +package org.openzim; + +public class RedirectEntry extends DirectoryEntry { + + int redirectIndex; + + public RedirectEntry(final int mimeType, final char namespace, final int revision, + final int redirectIndex, final String url, final String title, final int urlListindex) { + + super(mimeType, namespace, revision, url, title, urlListindex); + + this.redirectIndex = redirectIndex; + } + + public int getRedirectIndex() { + return this.redirectIndex; + } + +} diff --git a/source/org/openzim/Utilities.java b/source/org/openzim/Utilities.java new file mode 100644 index 000000000..0de337c9c --- /dev/null +++ b/source/org/openzim/Utilities.java @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2011 Arunesh Mathur + * + * This file is a part of zimreader-java. + * + * zimreader-java is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3.0 as + * published by the Free Software Foundation. + * + * zimreader-java is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with zimreader-java. If not, see . 
/*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
 */

// Added by the patch as source/org/openzim/Utilities.java, package org.openzim.

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;

/**
 * Static helpers for decoding little-endian numbers and skipping stream data.
 */
public class Utilities {

    // TODO: Write a binary search algorithm
    /**
     * Placeholder that has not been implemented.
     *
     * @return always -1
     */
    public static int binarySearch() {
        return -1;
    }

    /**
     * Decodes the first two bytes of {@code buffer} as an unsigned
     * little-endian value.
     *
     * @param buffer at least 2 bytes
     * @return the decoded value
     * @throws IllegalArgumentException if {@code buffer} is too short
     */
    public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
        return (int) decode(buffer, 2, 2);
    }

    /**
     * Decodes the first four bytes of {@code buffer} as a little-endian
     * {@code int}.
     *
     * @param buffer at least 4 bytes
     * @return the decoded value (negative when the top bit is set)
     * @throws IllegalArgumentException if {@code buffer} is too short
     */
    public static int toFourLittleEndianInteger(final byte[] buffer) throws IOException {
        return (int) decode(buffer, 4, 4);
    }

    /**
     * Decodes the first eight bytes of {@code buffer} as a little-endian
     * value.
     *
     * <p>The return type is {@code int}, so a value that does not fit is
     * rejected instead of being silently mangled.
     *
     * @param buffer at least 8 bytes
     * @return the decoded value, between 0 and {@link Integer#MAX_VALUE}
     * @throws IllegalArgumentException if {@code buffer} is too short
     * @throws IOException if the value does not fit into an {@code int}
     */
    public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException {
        final long value = decode(buffer, 8, 8);
        if (value < 0 || value > Integer.MAX_VALUE) {
            throw new IOException("8-byte value " + Long.toUnsignedString(value)
                    + " does not fit into an int");
        }
        return (int) value;
    }

    /**
     * Returns the lowest 32 bits of the sixteen-byte little-endian value at
     * the start of {@code buffer}; the upper twelve bytes cannot be
     * represented in the return type and are ignored.
     *
     * @param buffer at least 16 bytes
     * @return the first four bytes decoded as a little-endian {@code int}
     * @throws IllegalArgumentException if {@code buffer} is too short
     */
    public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException {
        return (int) decode(buffer, 16, 4);
    }

    /**
     * Checks that {@code buffer} holds {@code requiredLength} bytes and
     * decodes the first {@code significantBytes} as a little-endian number.
     */
    private static long decode(final byte[] buffer, final int requiredLength,
            final int significantBytes) {
        if (buffer.length < requiredLength) {
            throw new IllegalArgumentException("buffer too small");
        }
        long value = 0;
        // Accumulate in a long: shifting an int by 32 or more wraps the shift
        // distance and would fold high bytes into the low ones.
        for (int i = significantBytes - 1; i >= 0; i--) {
            value = (value << 8) | (buffer[i] & 0xFFL);
        }
        return value;
    }

    /**
     * Skips exactly {@code bytes} bytes of {@code stream}.
     *
     * @param stream the stream to advance
     * @param bytes  number of bytes to skip; zero or negative skips nothing
     * @throws EOFException if the stream ends before that many bytes were
     *         skipped
     * @throws IOException if the stream fails
     */
    public static void skipFully(final InputStream stream, final long bytes) throws IOException {
        long remaining = bytes;
        while (remaining > 0) {
            final long skipped = stream.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
                continue;
            }
            // skip() may return 0 without being at the end; a single read
            // tells a stalled skip apart from a real end of stream, which
            // would otherwise spin forever.
            if (stream.read() < 0) {
                throw new EOFException("stream ended with " + remaining
                        + " bytes left to skip");
            }
            remaining--;
        }
    }

}
/*
 * Copyright (C) 2011 Arunesh Mathur
 *
 * This file is a part of zimreader-java.
 *
 * zimreader-java is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3.0 as
 * published by the Free Software Foundation.
 *
 * zimreader-java is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
 */

// Added by the patch as source/org/openzim/ZIMFile.java, package org.openzim.

import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * @author Arunesh Mathur
 *
 *         A ZIM file implementation that stores the Header and the MIMETypeList.
 *
 *         The header and the MIME type list are read once, in the constructor;
 *         the file is closed again before the constructor returns.
 */
public class ZIMFile extends File {

    private static final long serialVersionUID = 1L;

    /** The bytes 'Z', 'I', 'M', 0x04 read as a little-endian int. */
    private static final int MAGIC_NUMBER = 72173914;

    /** Number of bytes occupied by the header fields this class reads. */
    private static final int FIXED_HEADER_LENGTH = 72;

    private Header mHeader;

    private List<String> mMIMETypeList; // Can be removed if not needed

    /**
     * Opens the file at {@code path}, reads its header and MIME type list and
     * closes it again.
     *
     * @param path location of the ZIM file
     * @throws UncheckedIOException if the file does not exist, is not a ZIM
     *         file, is truncated, or stores offsets that do not fit into an
     *         {@code int}; the cause carries the details
     */
    public ZIMFile(final String path) {
        super(path);

        try {
            readHeader();
        } catch (final IOException e) {
            // Failing here beats handing out an object whose getters throw
            // NullPointerException or return half-read values.
            throw new UncheckedIOException("Cannot read ZIM header of " + path, e);
        }
    }

    /**
     * Reads the fixed header fields and the MIME type list.
     */
    private void readHeader() throws IOException {

        // Check whether the file exists
        if (!exists()) {
            throw new FileNotFoundException(
                    "The file that you specified was not found.");
        }

        try (RandomAccessFile raf = new RandomAccessFile(this, "r")) {
            final byte[] raw = new byte[FIXED_HEADER_LENGTH];
            raf.readFully(raw);
            final ByteBuffer fields = ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN);

            final Header header = new Header();

            header.magicNumber = fields.getInt();
            if (header.magicNumber != MAGIC_NUMBER) {
                throw new IOException("Not a ZIM file (magic number "
                        + header.magicNumber + "): " + getPath());
            }

            header.version = fields.getInt();

            // The UUID is 16 bytes wide; the int field keeps its lowest 32
            // bits and the remaining 12 bytes are skipped.
            header.uuid = fields.getInt();
            fields.position(fields.position() + 12);

            header.articleCount = fields.getInt();
            header.clusterCount = fields.getInt();
            header.urlPtrPos = toIntOffset(fields.getLong(), "urlPtrPos");
            header.titlePtrPos = toIntOffset(fields.getLong(), "titlePtrPos");
            header.clusterPtrPos = toIntOffset(fields.getLong(), "clusterPtrPos");
            header.mimeListPos = toIntOffset(fields.getLong(), "mimeListPos");
            header.mainPage = fields.getInt();
            header.layoutPage = fields.getInt();

            // The MIME list starts where the header says it does, which is not
            // necessarily right behind the fields read above. A position
            // inside those fields cannot be right, so in that case reading
            // simply continues from here.
            if (header.mimeListPos >= FIXED_HEADER_LENGTH) {
                raf.seek(header.mimeListPos);
            }

            // The list is a run of zero-terminated strings closed by an empty one.
            final List<String> mimeTypes = new ArrayList<>();
            while (true) {
                final String mimeType = readZeroTerminated(raf);
                if (mimeType.isEmpty()) {
                    break;
                }
                mimeTypes.add(mimeType);
            }

            this.mHeader = header;
            this.mMIMETypeList = mimeTypes;
        }
    }

    /**
     * Narrows a 64-bit header offset to the int the getters expose, rejecting
     * values that would not survive the conversion.
     */
    private static int toIntOffset(final long value, final String field) throws IOException {
        if (value < 0 || value > Integer.MAX_VALUE) {
            throw new IOException("Header field " + field + " = "
                    + Long.toUnsignedString(value) + " does not fit into an int");
        }
        return (int) value;
    }

    /**
     * Reads bytes up to (and consuming) the next zero byte and decodes them
     * as UTF-8.
     */
    private static String readZeroTerminated(final RandomAccessFile raf) throws IOException {
        final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        int b = raf.read();
        while (b != 0) {
            if (b < 0) {
                throw new EOFException("end of file inside the MIME type list");
            }
            bytes.write(b);
            b = raf.read();
        }
        return new String(bytes.toByteArray(), StandardCharsets.UTF_8);
    }

    /**
     * @return the version field of the header
     */
    public int getVersion() {
        return this.mHeader.version;
    }

    /**
     * @return the lowest 32 bits of the UUID stored in the header
     */
    public int getUuid() {
        return this.mHeader.uuid;
    }

    /**
     * @return the article count stored in the header
     */
    public int getArticleCount() {
        return this.mHeader.articleCount;
    }

    /**
     * @return the cluster count stored in the header
     */
    public int getClusterCount() {
        return this.mHeader.clusterCount;
    }

    /**
     * @return the file position of the URL pointer list
     */
    public int getUrlPtrPos() {
        return this.mHeader.urlPtrPos;
    }

    /**
     * @return the file position of the title pointer list
     */
    public int getTitlePtrPos() {
        return this.mHeader.titlePtrPos;
    }

    /**
     * @return the file position of the cluster pointer list
     */
    public int getClusterPtrPos() {
        return this.mHeader.clusterPtrPos;
    }

    /**
     * @param mimeNumber index into the MIME type list
     * @return the MIME type stored at that index
     * @throws IndexOutOfBoundsException if the list has no such index
     */
    public String getMIMEType(final int mimeNumber) {
        return this.mMIMETypeList.get(mimeNumber);
    }

    /**
     * @return the position of the MIME type list, which this class treats as
     *         the size of the header
     */
    public int getHeaderSize() {
        return this.mHeader.mimeListPos;
    }

    /**
     * @return the main page field of the header
     */
    public int getMainPage() {
        return this.mHeader.mainPage;
    }

    /**
     * @return the layout page field of the header
     */
    public int getLayoutPage() {
        return this.mHeader.layoutPage;
    }

    /**
     * Plain holder for the header fields, in the order they are read.
     */
    public class Header {
        int magicNumber;
        int version;
        int uuid;
        int articleCount;
        int clusterCount;
        int urlPtrPos;
        int titlePtrPos;
        int clusterPtrPos;
        int mimeListPos;
        int mainPage;
        int layoutPage;
    }

}
+ */ + +package org.openzim; + +import java.io.ByteArrayOutputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.ArrayList; +import java.util.List; + +import org.tukaani.xz.SingleXZInputStream; + +/** + * @author Arunesh Mathur + * + * A ZIMReader that reads data from the ZIMFile + * + */ +public class ZIMReader { + + private final ZIMFile mFile; + private RandomAcessFileZIMInputStream mReader; + + public ZIMReader(final ZIMFile file) { + this.mFile = file; + try { + this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile( + this.mFile, "r")); + } catch (final FileNotFoundException e) { + e.printStackTrace(); + } + } + + public List getURLListByURL() throws IOException { + + int i = 0, pos, mimeType; + + final byte[] buffer = new byte[8]; + + // The list that will eventually return the list of URL's + final ArrayList returnList = new ArrayList<>(); + + // Move to the spot where URL's are listed + this.mReader.seek(this.mFile.getUrlPtrPos()); + + for (i = 0; i < this.mFile.getArticleCount(); i++) { + + // The position of URL i + pos = this.mReader.readEightLittleEndianBytesValue(buffer); + + // Mark the current position that we need to return to + this.mReader.mark(); + + // Move to the position of URL i + this.mReader.seek(pos); + + // Article or Redirect entry? 
+ mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer); + + if (mimeType == 65535) { + this.mReader.seek(pos + 12); + returnList.add(this.mReader.readString()); + } else { + this.mReader.seek(pos + 16); + returnList.add(this.mReader.readString()); + } + + this.mReader.reset(); + } + + return returnList; + } + + public List getURLListByTitle() throws IOException { + + int i = 0, pos, mimeType, articleNumber, urlPtrPos; + + final byte[] buffer = new byte[8]; + + // The list that will eventually return the list of URL's + final ArrayList returnList = new ArrayList<>(); + + // Get the UrlPtrPos or one time storage + urlPtrPos = this.mFile.getUrlPtrPos(); + + // Move to the spot where URL's are listed + this.mReader.seek(this.mFile.getTitlePtrPos()); + + for (i = 0; i < this.mFile.getArticleCount(); i++) { + + // The articleNumber of the position of URL i + articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer); + + // Mark the current position that we need to return to + this.mReader.mark(); + + this.mReader.seek(urlPtrPos + (8 * (articleNumber))); + + // The position of URL i + pos = this.mReader.readEightLittleEndianBytesValue(buffer); + this.mReader.seek(pos); + + // Article or Redirect entry? 
+ mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer); + + if (mimeType == 65535) { + this.mReader.seek(pos + 12); + final String url = this.mReader.readString(); + returnList.add(url); + } else { + this.mReader.seek(pos + 16); + final String url = this.mReader.readString(); + returnList.add(url); + } + + // Return to the marked position + this.mReader.reset(); + } + + return returnList; + } + + // Gives the minimum required information needed for the given articleName + public DirectoryEntry getDirectoryInfo(String articleName, final char namespace) + throws IOException { + + DirectoryEntry entry; + String cmpStr; + final int numberOfArticles = this.mFile.getArticleCount(); + int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid; + + articleName = namespace + "/" + articleName; + + while (beg <= end) { + mid = beg + 4 * (((end - beg) / 4) / 2); + entry = getDirectoryInfoAtTitlePosition(mid); + if (entry == null) { + return null; + } + cmpStr = entry.getNamespace() + "/" + entry.getUrl(); + if (articleName.compareTo(cmpStr) < 0) { + end = mid - 4; + + } else if (articleName.compareTo(cmpStr) > 0) { + beg = mid + 4; + + } else { + return entry; + } + } + + return null; + + } + + public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException { + + // search in the cache first, if not found, then call getDirectoryInfo(articleName) + + byte[] buffer = new byte[8]; + + final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace); + + if (mainEntry != null) { + + // Check what kind of an entry was mainEnrty + if (mainEntry.getClass() == ArticleEntry.class) { + + // Cast to ArticleEntry + final ArticleEntry article = (ArticleEntry) mainEntry; + + // Get the cluster and blob numbers from the article + final int clusterNumber = article.getClusterNumber(); + final int blobNumber = article.getBlobnumber(); + + // Move to the cluster entry in the clusterPtrPos + 
this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8); + + // Read the location of the cluster + final int clusterPos = this.mReader + .readEightLittleEndianBytesValue(buffer); + + // Move to the cluster + this.mReader.seek(clusterPos); + + // Read the first byte, for compression information + final int compressionType = this.mReader.read(); + + // Reference declaration + SingleXZInputStream xzReader = null; + int firstOffset, numberOfBlobs, offset1, + offset2, + location, + differenceOffset; + + ByteArrayOutputStream baos; + + // Check the compression type that was read + switch (compressionType) { + + // TODO: Read uncompressed data directly + case 0: + case 1: + + // Read the first 4 bytes to find out the number of artciles + buffer = new byte[4]; + + // Create a dictionary with size 40MiB, the zimlib uses this + // size while creating + + // Read the first offset + this.mReader.read(buffer); + + // The first four bytes are the offset of the zeroth blob + firstOffset = Utilities + .toFourLittleEndianInteger(buffer); + + // The number of blobs + numberOfBlobs = firstOffset / 4; + + // The blobNumber has to be lesser than the numberOfBlobs + assert blobNumber < numberOfBlobs; + + + if (blobNumber == 0) { + // The first offset is what we read earlier + offset1 = firstOffset; + } else { + + location = (blobNumber - 1) * 4; + Utilities.skipFully(this.mReader, location); + this.mReader.read(buffer); + offset1 = Utilities.toFourLittleEndianInteger(buffer); + } + + this.mReader.read(buffer); + offset2 = Utilities.toFourLittleEndianInteger(buffer); + + differenceOffset = offset2 - offset1; + buffer = new byte[differenceOffset]; + + Utilities.skipFully(this.mReader, + (offset1 - 4 * (blobNumber + 2))); + + this.mReader.read(buffer, 0, differenceOffset); + + baos = new ByteArrayOutputStream(); + baos.write(buffer, 0, differenceOffset); + + return baos; + + // LZMA2 compressed data + case 4: + + // Read the first 4 bytes to find out the number of artciles + 
buffer = new byte[4]; + + // Create a dictionary with size 40MiB, the zimlib uses this + // size while creating + xzReader = new SingleXZInputStream(this.mReader, 4194304); + + // Read the first offset + xzReader.read(buffer); + + // The first four bytes are the offset of the zeroth blob + firstOffset = Utilities + .toFourLittleEndianInteger(buffer); + + // The number of blobs + numberOfBlobs = firstOffset / 4; + + // The blobNumber has to be lesser than the numberOfBlobs + assert blobNumber < numberOfBlobs; + + if(blobNumber == 0) { + // The first offset is what we read earlier + offset1 = firstOffset; + } else { + + location = (blobNumber - 1) * 4; + Utilities.skipFully(xzReader, location); + xzReader.read(buffer); + offset1 = Utilities.toFourLittleEndianInteger(buffer); + } + + xzReader.read(buffer); + offset2 = Utilities.toFourLittleEndianInteger(buffer); + + differenceOffset = offset2 - offset1; + buffer = new byte[differenceOffset]; + + Utilities.skipFully(xzReader, + (offset1 - 4 * (blobNumber + 2))); + + xzReader.read(buffer, 0, differenceOffset); + + baos = new ByteArrayOutputStream(); + baos.write(buffer, 0, differenceOffset); + + return baos; + + } + } + } + + return null; + + } + + public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position) + throws IOException { + + // Helpers + int pos; + final byte[] buffer = new byte[8]; + + // At the appropriate position in the titlePtrPos + this.mReader.seek(position); + + // Get value of article at index + pos = this.mReader.readFourLittleEndianBytesValue(buffer); + + // Move to the position in urlPtrPos + this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos); + + // Get value of article in urlPtrPos + pos = this.mReader.readEightLittleEndianBytesValue(buffer); + + // Go to the location of the directory entry + this.mReader.seek(pos); + + final int type = this.mReader.readTwoLittleEndianBytesValue(buffer); + + // Ignore the parameter length + this.mReader.read(); + + final char namespace = (char) 
this.mReader.read(); + // System.out.println("Namepsace: " + namespace); + + final int revision = this.mReader.readFourLittleEndianBytesValue(buffer); + // System.out.println("Revision: " + revision); + + // TODO: Remove redundant if condition code + // Article or Redirect entry + if (type == 65535) { + + // System.out.println("MIMEType: " + type); + + final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer); + // System.out.println("RedirectIndex: " + redirectIndex); + + final String url = this.mReader.readString(); + // System.out.println("URL: " + url); + + String title = this.mReader.readString(); + title = title.equals("") ? url : title; + // System.out.println("Title: " + title); + + return new RedirectEntry(type, namespace, revision, redirectIndex, + url, title, (position - this.mFile.getUrlPtrPos()) / 8); + + } else { + + // System.out.println("MIMEType: " + mFile.getMIMEType(type)); + + final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer); + // System.out.println("Cluster Number: " + clusterNumber); + + final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer); + // System.out.println("Blob Number: " + blobNumber); + + final String url = this.mReader.readString(); + // System.out.println("URL: " + url); + + String title = this.mReader.readString(); + title = title.equals("") ? url : title; + // System.out.println("Title: " + title); + + // Parameter data ignored + + return new ArticleEntry(type, namespace, revision, clusterNumber, + blobNumber, url, title, + (position - this.mFile.getUrlPtrPos()) / 8); + } + + } + + public ZIMFile getZIMFile() { + return this.mFile; + } +} diff --git a/source/org/openzim/ZIMTest.java b/source/org/openzim/ZIMTest.java new file mode 100644 index 000000000..6d8ed64fb --- /dev/null +++ b/source/org/openzim/ZIMTest.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2011 Arunesh Mathur + * + * This file is a part of zimreader-java. 
+ * + * zimreader-java is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3.0 as + * published by the Free Software Foundation. + * + * zimreader-java is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with zimreader-java. If not, see . + */ + +package org.openzim; + +import java.io.IOException; + +public class ZIMTest { + public static void main(final String[] args) { + if(args.length!=2) { + System.out.println("Usage: java ZIMTest "); + System.exit(0); + } + + // args[0] is the Zim File's location + final ZIMFile file = new ZIMFile(args[0]); + + // Associate the Zim File with a Reader + final ZIMReader zReader = new ZIMReader(file); + + try { + // args[1] is the name of the articles that is + // to be fetched + System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8")); + } catch (final IOException e) { + e.printStackTrace(); + } + } +}