the raw format reader files with no integration in YaCy yet, which may follow as a next step. The ZIM file format is documented at https://openzim.org and the reader code was taken from the archived, unmaintained repository at https://github.com/openzim/zimreader-java pull/607/head
parent
4308aa5415
commit
1fefae9baf
@ -0,0 +1,46 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011 Arunesh Mathur
|
||||||
|
*
|
||||||
|
* This file is a part of zimreader-java.
|
||||||
|
*
|
||||||
|
* zimreader-java is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* zimreader-java is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.openzim;
|
||||||
|
|
||||||
|
public class ArticleEntry extends DirectoryEntry {
|
||||||
|
|
||||||
|
int clusterNumber;
|
||||||
|
|
||||||
|
int blobnumber;
|
||||||
|
|
||||||
|
public ArticleEntry(final int mimeType, final char namespace, final int revision,
|
||||||
|
final int clusterNumber, final int blobNumber, final String url, final String title,
|
||||||
|
final int urlListindex) {
|
||||||
|
|
||||||
|
super(mimeType, namespace, revision, url, title, urlListindex);
|
||||||
|
|
||||||
|
this.clusterNumber = clusterNumber;
|
||||||
|
this.blobnumber = blobNumber;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getClusterNumber() {
|
||||||
|
return this.clusterNumber;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getBlobnumber() {
|
||||||
|
return this.blobnumber;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,69 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011 Arunesh Mathur
|
||||||
|
*
|
||||||
|
* This file is a part of zimreader-java.
|
||||||
|
*
|
||||||
|
* zimreader-java is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* zimreader-java is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.openzim;
|
||||||
|
|
||||||
|
/**
 * Common base of all directory entries. It is a plain value holder for the
 * attributes shared by every entry kind: MIME type number, namespace,
 * revision, URL, title and the entry's URL-list index.
 */
public abstract class DirectoryEntry {

    /** MIME type number of the entry. */
    int mimeType;

    /** Namespace character of the entry. */
    char namespace;

    /** Revision value of the entry. */
    int revision;

    /** URL of the entry. */
    String url;

    /** Title of the entry. */
    String title;

    /** Index value supplied by the creator of the entry. */
    int urlListindex;

    /**
     * Stores the given attributes unchanged.
     *
     * @param mimeType  MIME type number
     * @param namespace namespace character
     * @param revision  revision value
     * @param url       URL of the entry
     * @param title     title of the entry
     * @param index     value returned later by {@link #getUrlListindex()}
     */
    public DirectoryEntry(
            final int mimeType,
            final char namespace,
            final int revision,
            final String url,
            final String title,
            final int index) {
        this.urlListindex = index;
        this.title = title;
        this.url = url;
        this.revision = revision;
        this.namespace = namespace;
        this.mimeType = mimeType;
    }

    /** @return the MIME type number */
    public int getMimeType() {
        return mimeType;
    }

    /** @return the namespace character */
    public char getNamespace() {
        return namespace;
    }

    /** @return the revision value */
    public int getRevision() {
        return revision;
    }

    /** @return the URL */
    public String getUrl() {
        return url;
    }

    /** @return the title */
    public String getTitle() {
        return title;
    }

    /** @return the index passed to the constructor */
    public int getUrlListindex() {
        return urlListindex;
    }
}
|
@ -0,0 +1,135 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011 Arunesh Mathur
|
||||||
|
*
|
||||||
|
* This file is a part of zimreader-java.
|
||||||
|
*
|
||||||
|
* zimreader-java is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* zimreader-java is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.openzim;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.RandomAccessFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is an implementation of RandomAccessFile to ensure that it is an
|
||||||
|
* InputStream as well, specifically designed for reading a ZIM file. Ad-Hoc
|
||||||
|
* implementation, can be improved.
|
||||||
|
*
|
||||||
|
* @author Arunesh Mathur <aruneshmathur1990 at gmail.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class RandomAcessFileZIMInputStream extends InputStream {
|
||||||
|
|
||||||
|
private final RandomAccessFile mRAFReader;
|
||||||
|
|
||||||
|
private long mMarked = -1;
|
||||||
|
|
||||||
|
public RandomAcessFileZIMInputStream(final RandomAccessFile reader) {
|
||||||
|
this.mRAFReader = reader;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Remove the parameter buffer
|
||||||
|
public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException {
|
||||||
|
if (buffer.length < 2) {
|
||||||
|
throw new OutOfMemoryError("buffer too small");
|
||||||
|
} else {
|
||||||
|
this.mRAFReader.read(buffer, 0, 2);
|
||||||
|
return Utilities.toTwoLittleEndianInteger(buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Remove the parameter buffer
|
||||||
|
public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException {
|
||||||
|
if (buffer.length < 4) {
|
||||||
|
throw new OutOfMemoryError("buffer too small");
|
||||||
|
} else {
|
||||||
|
this.mRAFReader.read(buffer, 0, 4);
|
||||||
|
return Utilities.toFourLittleEndianInteger(buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Remove the parameter buffer
|
||||||
|
public int readEightLittleEndianBytesValue(final byte[] buffer)
|
||||||
|
throws IOException {
|
||||||
|
if (buffer.length < 8) {
|
||||||
|
throw new OutOfMemoryError("buffer too small");
|
||||||
|
} else {
|
||||||
|
this.mRAFReader.read(buffer, 0, 8);
|
||||||
|
return Utilities.toEightLittleEndianInteger(buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Remove the parameter buffer
|
||||||
|
public int readSixteenLittleEndianBytesValue(final byte[] buffer)
|
||||||
|
throws IOException {
|
||||||
|
if (buffer.length < 16) {
|
||||||
|
throw new OutOfMemoryError("buffer too small");
|
||||||
|
} else {
|
||||||
|
this.mRAFReader.read(buffer, 0, 16);
|
||||||
|
return Utilities.toSixteenLittleEndianInteger(buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reads characters from the current position into a String and stops when a
|
||||||
|
// '\0' is encountered
|
||||||
|
public String readString() throws IOException {
|
||||||
|
final StringBuffer sb = new StringBuffer();
|
||||||
|
/*
|
||||||
|
* int i; byte[] buffer = new byte[100]; while (true) {
|
||||||
|
* mRAFReader.read(buffer); for (i = 0; i < buffer.length; i++) { if
|
||||||
|
* (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i
|
||||||
|
* != buffer.length) break; } return sb.toString();
|
||||||
|
*/
|
||||||
|
int b;
|
||||||
|
b = this.mRAFReader.read();
|
||||||
|
while (b != '\0') {
|
||||||
|
sb.append((char) b);
|
||||||
|
b = this.mRAFReader.read();
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read() throws IOException {
|
||||||
|
return this.mRAFReader.read();
|
||||||
|
}
|
||||||
|
|
||||||
|
public RandomAccessFile getRandomAccessFile() {
|
||||||
|
return this.mRAFReader;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void seek(final long pos) throws IOException {
|
||||||
|
this.mRAFReader.seek(pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getFilePointer() throws IOException {
|
||||||
|
return this.mRAFReader.getFilePointer();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void mark() throws IOException {
|
||||||
|
this.mMarked = this.mRAFReader.getFilePointer();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
if (this.mMarked == -1) {
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
this.mRAFReader.seek(this.mMarked);
|
||||||
|
this.mMarked = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,37 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011 Arunesh Mathur
|
||||||
|
*
|
||||||
|
* This file is a part of zimreader-java.
|
||||||
|
*
|
||||||
|
* zimreader-java is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* zimreader-java is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.openzim;
|
||||||
|
|
||||||
|
public class RedirectEntry extends DirectoryEntry {
|
||||||
|
|
||||||
|
int redirectIndex;
|
||||||
|
|
||||||
|
public RedirectEntry(final int mimeType, final char namespace, final int revision,
|
||||||
|
final int redirectIndex, final String url, final String title, final int urlListindex) {
|
||||||
|
|
||||||
|
super(mimeType, namespace, revision, url, title, urlListindex);
|
||||||
|
|
||||||
|
this.redirectIndex = redirectIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getRedirectIndex() {
|
||||||
|
return this.redirectIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,84 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011 Arunesh Mathur
|
||||||
|
*
|
||||||
|
* This file is a part of zimreader-java.
|
||||||
|
*
|
||||||
|
* zimreader-java is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* zimreader-java is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.openzim;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
/**
 * Static helpers for decoding little-endian integers from byte buffers and
 * for skipping bytes on a stream.
 */
public class Utilities {

    // TODO: Write a binary search algorithm
    /**
     * Placeholder, not implemented yet.
     *
     * @return always -1
     */
    public static int binarySearch() {
        return -1;
    }

    /**
     * Decodes the first two bytes of the buffer as an unsigned little-endian
     * value.
     *
     * @param buffer at least two bytes
     * @return a value in 0..65535
     * @throws IllegalArgumentException if the buffer is too small
     */
    public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 2);
        return (buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8);
    }

    /**
     * Decodes the first four bytes of the buffer as a little-endian 32-bit
     * value.
     *
     * @param buffer at least four bytes
     * @return the decoded (signed) int
     * @throws IllegalArgumentException if the buffer is too small
     */
    public static int toFourLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 4);
        return littleEndianWord(buffer, 0);
    }

    /**
     * Decodes the first eight bytes of the buffer as a little-endian 64-bit
     * value and returns it as an int.
     *
     * <p>The value is assembled as a {@code long}. Shifting an {@code int} by
     * 32 or more does not work in Java (the shift distance is reduced modulo
     * 32), which would OR the upper four bytes into the lower ones and corrupt
     * the result.
     *
     * @param buffer at least eight bytes
     * @return the decoded value
     * @throws IllegalArgumentException if the buffer is too small
     * @throws IOException if the decoded value cannot be represented as an int
     */
    public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 8);
        final long low = littleEndianWord(buffer, 0) & 0xFFFFFFFFL;
        final long high = littleEndianWord(buffer, 4) & 0xFFFFFFFFL;
        final long value = low | (high << 32);
        if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) {
            throw new IOException("64-bit value " + value + " does not fit into an int");
        }
        return (int) value;
    }

    /**
     * Condenses the first sixteen bytes of the buffer into an int.
     *
     * <p>A 128-bit value cannot be represented as an int. The result is the
     * bitwise OR of the four little-endian 32-bit words of the buffer, which
     * is exactly what the historic shift expression of this method evaluated
     * to; it is kept so that existing values do not change.
     *
     * @param buffer at least sixteen bytes
     * @return the OR of the four 32-bit words
     * @throws IllegalArgumentException if the buffer is too small
     */
    public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 16);
        return littleEndianWord(buffer, 0)
                | littleEndianWord(buffer, 4)
                | littleEndianWord(buffer, 8)
                | littleEndianWord(buffer, 12);
    }

    /**
     * Skips exactly {@code bytes} bytes of the stream. A non-positive count
     * skips nothing.
     *
     * @param stream the stream to advance
     * @param bytes  number of bytes to skip
     * @throws java.io.EOFException if the stream ends before all bytes were skipped
     * @throws IOException on read errors
     */
    public static void skipFully(final InputStream stream, final long bytes) throws IOException {
        long remaining = bytes;
        while (remaining > 0) {
            final long skipped = stream.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
                continue;
            }
            // skip() may return 0 without being at the end; a single read() tells
            // both cases apart and prevents an endless loop at end of stream
            if (stream.read() < 0) {
                throw new java.io.EOFException("stream ended with " + remaining + " bytes left to skip");
            }
            remaining--;
        }
    }

    /** Decodes four bytes starting at {@code offset} as a little-endian int. */
    private static int littleEndianWord(final byte[] buffer, final int offset) {
        return (buffer[offset] & 0xFF)
                | ((buffer[offset + 1] & 0xFF) << 8)
                | ((buffer[offset + 2] & 0xFF) << 16)
                | ((buffer[offset + 3] & 0xFF) << 24);
    }

    /** Rejects buffers that hold fewer than {@code required} bytes. */
    private static void requireLength(final byte[] buffer, final int required) {
        if (buffer.length < required) {
            throw new IllegalArgumentException("buffer too small");
        }
    }
}
|
@ -0,0 +1,198 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011 Arunesh Mathur
|
||||||
|
*
|
||||||
|
* This file is a part of zimreader-java.
|
||||||
|
*
|
||||||
|
* zimreader-java is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* zimreader-java is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.openzim;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.RandomAccessFile;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author Arunesh Mathur
|
||||||
|
*
|
||||||
|
* A ZIM file implementation that stores the Header and the MIMETypeList
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ZIMFile extends File {
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
private static final long serialVersionUID = 1L;
|
||||||
|
|
||||||
|
private Header mHeader;
|
||||||
|
|
||||||
|
private List<String> mMIMETypeList; // Can be removed if not needed
|
||||||
|
|
||||||
|
public ZIMFile(final String path) {
|
||||||
|
super(path);
|
||||||
|
|
||||||
|
try {
|
||||||
|
readHeader();
|
||||||
|
} catch (final FileNotFoundException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readHeader() throws FileNotFoundException {
|
||||||
|
|
||||||
|
// Helpers
|
||||||
|
int len = 0;
|
||||||
|
StringBuffer mimeBuffer = null;
|
||||||
|
|
||||||
|
// The byte[] that will help us in reading bytes out of the file
|
||||||
|
final byte[] buffer = new byte[16];
|
||||||
|
|
||||||
|
// Check whether the file exists
|
||||||
|
if (!(this.exists())) {
|
||||||
|
throw new FileNotFoundException(
|
||||||
|
"The file that you specified was not found.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// The reader that will be used to read contents from the file
|
||||||
|
|
||||||
|
final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(
|
||||||
|
new RandomAccessFile(this, "r"));
|
||||||
|
|
||||||
|
// The ZIM file header
|
||||||
|
this.mHeader = new Header();
|
||||||
|
|
||||||
|
// Read the contents of the header
|
||||||
|
try {
|
||||||
|
this.mHeader.magicNumber = reader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.magicNumber);
|
||||||
|
|
||||||
|
this.mHeader.version = reader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.version);
|
||||||
|
|
||||||
|
this.mHeader.uuid = reader.readSixteenLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.uuid); reader.read(buffer, 0, 4);
|
||||||
|
|
||||||
|
this.mHeader.articleCount = reader
|
||||||
|
.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.articleCount);
|
||||||
|
|
||||||
|
this.mHeader.clusterCount = reader
|
||||||
|
.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.clusterCount);
|
||||||
|
|
||||||
|
this.mHeader.urlPtrPos = reader.readEightLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.urlPtrPos);
|
||||||
|
|
||||||
|
this.mHeader.titlePtrPos = reader
|
||||||
|
.readEightLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.titlePtrPos);
|
||||||
|
|
||||||
|
this.mHeader.clusterPtrPos = reader
|
||||||
|
.readEightLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.clusterPtrPos);
|
||||||
|
|
||||||
|
this.mHeader.mimeListPos = reader
|
||||||
|
.readEightLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.mimeListPos);
|
||||||
|
|
||||||
|
this.mHeader.mainPage = reader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.mainPage);
|
||||||
|
|
||||||
|
this.mHeader.layoutPage = reader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println(mHeader.layoutPage);
|
||||||
|
|
||||||
|
// Initialise the MIMETypeList
|
||||||
|
this.mMIMETypeList = new ArrayList<>();
|
||||||
|
while (true) {
|
||||||
|
reader.read(buffer, 0, 1);
|
||||||
|
len = 0;
|
||||||
|
mimeBuffer = new StringBuffer();
|
||||||
|
while (buffer[0] != '\0') {
|
||||||
|
mimeBuffer.append((char) buffer[0]);
|
||||||
|
reader.read(buffer, 0, 1);
|
||||||
|
len++;
|
||||||
|
}
|
||||||
|
if (len == 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
this.mMIMETypeList.add(mimeBuffer.toString());
|
||||||
|
// System.out.println(mimeBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (final Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getVersion() {
|
||||||
|
return this.mHeader.version;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getUuid() {
|
||||||
|
return this.mHeader.uuid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getArticleCount() {
|
||||||
|
return this.mHeader.articleCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getClusterCount() {
|
||||||
|
return this.mHeader.clusterCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getUrlPtrPos() {
|
||||||
|
return this.mHeader.urlPtrPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getTitlePtrPos() {
|
||||||
|
return this.mHeader.titlePtrPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getClusterPtrPos() {
|
||||||
|
return this.mHeader.clusterPtrPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getMIMEType(final int mimeNumber) {
|
||||||
|
return this.mMIMETypeList.get(mimeNumber);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getHeaderSize() {
|
||||||
|
return this.mHeader.mimeListPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMainPage() {
|
||||||
|
return this.mHeader.mainPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getLayoutPage() {
|
||||||
|
return this.mHeader.layoutPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Header {
|
||||||
|
int magicNumber;
|
||||||
|
int version;
|
||||||
|
int uuid;
|
||||||
|
int articleCount;
|
||||||
|
int clusterCount;
|
||||||
|
int urlPtrPos;
|
||||||
|
int titlePtrPos;
|
||||||
|
int clusterPtrPos;
|
||||||
|
int mimeListPos;
|
||||||
|
int mainPage;
|
||||||
|
int layoutPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,408 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011 Arunesh Mathur
|
||||||
|
*
|
||||||
|
* This file is a part of zimreader-java.
|
||||||
|
*
|
||||||
|
* zimreader-java is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* zimreader-java is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.openzim;
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.RandomAccessFile;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.tukaani.xz.SingleXZInputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author Arunesh Mathur
|
||||||
|
*
|
||||||
|
* A ZIMReader that reads data from the ZIMFile
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ZIMReader {
|
||||||
|
|
||||||
|
private final ZIMFile mFile;
|
||||||
|
private RandomAcessFileZIMInputStream mReader;
|
||||||
|
|
||||||
|
public ZIMReader(final ZIMFile file) {
|
||||||
|
this.mFile = file;
|
||||||
|
try {
|
||||||
|
this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(
|
||||||
|
this.mFile, "r"));
|
||||||
|
} catch (final FileNotFoundException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getURLListByURL() throws IOException {
|
||||||
|
|
||||||
|
int i = 0, pos, mimeType;
|
||||||
|
|
||||||
|
final byte[] buffer = new byte[8];
|
||||||
|
|
||||||
|
// The list that will eventually return the list of URL's
|
||||||
|
final ArrayList<String> returnList = new ArrayList<>();
|
||||||
|
|
||||||
|
// Move to the spot where URL's are listed
|
||||||
|
this.mReader.seek(this.mFile.getUrlPtrPos());
|
||||||
|
|
||||||
|
for (i = 0; i < this.mFile.getArticleCount(); i++) {
|
||||||
|
|
||||||
|
// The position of URL i
|
||||||
|
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
|
||||||
|
|
||||||
|
// Mark the current position that we need to return to
|
||||||
|
this.mReader.mark();
|
||||||
|
|
||||||
|
// Move to the position of URL i
|
||||||
|
this.mReader.seek(pos);
|
||||||
|
|
||||||
|
// Article or Redirect entry?
|
||||||
|
mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
|
||||||
|
|
||||||
|
if (mimeType == 65535) {
|
||||||
|
this.mReader.seek(pos + 12);
|
||||||
|
returnList.add(this.mReader.readString());
|
||||||
|
} else {
|
||||||
|
this.mReader.seek(pos + 16);
|
||||||
|
returnList.add(this.mReader.readString());
|
||||||
|
}
|
||||||
|
|
||||||
|
this.mReader.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
return returnList;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getURLListByTitle() throws IOException {
|
||||||
|
|
||||||
|
int i = 0, pos, mimeType, articleNumber, urlPtrPos;
|
||||||
|
|
||||||
|
final byte[] buffer = new byte[8];
|
||||||
|
|
||||||
|
// The list that will eventually return the list of URL's
|
||||||
|
final ArrayList<String> returnList = new ArrayList<>();
|
||||||
|
|
||||||
|
// Get the UrlPtrPos or one time storage
|
||||||
|
urlPtrPos = this.mFile.getUrlPtrPos();
|
||||||
|
|
||||||
|
// Move to the spot where URL's are listed
|
||||||
|
this.mReader.seek(this.mFile.getTitlePtrPos());
|
||||||
|
|
||||||
|
for (i = 0; i < this.mFile.getArticleCount(); i++) {
|
||||||
|
|
||||||
|
// The articleNumber of the position of URL i
|
||||||
|
articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
|
||||||
|
// Mark the current position that we need to return to
|
||||||
|
this.mReader.mark();
|
||||||
|
|
||||||
|
this.mReader.seek(urlPtrPos + (8 * (articleNumber)));
|
||||||
|
|
||||||
|
// The position of URL i
|
||||||
|
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
|
||||||
|
this.mReader.seek(pos);
|
||||||
|
|
||||||
|
// Article or Redirect entry?
|
||||||
|
mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
|
||||||
|
|
||||||
|
if (mimeType == 65535) {
|
||||||
|
this.mReader.seek(pos + 12);
|
||||||
|
final String url = this.mReader.readString();
|
||||||
|
returnList.add(url);
|
||||||
|
} else {
|
||||||
|
this.mReader.seek(pos + 16);
|
||||||
|
final String url = this.mReader.readString();
|
||||||
|
returnList.add(url);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return to the marked position
|
||||||
|
this.mReader.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
return returnList;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Gives the minimum required information needed for the given articleName
|
||||||
|
public DirectoryEntry getDirectoryInfo(String articleName, final char namespace)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
DirectoryEntry entry;
|
||||||
|
String cmpStr;
|
||||||
|
final int numberOfArticles = this.mFile.getArticleCount();
|
||||||
|
int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid;
|
||||||
|
|
||||||
|
articleName = namespace + "/" + articleName;
|
||||||
|
|
||||||
|
while (beg <= end) {
|
||||||
|
mid = beg + 4 * (((end - beg) / 4) / 2);
|
||||||
|
entry = getDirectoryInfoAtTitlePosition(mid);
|
||||||
|
if (entry == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
cmpStr = entry.getNamespace() + "/" + entry.getUrl();
|
||||||
|
if (articleName.compareTo(cmpStr) < 0) {
|
||||||
|
end = mid - 4;
|
||||||
|
|
||||||
|
} else if (articleName.compareTo(cmpStr) > 0) {
|
||||||
|
beg = mid + 4;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
return entry;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException {
|
||||||
|
|
||||||
|
// search in the cache first, if not found, then call getDirectoryInfo(articleName)
|
||||||
|
|
||||||
|
byte[] buffer = new byte[8];
|
||||||
|
|
||||||
|
final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace);
|
||||||
|
|
||||||
|
if (mainEntry != null) {
|
||||||
|
|
||||||
|
// Check what kind of an entry was mainEnrty
|
||||||
|
if (mainEntry.getClass() == ArticleEntry.class) {
|
||||||
|
|
||||||
|
// Cast to ArticleEntry
|
||||||
|
final ArticleEntry article = (ArticleEntry) mainEntry;
|
||||||
|
|
||||||
|
// Get the cluster and blob numbers from the article
|
||||||
|
final int clusterNumber = article.getClusterNumber();
|
||||||
|
final int blobNumber = article.getBlobnumber();
|
||||||
|
|
||||||
|
// Move to the cluster entry in the clusterPtrPos
|
||||||
|
this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8);
|
||||||
|
|
||||||
|
// Read the location of the cluster
|
||||||
|
final int clusterPos = this.mReader
|
||||||
|
.readEightLittleEndianBytesValue(buffer);
|
||||||
|
|
||||||
|
// Move to the cluster
|
||||||
|
this.mReader.seek(clusterPos);
|
||||||
|
|
||||||
|
// Read the first byte, for compression information
|
||||||
|
final int compressionType = this.mReader.read();
|
||||||
|
|
||||||
|
// Reference declaration
|
||||||
|
SingleXZInputStream xzReader = null;
|
||||||
|
int firstOffset, numberOfBlobs, offset1,
|
||||||
|
offset2,
|
||||||
|
location,
|
||||||
|
differenceOffset;
|
||||||
|
|
||||||
|
ByteArrayOutputStream baos;
|
||||||
|
|
||||||
|
// Check the compression type that was read
|
||||||
|
switch (compressionType) {
|
||||||
|
|
||||||
|
// TODO: Read uncompressed data directly
|
||||||
|
case 0:
|
||||||
|
case 1:
|
||||||
|
|
||||||
|
// Read the first 4 bytes to find out the number of artciles
|
||||||
|
buffer = new byte[4];
|
||||||
|
|
||||||
|
// Create a dictionary with size 40MiB, the zimlib uses this
|
||||||
|
// size while creating
|
||||||
|
|
||||||
|
// Read the first offset
|
||||||
|
this.mReader.read(buffer);
|
||||||
|
|
||||||
|
// The first four bytes are the offset of the zeroth blob
|
||||||
|
firstOffset = Utilities
|
||||||
|
.toFourLittleEndianInteger(buffer);
|
||||||
|
|
||||||
|
// The number of blobs
|
||||||
|
numberOfBlobs = firstOffset / 4;
|
||||||
|
|
||||||
|
// The blobNumber has to be lesser than the numberOfBlobs
|
||||||
|
assert blobNumber < numberOfBlobs;
|
||||||
|
|
||||||
|
|
||||||
|
if (blobNumber == 0) {
|
||||||
|
// The first offset is what we read earlier
|
||||||
|
offset1 = firstOffset;
|
||||||
|
} else {
|
||||||
|
|
||||||
|
location = (blobNumber - 1) * 4;
|
||||||
|
Utilities.skipFully(this.mReader, location);
|
||||||
|
this.mReader.read(buffer);
|
||||||
|
offset1 = Utilities.toFourLittleEndianInteger(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.mReader.read(buffer);
|
||||||
|
offset2 = Utilities.toFourLittleEndianInteger(buffer);
|
||||||
|
|
||||||
|
differenceOffset = offset2 - offset1;
|
||||||
|
buffer = new byte[differenceOffset];
|
||||||
|
|
||||||
|
Utilities.skipFully(this.mReader,
|
||||||
|
(offset1 - 4 * (blobNumber + 2)));
|
||||||
|
|
||||||
|
this.mReader.read(buffer, 0, differenceOffset);
|
||||||
|
|
||||||
|
baos = new ByteArrayOutputStream();
|
||||||
|
baos.write(buffer, 0, differenceOffset);
|
||||||
|
|
||||||
|
return baos;
|
||||||
|
|
||||||
|
// LZMA2 compressed data
|
||||||
|
case 4:
|
||||||
|
|
||||||
|
// Read the first 4 bytes to find out the number of artciles
|
||||||
|
buffer = new byte[4];
|
||||||
|
|
||||||
|
// Create a dictionary with size 40MiB, the zimlib uses this
|
||||||
|
// size while creating
|
||||||
|
xzReader = new SingleXZInputStream(this.mReader, 4194304);
|
||||||
|
|
||||||
|
// Read the first offset
|
||||||
|
xzReader.read(buffer);
|
||||||
|
|
||||||
|
// The first four bytes are the offset of the zeroth blob
|
||||||
|
firstOffset = Utilities
|
||||||
|
.toFourLittleEndianInteger(buffer);
|
||||||
|
|
||||||
|
// The number of blobs
|
||||||
|
numberOfBlobs = firstOffset / 4;
|
||||||
|
|
||||||
|
// The blobNumber has to be lesser than the numberOfBlobs
|
||||||
|
assert blobNumber < numberOfBlobs;
|
||||||
|
|
||||||
|
if(blobNumber == 0) {
|
||||||
|
// The first offset is what we read earlier
|
||||||
|
offset1 = firstOffset;
|
||||||
|
} else {
|
||||||
|
|
||||||
|
location = (blobNumber - 1) * 4;
|
||||||
|
Utilities.skipFully(xzReader, location);
|
||||||
|
xzReader.read(buffer);
|
||||||
|
offset1 = Utilities.toFourLittleEndianInteger(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
xzReader.read(buffer);
|
||||||
|
offset2 = Utilities.toFourLittleEndianInteger(buffer);
|
||||||
|
|
||||||
|
differenceOffset = offset2 - offset1;
|
||||||
|
buffer = new byte[differenceOffset];
|
||||||
|
|
||||||
|
Utilities.skipFully(xzReader,
|
||||||
|
(offset1 - 4 * (blobNumber + 2)));
|
||||||
|
|
||||||
|
xzReader.read(buffer, 0, differenceOffset);
|
||||||
|
|
||||||
|
baos = new ByteArrayOutputStream();
|
||||||
|
baos.write(buffer, 0, differenceOffset);
|
||||||
|
|
||||||
|
return baos;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
// Helpers
|
||||||
|
int pos;
|
||||||
|
final byte[] buffer = new byte[8];
|
||||||
|
|
||||||
|
// At the appropriate position in the titlePtrPos
|
||||||
|
this.mReader.seek(position);
|
||||||
|
|
||||||
|
// Get value of article at index
|
||||||
|
pos = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
|
||||||
|
// Move to the position in urlPtrPos
|
||||||
|
this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos);
|
||||||
|
|
||||||
|
// Get value of article in urlPtrPos
|
||||||
|
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
|
||||||
|
|
||||||
|
// Go to the location of the directory entry
|
||||||
|
this.mReader.seek(pos);
|
||||||
|
|
||||||
|
final int type = this.mReader.readTwoLittleEndianBytesValue(buffer);
|
||||||
|
|
||||||
|
// Ignore the parameter length
|
||||||
|
this.mReader.read();
|
||||||
|
|
||||||
|
final char namespace = (char) this.mReader.read();
|
||||||
|
// System.out.println("Namepsace: " + namespace);
|
||||||
|
|
||||||
|
final int revision = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println("Revision: " + revision);
|
||||||
|
|
||||||
|
// TODO: Remove redundant if condition code
|
||||||
|
// Article or Redirect entry
|
||||||
|
if (type == 65535) {
|
||||||
|
|
||||||
|
// System.out.println("MIMEType: " + type);
|
||||||
|
|
||||||
|
final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println("RedirectIndex: " + redirectIndex);
|
||||||
|
|
||||||
|
final String url = this.mReader.readString();
|
||||||
|
// System.out.println("URL: " + url);
|
||||||
|
|
||||||
|
String title = this.mReader.readString();
|
||||||
|
title = title.equals("") ? url : title;
|
||||||
|
// System.out.println("Title: " + title);
|
||||||
|
|
||||||
|
return new RedirectEntry(type, namespace, revision, redirectIndex,
|
||||||
|
url, title, (position - this.mFile.getUrlPtrPos()) / 8);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// System.out.println("MIMEType: " + mFile.getMIMEType(type));
|
||||||
|
|
||||||
|
final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println("Cluster Number: " + clusterNumber);
|
||||||
|
|
||||||
|
final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||||
|
// System.out.println("Blob Number: " + blobNumber);
|
||||||
|
|
||||||
|
final String url = this.mReader.readString();
|
||||||
|
// System.out.println("URL: " + url);
|
||||||
|
|
||||||
|
String title = this.mReader.readString();
|
||||||
|
title = title.equals("") ? url : title;
|
||||||
|
// System.out.println("Title: " + title);
|
||||||
|
|
||||||
|
// Parameter data ignored
|
||||||
|
|
||||||
|
return new ArticleEntry(type, namespace, revision, clusterNumber,
|
||||||
|
blobNumber, url, title,
|
||||||
|
(position - this.mFile.getUrlPtrPos()) / 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public ZIMFile getZIMFile() {
|
||||||
|
return this.mFile;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2011 Arunesh Mathur
|
||||||
|
*
|
||||||
|
* This file is a part of zimreader-java.
|
||||||
|
*
|
||||||
|
* zimreader-java is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* zimreader-java is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.openzim;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class ZIMTest {
|
||||||
|
public static void main(final String[] args) {
|
||||||
|
if(args.length!=2) {
|
||||||
|
System.out.println("Usage: java ZIMTest <ZIM_FILE> <ARTICLE_NAME>");
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// args[0] is the Zim File's location
|
||||||
|
final ZIMFile file = new ZIMFile(args[0]);
|
||||||
|
|
||||||
|
// Associate the Zim File with a Reader
|
||||||
|
final ZIMReader zReader = new ZIMReader(file);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// args[1] is the name of the articles that is
|
||||||
|
// to be fetched
|
||||||
|
System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8"));
|
||||||
|
} catch (final IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in new issue