the raw format reader files, with no integration in YaCy yet; integration may follow as a next step. The ZIM file format is documented at https://openzim.org and the reader code was taken from the archived, unmaintained repository at https://github.com/openzim/zimreader-java
pull/607/head
parent
4308aa5415
commit
1fefae9baf
@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Arunesh Mathur
|
||||
*
|
||||
* This file is a part of zimreader-java.
|
||||
*
|
||||
* zimreader-java is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* zimreader-java is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
package org.openzim;
|
||||
|
||||
public class ArticleEntry extends DirectoryEntry {
|
||||
|
||||
int clusterNumber;
|
||||
|
||||
int blobnumber;
|
||||
|
||||
public ArticleEntry(final int mimeType, final char namespace, final int revision,
|
||||
final int clusterNumber, final int blobNumber, final String url, final String title,
|
||||
final int urlListindex) {
|
||||
|
||||
super(mimeType, namespace, revision, url, title, urlListindex);
|
||||
|
||||
this.clusterNumber = clusterNumber;
|
||||
this.blobnumber = blobNumber;
|
||||
}
|
||||
|
||||
public int getClusterNumber() {
|
||||
return this.clusterNumber;
|
||||
}
|
||||
|
||||
public int getBlobnumber() {
|
||||
return this.blobnumber;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Arunesh Mathur
|
||||
*
|
||||
* This file is a part of zimreader-java.
|
||||
*
|
||||
* zimreader-java is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* zimreader-java is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package org.openzim;
|
||||
|
||||
/**
 * Fields shared by every directory entry: MIME type number, namespace,
 * revision, URL, title and the entry's index in the URL list.
 */
public abstract class DirectoryEntry {

    /** MIME type number of the entry. */
    int mimeType;

    /** Namespace character of the entry. */
    char namespace;

    /** Revision of the entry. */
    int revision;

    /** URL of the entry. */
    String url;

    /** Title of the entry. */
    String title;

    /** Index of the entry in the URL list. */
    int urlListindex;

    /**
     * Stores the given values unchanged.
     *
     * @param mimeType  MIME type number
     * @param namespace namespace character
     * @param revision  revision
     * @param url       URL
     * @param title     title
     * @param index     index in the URL list
     */
    public DirectoryEntry(
            final int mimeType,
            final char namespace,
            final int revision,
            final String url,
            final String title,
            final int index) {
        this.urlListindex = index;
        this.title = title;
        this.url = url;
        this.revision = revision;
        this.namespace = namespace;
        this.mimeType = mimeType;
    }

    /** @return the index in the URL list */
    public int getUrlListindex() {
        return urlListindex;
    }

    /** @return the title */
    public String getTitle() {
        return title;
    }

    /** @return the URL */
    public String getUrl() {
        return url;
    }

    /** @return the revision */
    public int getRevision() {
        return revision;
    }

    /** @return the namespace character */
    public char getNamespace() {
        return namespace;
    }

    /** @return the MIME type number */
    public int getMimeType() {
        return mimeType;
    }
}
|
@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Arunesh Mathur
|
||||
*
|
||||
* This file is a part of zimreader-java.
|
||||
*
|
||||
* zimreader-java is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* zimreader-java is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package org.openzim;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.RandomAccessFile;
|
||||
|
||||
/**
|
||||
* This is an implementation of RandomAccessFile to ensure that it is an
|
||||
* InputStream as well, specifically designed for reading a ZIM file. Ad-Hoc
|
||||
* implementation, can be improved.
|
||||
*
|
||||
* @author Arunesh Mathur <aruneshmathur1990 at gmail.com>
|
||||
*/
|
||||
|
||||
public class RandomAcessFileZIMInputStream extends InputStream {
|
||||
|
||||
private final RandomAccessFile mRAFReader;
|
||||
|
||||
private long mMarked = -1;
|
||||
|
||||
public RandomAcessFileZIMInputStream(final RandomAccessFile reader) {
|
||||
this.mRAFReader = reader;
|
||||
}
|
||||
|
||||
// TODO: Remove the parameter buffer
|
||||
public int readTwoLittleEndianBytesValue(final byte[] buffer) throws IOException {
|
||||
if (buffer.length < 2) {
|
||||
throw new OutOfMemoryError("buffer too small");
|
||||
} else {
|
||||
this.mRAFReader.read(buffer, 0, 2);
|
||||
return Utilities.toTwoLittleEndianInteger(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Remove the parameter buffer
|
||||
public int readFourLittleEndianBytesValue(final byte[] buffer) throws IOException {
|
||||
if (buffer.length < 4) {
|
||||
throw new OutOfMemoryError("buffer too small");
|
||||
} else {
|
||||
this.mRAFReader.read(buffer, 0, 4);
|
||||
return Utilities.toFourLittleEndianInteger(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Remove the parameter buffer
|
||||
public int readEightLittleEndianBytesValue(final byte[] buffer)
|
||||
throws IOException {
|
||||
if (buffer.length < 8) {
|
||||
throw new OutOfMemoryError("buffer too small");
|
||||
} else {
|
||||
this.mRAFReader.read(buffer, 0, 8);
|
||||
return Utilities.toEightLittleEndianInteger(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Remove the parameter buffer
|
||||
public int readSixteenLittleEndianBytesValue(final byte[] buffer)
|
||||
throws IOException {
|
||||
if (buffer.length < 16) {
|
||||
throw new OutOfMemoryError("buffer too small");
|
||||
} else {
|
||||
this.mRAFReader.read(buffer, 0, 16);
|
||||
return Utilities.toSixteenLittleEndianInteger(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
// Reads characters from the current position into a String and stops when a
|
||||
// '\0' is encountered
|
||||
public String readString() throws IOException {
|
||||
final StringBuffer sb = new StringBuffer();
|
||||
/*
|
||||
* int i; byte[] buffer = new byte[100]; while (true) {
|
||||
* mRAFReader.read(buffer); for (i = 0; i < buffer.length; i++) { if
|
||||
* (buffer[i] == '\0') { break; } sb.append((char) buffer[i]); } if (i
|
||||
* != buffer.length) break; } return sb.toString();
|
||||
*/
|
||||
int b;
|
||||
b = this.mRAFReader.read();
|
||||
while (b != '\0') {
|
||||
sb.append((char) b);
|
||||
b = this.mRAFReader.read();
|
||||
}
|
||||
return sb.toString();
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
return this.mRAFReader.read();
|
||||
}
|
||||
|
||||
public RandomAccessFile getRandomAccessFile() {
|
||||
return this.mRAFReader;
|
||||
}
|
||||
|
||||
public void seek(final long pos) throws IOException {
|
||||
this.mRAFReader.seek(pos);
|
||||
}
|
||||
|
||||
public long getFilePointer() throws IOException {
|
||||
return this.mRAFReader.getFilePointer();
|
||||
}
|
||||
|
||||
public void mark() throws IOException {
|
||||
this.mMarked = this.mRAFReader.getFilePointer();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
if (this.mMarked == -1) {
|
||||
return;
|
||||
} else {
|
||||
this.mRAFReader.seek(this.mMarked);
|
||||
this.mMarked = -1;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Arunesh Mathur
|
||||
*
|
||||
* This file is a part of zimreader-java.
|
||||
*
|
||||
* zimreader-java is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* zimreader-java is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package org.openzim;
|
||||
|
||||
public class RedirectEntry extends DirectoryEntry {
|
||||
|
||||
int redirectIndex;
|
||||
|
||||
public RedirectEntry(final int mimeType, final char namespace, final int revision,
|
||||
final int redirectIndex, final String url, final String title, final int urlListindex) {
|
||||
|
||||
super(mimeType, namespace, revision, url, title, urlListindex);
|
||||
|
||||
this.redirectIndex = redirectIndex;
|
||||
}
|
||||
|
||||
public int getRedirectIndex() {
|
||||
return this.redirectIndex;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Arunesh Mathur
|
||||
*
|
||||
* This file is a part of zimreader-java.
|
||||
*
|
||||
* zimreader-java is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* zimreader-java is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
package org.openzim;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
 * Static helpers for decoding little-endian integers from byte arrays and
 * for skipping bytes on a stream.
 */
public class Utilities {

    // TODO: Write a binary search algorithm
    /** Placeholder; always returns -1. */
    public static int binarySearch() {
        return -1;
    }

    /**
     * Rejects buffers that are shorter than the number of bytes to decode.
     *
     * @throws IllegalArgumentException if {@code buffer} has fewer than {@code needed} bytes
     */
    private static void requireLength(final byte[] buffer, final int needed) {
        if (buffer.length < needed) {
            throw new IllegalArgumentException("buffer too small");
        }
    }

    /** Decodes the four bytes starting at {@code offset} as a little-endian {@code int}. */
    private static int wordAt(final byte[] buffer, final int offset) {
        return (buffer[offset] & 0xFF)
                | ((buffer[offset + 1] & 0xFF) << 8)
                | ((buffer[offset + 2] & 0xFF) << 16)
                | ((buffer[offset + 3] & 0xFF) << 24);
    }

    /**
     * Decodes the first two bytes of {@code buffer} as an unsigned
     * little-endian value (0..65535).
     *
     * @throws IllegalArgumentException if the buffer has fewer than 2 bytes
     */
    public static int toTwoLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 2);
        return (buffer[0] & 0xFF) | ((buffer[1] & 0xFF) << 8);
    }

    /**
     * Decodes the first four bytes of {@code buffer} as a little-endian
     * {@code int}.
     *
     * @throws IllegalArgumentException if the buffer has fewer than 4 bytes
     */
    public static int toFourLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 4);
        return wordAt(buffer, 0);
    }

    /**
     * Decodes the first eight bytes of {@code buffer} as a little-endian
     * {@code long}. Use this when the full 64-bit value is needed.
     *
     * @throws IllegalArgumentException if the buffer has fewer than 8 bytes
     */
    public static long toEightLittleEndianLong(final byte[] buffer) throws IOException {
        requireLength(buffer, 8);
        long value = 0L;
        // Accumulate in a long: shifting an int by 32 or more wraps the shift
        // distance and would fold the upper bytes onto the lower ones.
        for (int i = 7; i >= 0; i--) {
            value = (value << 8) | (buffer[i] & 0xFFL);
        }
        return value;
    }

    /**
     * Decodes the first eight bytes of {@code buffer} as a little-endian
     * value and returns its low 32 bits. Values that fit an {@code int} are
     * returned exactly; for larger values use
     * {@link #toEightLittleEndianLong(byte[])}.
     *
     * @throws IllegalArgumentException if the buffer has fewer than 8 bytes
     */
    public static int toEightLittleEndianInteger(final byte[] buffer) throws IOException {
        return (int) toEightLittleEndianLong(buffer);
    }

    /**
     * Condenses the first sixteen bytes of {@code buffer} into an
     * {@code int}: the bitwise OR of the four little-endian 32-bit words.
     * A 128-bit value cannot be represented in an {@code int}; this is the
     * result this method has always produced (its int shifts wrapped modulo
     * 32) and it is kept so that previously returned values stay the same.
     *
     * @throws IllegalArgumentException if the buffer has fewer than 16 bytes
     */
    public static int toSixteenLittleEndianInteger(final byte[] buffer) throws IOException {
        requireLength(buffer, 16);
        int folded = 0;
        for (int offset = 0; offset < 16; offset += 4) {
            folded |= wordAt(buffer, offset);
        }
        return folded;
    }

    /**
     * Skips exactly {@code bytes} bytes of {@code stream}; does nothing for
     * non-positive counts.
     *
     * @throws IOException if the stream ends first (as
     *                     {@link java.io.EOFException}) or skipping fails
     */
    public static void skipFully(final InputStream stream, final long bytes) throws IOException {
        long remaining = bytes;
        while (remaining > 0) {
            final long skipped = stream.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
                continue;
            }
            // skip() may return 0 without being at the end; a single read
            // tells the two apart and prevents an endless loop at EOF.
            if (stream.read() < 0) {
                throw new java.io.EOFException("stream ended with " + remaining + " bytes left to skip");
            }
            remaining--;
        }
    }
}
|
@ -0,0 +1,198 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Arunesh Mathur
|
||||
*
|
||||
* This file is a part of zimreader-java.
|
||||
*
|
||||
* zimreader-java is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* zimreader-java is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package org.openzim;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author Arunesh Mathur
|
||||
*
|
||||
* A ZIM file implementation that stores the Header and the MIMETypeList
|
||||
*
|
||||
*/
|
||||
public class ZIMFile extends File {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private Header mHeader;
|
||||
|
||||
private List<String> mMIMETypeList; // Can be removed if not needed
|
||||
|
||||
public ZIMFile(final String path) {
|
||||
super(path);
|
||||
|
||||
try {
|
||||
readHeader();
|
||||
} catch (final FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private void readHeader() throws FileNotFoundException {
|
||||
|
||||
// Helpers
|
||||
int len = 0;
|
||||
StringBuffer mimeBuffer = null;
|
||||
|
||||
// The byte[] that will help us in reading bytes out of the file
|
||||
final byte[] buffer = new byte[16];
|
||||
|
||||
// Check whether the file exists
|
||||
if (!(this.exists())) {
|
||||
throw new FileNotFoundException(
|
||||
"The file that you specified was not found.");
|
||||
}
|
||||
|
||||
// The reader that will be used to read contents from the file
|
||||
|
||||
final RandomAcessFileZIMInputStream reader = new RandomAcessFileZIMInputStream(
|
||||
new RandomAccessFile(this, "r"));
|
||||
|
||||
// The ZIM file header
|
||||
this.mHeader = new Header();
|
||||
|
||||
// Read the contents of the header
|
||||
try {
|
||||
this.mHeader.magicNumber = reader.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.magicNumber);
|
||||
|
||||
this.mHeader.version = reader.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.version);
|
||||
|
||||
this.mHeader.uuid = reader.readSixteenLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.uuid); reader.read(buffer, 0, 4);
|
||||
|
||||
this.mHeader.articleCount = reader
|
||||
.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.articleCount);
|
||||
|
||||
this.mHeader.clusterCount = reader
|
||||
.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.clusterCount);
|
||||
|
||||
this.mHeader.urlPtrPos = reader.readEightLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.urlPtrPos);
|
||||
|
||||
this.mHeader.titlePtrPos = reader
|
||||
.readEightLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.titlePtrPos);
|
||||
|
||||
this.mHeader.clusterPtrPos = reader
|
||||
.readEightLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.clusterPtrPos);
|
||||
|
||||
this.mHeader.mimeListPos = reader
|
||||
.readEightLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.mimeListPos);
|
||||
|
||||
this.mHeader.mainPage = reader.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.mainPage);
|
||||
|
||||
this.mHeader.layoutPage = reader.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println(mHeader.layoutPage);
|
||||
|
||||
// Initialise the MIMETypeList
|
||||
this.mMIMETypeList = new ArrayList<>();
|
||||
while (true) {
|
||||
reader.read(buffer, 0, 1);
|
||||
len = 0;
|
||||
mimeBuffer = new StringBuffer();
|
||||
while (buffer[0] != '\0') {
|
||||
mimeBuffer.append((char) buffer[0]);
|
||||
reader.read(buffer, 0, 1);
|
||||
len++;
|
||||
}
|
||||
if (len == 0) {
|
||||
break;
|
||||
}
|
||||
this.mMIMETypeList.add(mimeBuffer.toString());
|
||||
// System.out.println(mimeBuffer);
|
||||
}
|
||||
|
||||
} catch (final Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public int getVersion() {
|
||||
return this.mHeader.version;
|
||||
}
|
||||
|
||||
public int getUuid() {
|
||||
return this.mHeader.uuid;
|
||||
}
|
||||
|
||||
public int getArticleCount() {
|
||||
return this.mHeader.articleCount;
|
||||
}
|
||||
|
||||
public int getClusterCount() {
|
||||
return this.mHeader.clusterCount;
|
||||
}
|
||||
|
||||
public int getUrlPtrPos() {
|
||||
return this.mHeader.urlPtrPos;
|
||||
}
|
||||
|
||||
public int getTitlePtrPos() {
|
||||
return this.mHeader.titlePtrPos;
|
||||
}
|
||||
|
||||
public int getClusterPtrPos() {
|
||||
return this.mHeader.clusterPtrPos;
|
||||
}
|
||||
|
||||
public String getMIMEType(final int mimeNumber) {
|
||||
return this.mMIMETypeList.get(mimeNumber);
|
||||
}
|
||||
|
||||
public int getHeaderSize() {
|
||||
return this.mHeader.mimeListPos;
|
||||
}
|
||||
|
||||
public int getMainPage() {
|
||||
return this.mHeader.mainPage;
|
||||
}
|
||||
|
||||
public int getLayoutPage() {
|
||||
return this.mHeader.layoutPage;
|
||||
}
|
||||
|
||||
public class Header {
|
||||
int magicNumber;
|
||||
int version;
|
||||
int uuid;
|
||||
int articleCount;
|
||||
int clusterCount;
|
||||
int urlPtrPos;
|
||||
int titlePtrPos;
|
||||
int clusterPtrPos;
|
||||
int mimeListPos;
|
||||
int mainPage;
|
||||
int layoutPage;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,408 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Arunesh Mathur
|
||||
*
|
||||
* This file is a part of zimreader-java.
|
||||
*
|
||||
* zimreader-java is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* zimreader-java is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package org.openzim;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.tukaani.xz.SingleXZInputStream;
|
||||
|
||||
/**
|
||||
* @author Arunesh Mathur
|
||||
*
|
||||
* A ZIMReader that reads data from the ZIMFile
|
||||
*
|
||||
*/
|
||||
public class ZIMReader {
|
||||
|
||||
private final ZIMFile mFile;
|
||||
private RandomAcessFileZIMInputStream mReader;
|
||||
|
||||
public ZIMReader(final ZIMFile file) {
|
||||
this.mFile = file;
|
||||
try {
|
||||
this.mReader = new RandomAcessFileZIMInputStream(new RandomAccessFile(
|
||||
this.mFile, "r"));
|
||||
} catch (final FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getURLListByURL() throws IOException {
|
||||
|
||||
int i = 0, pos, mimeType;
|
||||
|
||||
final byte[] buffer = new byte[8];
|
||||
|
||||
// The list that will eventually return the list of URL's
|
||||
final ArrayList<String> returnList = new ArrayList<>();
|
||||
|
||||
// Move to the spot where URL's are listed
|
||||
this.mReader.seek(this.mFile.getUrlPtrPos());
|
||||
|
||||
for (i = 0; i < this.mFile.getArticleCount(); i++) {
|
||||
|
||||
// The position of URL i
|
||||
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
|
||||
|
||||
// Mark the current position that we need to return to
|
||||
this.mReader.mark();
|
||||
|
||||
// Move to the position of URL i
|
||||
this.mReader.seek(pos);
|
||||
|
||||
// Article or Redirect entry?
|
||||
mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
|
||||
|
||||
if (mimeType == 65535) {
|
||||
this.mReader.seek(pos + 12);
|
||||
returnList.add(this.mReader.readString());
|
||||
} else {
|
||||
this.mReader.seek(pos + 16);
|
||||
returnList.add(this.mReader.readString());
|
||||
}
|
||||
|
||||
this.mReader.reset();
|
||||
}
|
||||
|
||||
return returnList;
|
||||
}
|
||||
|
||||
public List<String> getURLListByTitle() throws IOException {
|
||||
|
||||
int i = 0, pos, mimeType, articleNumber, urlPtrPos;
|
||||
|
||||
final byte[] buffer = new byte[8];
|
||||
|
||||
// The list that will eventually return the list of URL's
|
||||
final ArrayList<String> returnList = new ArrayList<>();
|
||||
|
||||
// Get the UrlPtrPos or one time storage
|
||||
urlPtrPos = this.mFile.getUrlPtrPos();
|
||||
|
||||
// Move to the spot where URL's are listed
|
||||
this.mReader.seek(this.mFile.getTitlePtrPos());
|
||||
|
||||
for (i = 0; i < this.mFile.getArticleCount(); i++) {
|
||||
|
||||
// The articleNumber of the position of URL i
|
||||
articleNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||
|
||||
// Mark the current position that we need to return to
|
||||
this.mReader.mark();
|
||||
|
||||
this.mReader.seek(urlPtrPos + (8 * (articleNumber)));
|
||||
|
||||
// The position of URL i
|
||||
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
|
||||
this.mReader.seek(pos);
|
||||
|
||||
// Article or Redirect entry?
|
||||
mimeType = this.mReader.readTwoLittleEndianBytesValue(buffer);
|
||||
|
||||
if (mimeType == 65535) {
|
||||
this.mReader.seek(pos + 12);
|
||||
final String url = this.mReader.readString();
|
||||
returnList.add(url);
|
||||
} else {
|
||||
this.mReader.seek(pos + 16);
|
||||
final String url = this.mReader.readString();
|
||||
returnList.add(url);
|
||||
}
|
||||
|
||||
// Return to the marked position
|
||||
this.mReader.reset();
|
||||
}
|
||||
|
||||
return returnList;
|
||||
}
|
||||
|
||||
// Gives the minimum required information needed for the given articleName
|
||||
public DirectoryEntry getDirectoryInfo(String articleName, final char namespace)
|
||||
throws IOException {
|
||||
|
||||
DirectoryEntry entry;
|
||||
String cmpStr;
|
||||
final int numberOfArticles = this.mFile.getArticleCount();
|
||||
int beg = this.mFile.getTitlePtrPos(), end = beg + (numberOfArticles * 4), mid;
|
||||
|
||||
articleName = namespace + "/" + articleName;
|
||||
|
||||
while (beg <= end) {
|
||||
mid = beg + 4 * (((end - beg) / 4) / 2);
|
||||
entry = getDirectoryInfoAtTitlePosition(mid);
|
||||
if (entry == null) {
|
||||
return null;
|
||||
}
|
||||
cmpStr = entry.getNamespace() + "/" + entry.getUrl();
|
||||
if (articleName.compareTo(cmpStr) < 0) {
|
||||
end = mid - 4;
|
||||
|
||||
} else if (articleName.compareTo(cmpStr) > 0) {
|
||||
beg = mid + 4;
|
||||
|
||||
} else {
|
||||
return entry;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
public ByteArrayOutputStream getArticleData(final String articleName, final char namespace) throws IOException {
|
||||
|
||||
// search in the cache first, if not found, then call getDirectoryInfo(articleName)
|
||||
|
||||
byte[] buffer = new byte[8];
|
||||
|
||||
final DirectoryEntry mainEntry = getDirectoryInfo(articleName, namespace);
|
||||
|
||||
if (mainEntry != null) {
|
||||
|
||||
// Check what kind of an entry was mainEnrty
|
||||
if (mainEntry.getClass() == ArticleEntry.class) {
|
||||
|
||||
// Cast to ArticleEntry
|
||||
final ArticleEntry article = (ArticleEntry) mainEntry;
|
||||
|
||||
// Get the cluster and blob numbers from the article
|
||||
final int clusterNumber = article.getClusterNumber();
|
||||
final int blobNumber = article.getBlobnumber();
|
||||
|
||||
// Move to the cluster entry in the clusterPtrPos
|
||||
this.mReader.seek(this.mFile.getClusterPtrPos() + clusterNumber * 8);
|
||||
|
||||
// Read the location of the cluster
|
||||
final int clusterPos = this.mReader
|
||||
.readEightLittleEndianBytesValue(buffer);
|
||||
|
||||
// Move to the cluster
|
||||
this.mReader.seek(clusterPos);
|
||||
|
||||
// Read the first byte, for compression information
|
||||
final int compressionType = this.mReader.read();
|
||||
|
||||
// Reference declaration
|
||||
SingleXZInputStream xzReader = null;
|
||||
int firstOffset, numberOfBlobs, offset1,
|
||||
offset2,
|
||||
location,
|
||||
differenceOffset;
|
||||
|
||||
ByteArrayOutputStream baos;
|
||||
|
||||
// Check the compression type that was read
|
||||
switch (compressionType) {
|
||||
|
||||
// TODO: Read uncompressed data directly
|
||||
case 0:
|
||||
case 1:
|
||||
|
||||
// Read the first 4 bytes to find out the number of artciles
|
||||
buffer = new byte[4];
|
||||
|
||||
// Create a dictionary with size 40MiB, the zimlib uses this
|
||||
// size while creating
|
||||
|
||||
// Read the first offset
|
||||
this.mReader.read(buffer);
|
||||
|
||||
// The first four bytes are the offset of the zeroth blob
|
||||
firstOffset = Utilities
|
||||
.toFourLittleEndianInteger(buffer);
|
||||
|
||||
// The number of blobs
|
||||
numberOfBlobs = firstOffset / 4;
|
||||
|
||||
// The blobNumber has to be lesser than the numberOfBlobs
|
||||
assert blobNumber < numberOfBlobs;
|
||||
|
||||
|
||||
if (blobNumber == 0) {
|
||||
// The first offset is what we read earlier
|
||||
offset1 = firstOffset;
|
||||
} else {
|
||||
|
||||
location = (blobNumber - 1) * 4;
|
||||
Utilities.skipFully(this.mReader, location);
|
||||
this.mReader.read(buffer);
|
||||
offset1 = Utilities.toFourLittleEndianInteger(buffer);
|
||||
}
|
||||
|
||||
this.mReader.read(buffer);
|
||||
offset2 = Utilities.toFourLittleEndianInteger(buffer);
|
||||
|
||||
differenceOffset = offset2 - offset1;
|
||||
buffer = new byte[differenceOffset];
|
||||
|
||||
Utilities.skipFully(this.mReader,
|
||||
(offset1 - 4 * (blobNumber + 2)));
|
||||
|
||||
this.mReader.read(buffer, 0, differenceOffset);
|
||||
|
||||
baos = new ByteArrayOutputStream();
|
||||
baos.write(buffer, 0, differenceOffset);
|
||||
|
||||
return baos;
|
||||
|
||||
// LZMA2 compressed data
|
||||
case 4:
|
||||
|
||||
// Read the first 4 bytes to find out the number of artciles
|
||||
buffer = new byte[4];
|
||||
|
||||
// Create a dictionary with size 40MiB, the zimlib uses this
|
||||
// size while creating
|
||||
xzReader = new SingleXZInputStream(this.mReader, 4194304);
|
||||
|
||||
// Read the first offset
|
||||
xzReader.read(buffer);
|
||||
|
||||
// The first four bytes are the offset of the zeroth blob
|
||||
firstOffset = Utilities
|
||||
.toFourLittleEndianInteger(buffer);
|
||||
|
||||
// The number of blobs
|
||||
numberOfBlobs = firstOffset / 4;
|
||||
|
||||
// The blobNumber has to be lesser than the numberOfBlobs
|
||||
assert blobNumber < numberOfBlobs;
|
||||
|
||||
if(blobNumber == 0) {
|
||||
// The first offset is what we read earlier
|
||||
offset1 = firstOffset;
|
||||
} else {
|
||||
|
||||
location = (blobNumber - 1) * 4;
|
||||
Utilities.skipFully(xzReader, location);
|
||||
xzReader.read(buffer);
|
||||
offset1 = Utilities.toFourLittleEndianInteger(buffer);
|
||||
}
|
||||
|
||||
xzReader.read(buffer);
|
||||
offset2 = Utilities.toFourLittleEndianInteger(buffer);
|
||||
|
||||
differenceOffset = offset2 - offset1;
|
||||
buffer = new byte[differenceOffset];
|
||||
|
||||
Utilities.skipFully(xzReader,
|
||||
(offset1 - 4 * (blobNumber + 2)));
|
||||
|
||||
xzReader.read(buffer, 0, differenceOffset);
|
||||
|
||||
baos = new ByteArrayOutputStream();
|
||||
baos.write(buffer, 0, differenceOffset);
|
||||
|
||||
return baos;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
public DirectoryEntry getDirectoryInfoAtTitlePosition(final int position)
|
||||
throws IOException {
|
||||
|
||||
// Helpers
|
||||
int pos;
|
||||
final byte[] buffer = new byte[8];
|
||||
|
||||
// At the appropriate position in the titlePtrPos
|
||||
this.mReader.seek(position);
|
||||
|
||||
// Get value of article at index
|
||||
pos = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||
|
||||
// Move to the position in urlPtrPos
|
||||
this.mReader.seek(this.mFile.getUrlPtrPos() + 8 * pos);
|
||||
|
||||
// Get value of article in urlPtrPos
|
||||
pos = this.mReader.readEightLittleEndianBytesValue(buffer);
|
||||
|
||||
// Go to the location of the directory entry
|
||||
this.mReader.seek(pos);
|
||||
|
||||
final int type = this.mReader.readTwoLittleEndianBytesValue(buffer);
|
||||
|
||||
// Ignore the parameter length
|
||||
this.mReader.read();
|
||||
|
||||
final char namespace = (char) this.mReader.read();
|
||||
// System.out.println("Namepsace: " + namespace);
|
||||
|
||||
final int revision = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println("Revision: " + revision);
|
||||
|
||||
// TODO: Remove redundant if condition code
|
||||
// Article or Redirect entry
|
||||
if (type == 65535) {
|
||||
|
||||
// System.out.println("MIMEType: " + type);
|
||||
|
||||
final int redirectIndex = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println("RedirectIndex: " + redirectIndex);
|
||||
|
||||
final String url = this.mReader.readString();
|
||||
// System.out.println("URL: " + url);
|
||||
|
||||
String title = this.mReader.readString();
|
||||
title = title.equals("") ? url : title;
|
||||
// System.out.println("Title: " + title);
|
||||
|
||||
return new RedirectEntry(type, namespace, revision, redirectIndex,
|
||||
url, title, (position - this.mFile.getUrlPtrPos()) / 8);
|
||||
|
||||
} else {
|
||||
|
||||
// System.out.println("MIMEType: " + mFile.getMIMEType(type));
|
||||
|
||||
final int clusterNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println("Cluster Number: " + clusterNumber);
|
||||
|
||||
final int blobNumber = this.mReader.readFourLittleEndianBytesValue(buffer);
|
||||
// System.out.println("Blob Number: " + blobNumber);
|
||||
|
||||
final String url = this.mReader.readString();
|
||||
// System.out.println("URL: " + url);
|
||||
|
||||
String title = this.mReader.readString();
|
||||
title = title.equals("") ? url : title;
|
||||
// System.out.println("Title: " + title);
|
||||
|
||||
// Parameter data ignored
|
||||
|
||||
return new ArticleEntry(type, namespace, revision, clusterNumber,
|
||||
blobNumber, url, title,
|
||||
(position - this.mFile.getUrlPtrPos()) / 8);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public ZIMFile getZIMFile() {
|
||||
return this.mFile;
|
||||
}
|
||||
}
|
@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Arunesh Mathur
|
||||
*
|
||||
* This file is a part of zimreader-java.
|
||||
*
|
||||
* zimreader-java is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License version 3.0 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* zimreader-java is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package org.openzim;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class ZIMTest {
|
||||
public static void main(final String[] args) {
|
||||
if(args.length!=2) {
|
||||
System.out.println("Usage: java ZIMTest <ZIM_FILE> <ARTICLE_NAME>");
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
// args[0] is the Zim File's location
|
||||
final ZIMFile file = new ZIMFile(args[0]);
|
||||
|
||||
// Associate the Zim File with a Reader
|
||||
final ZIMReader zReader = new ZIMReader(file);
|
||||
|
||||
try {
|
||||
// args[1] is the name of the articles that is
|
||||
// to be fetched
|
||||
System.out.println(zReader.getArticleData(args[1],'A').toString("utf-8"));
|
||||
} catch (final IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in new issue