You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/org/openzim/ZIMReader.java

298 lines
12 KiB

/*
* Copyright (C) 2011 Arunesh Mathur
*
* This file is a part of zimreader-java.
*
* zimreader-java is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3.0 as
* published by the Free Software Foundation.
*
* zimreader-java is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with zimreader-java. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openzim;
import java.io.IOException;
import java.io.InputStream;
import org.tukaani.xz.SingleXZInputStream;
import com.github.luben.zstd.ZstdInputStream;
/**
* @author Arunesh Mathur
* A ZIMReader that reads data from the ZIMFile
*
* @author Michael Christen
* Proof-Reading, unclustering, refactoring,
* naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
* change of Exception handling,
* extension to more attributes as defined in spec (bugfix for mime type loading)
* bugfix to long parsing (prevented reading of large files),
* added extended cluster size parsing
* added ZStandard compression parsing (cluster type 5)
*/
public class ZIMReader {
private final ZIMFile mFile;
public static abstract class DirectoryEntry {
public final int mimetype;
public final char namespace;
public final int cluster_number;
public final String url;
public final String title;
public final long urlListindex;
public DirectoryEntry(
final int mimeType, final char namespace,
final int cluster_number,
final String url, final String title,
final long index) {
this.mimetype = mimeType;
this.namespace = namespace;
this.cluster_number = cluster_number;
this.url = url;
this.title = title;
this.urlListindex = index;
}
}
public static class ArticleEntry extends DirectoryEntry {
public final int cluster_number;
public final int blob_number;
public ArticleEntry(
final int mimeType, final char namespace,
final int cluster_number, final int blob_number,
final String url, final String title,
final long urlListindex) {
super(mimeType, namespace, cluster_number, url, title, urlListindex);
this.cluster_number = cluster_number;
this.blob_number = blob_number;
}
}
public static class RedirectEntry extends DirectoryEntry {
public final long redirect_index;
public RedirectEntry(final int mimeType, final char namespace,
final long redirect_index, final String url, final String title,
final long urlListindex) {
super(mimeType, namespace, 0, url, title, urlListindex);
this.redirect_index = redirect_index;
}
}
public ZIMReader(final ZIMFile file) {
this.mFile = file;
}
public ZIMFile getZIMFile() {
return this.mFile;
}
public String getURLByURLOrder(final int entryNumber) throws IOException {
// The position of URL i
long pos = this.mFile.getURLPtr(entryNumber);
// Move to the position of URL i
this.mFile.mReader.seek(pos);
// Article or Redirect entry?
int mimeType = this.mFile.mReader.readTwoLittleEndianBytesInt();
if (mimeType == 65535) {
this.mFile.mReader.seek(pos + 12);
return this.mFile.mReader.readZeroTerminatedString();
} else {
this.mFile.mReader.seek(pos + 16);
return this.mFile.mReader.readZeroTerminatedString();
}
}
public String getURLByTitleOrder(final int entryNumber) throws IOException {
// The articleNumber of the position of URL i
int articleNumber = this.mFile.getTitlePtr(entryNumber);
long pos = this.mFile.getURLPtr(articleNumber);
this.mFile.mReader.seek(pos);
// Article or Redirect entry?
int mimeType = this.mFile.mReader.readTwoLittleEndianBytesInt();
if (mimeType == 65535) {
this.mFile.mReader.seek(pos + 12);
return this.mFile.mReader.readZeroTerminatedString();
} else {
this.mFile.mReader.seek(pos + 16);
return this.mFile.mReader.readZeroTerminatedString();
}
}
public DirectoryEntry getDirectoryInfo(final int entryNumber) throws IOException {
// Get value of article at index
int pointer_to_the_URL_pointer = this.mFile.getTitlePtr(entryNumber);
// Get value of article in urlPtrPos
long pointer_to_the_directory_entry = this.mFile.getURLPtr(pointer_to_the_URL_pointer);
// Go to the location of the directory entry
this.mFile.mReader.seek(pointer_to_the_directory_entry);
// read the Content Entry
final int type = this.mFile.mReader.readTwoLittleEndianBytesInt(); // 2, 0xffff for redirect
this.mFile.mReader.read(); // 1, ignore, parameter length not used
final char namespace = (char) this.mFile.mReader.read(); // 1
this.mFile.mReader.readFourLittleEndianBytesInt(); // 4, ignore, revision not used
// Article or Redirect entry
if (type == 65535) {
final int redirectIndex = this.mFile.mReader.readFourLittleEndianBytesInt();
final String url = this.mFile.mReader.readZeroTerminatedString();
String title = this.mFile.mReader.readZeroTerminatedString();
title = title.equals("") ? url : title;
return new RedirectEntry(type, namespace, redirectIndex, url, title, entryNumber);
} else {
final int cluster_number = this.mFile.mReader.readFourLittleEndianBytesInt(); // 4
final int blob_number = this.mFile.mReader.readFourLittleEndianBytesInt(); // 4
final String url = this.mFile.mReader.readZeroTerminatedString(); // zero terminated
String title = this.mFile.mReader.readZeroTerminatedString(); // zero terminated
title = title.equals("") ? url : title;
return new ArticleEntry(type, namespace, cluster_number, blob_number, url, title, entryNumber);
}
}
// Gives the minimum required information needed for the given articleName
// This makes a binary search on the article name entry list.
public DirectoryEntry getDirectoryInfo(final char namespace, String articleName) throws IOException {
DirectoryEntry entry;
String cmpStr;
final int numberOfArticles = this.mFile.header_entryCount;
int beg = 0, end = numberOfArticles, mid;
articleName = namespace + "/" + articleName;
while (beg <= end) {
mid = beg + ((end - beg) / 2);
entry = getDirectoryInfo(mid);
if (entry == null) {
return null;
}
cmpStr = entry.namespace + "/" + entry.title;
if (articleName.compareTo(cmpStr) < 0) {
end = mid - 1;
} else if (articleName.compareTo(cmpStr) > 0) {
beg = mid + 1;
} else {
return entry;
}
}
return null;
}
public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {
// fail fast
if (directoryInfo == null) return null;
if (directoryInfo.getClass() != ArticleEntry.class) return null;
// This is now an article, so thus we can cast to ArticleEntry
final ArticleEntry article = (ArticleEntry) directoryInfo;
// Read the location of the cluster
final long clusterPos = this.mFile.geClusterPtr(article.cluster_number);
// Move to the cluster
this.mFile.mReader.seek(clusterPos);
// Read the first byte, for compression information
final int compressionType = this.mFile.mReader.read();
// Check the compression type that was read
// type = 1 uncompressed
if (compressionType <= 1 || compressionType == 8 || compressionType == 9) {
boolean extended = compressionType > 1;
return readClusterEntry(this.mFile.mReader, article.blob_number, extended);
}
// 2 for zlib and 3 for bzip2 (removed)
// LZMA2 compressed data
if (compressionType == 4 || compressionType == 12) {
boolean extended = compressionType == 12;
// Create a dictionary with size 40MiB, the zimlib uses this size while creating
SingleXZInputStream xzReader= new SingleXZInputStream(this.mFile.mReader, 41943040);
return readClusterEntry(xzReader, article.blob_number, extended);
}
// Zstandard compressed data
if (compressionType == 5 || compressionType == 13) {
boolean extended = compressionType == 13;
ZstdInputStream zReader = new ZstdInputStream(this.mFile.mReader);
return readClusterEntry(zReader, article.blob_number, extended);
}
return null;
}
private static byte[] readClusterEntry(InputStream is, int blob_number, boolean extended) throws IOException {
// Read the first 4(8) bytes to find out the number of articles
byte[] buffer = new byte[extended ? 8 : 4];
// The first four (eight) bytes are the offset of the zeroth blob
is.read(buffer);
long firstOffset = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
// The number of blobs can be computed by the offset
// the actual number is one less because there is one more offset entry than the actual number
// to identify the end of the last blob.
long numberOfBlobs1 = extended ? firstOffset / 8 : firstOffset / 4;
// The blobNumber has to be lesser than the numberOfBlobs - 1
// the blob numbers start with 0 even if the documentation states it is "the first blob".
assert blob_number < numberOfBlobs1 - 1;
long offset1;
if (blob_number == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
// skip one less than required to get to the offset entry because the first entry is already read
RandomAccessFileZIMInputStream.skipFully(is, (blob_number - 1) * (extended ? 8 : 4));
is.read(buffer);
offset1 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
}
is.read(buffer);
long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
long blob_size = offset2 - offset1;
byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT
// we must do two skip steps: first to the end of the offset list and second to the start of the blob
// - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset
// - the number of offset entries that we alreay read now is article.blob_number + 2 (in any case at least 2)
// - the remaining number of offset entries to skip is therefore numberOfBlobs1 - (article.blob_number + 2)
// - the addon skip of number of bytes to the start of the entry is offset1 - firstoffset with firstoffset = 4 * numberOfBlobs1
// - the full skip length is 4 * (numberOfBlobs1 - (article.blob_number + 2)) + offset1 - 4 * numberOfBlobs1
// = offset1 - 4 * (article.blob_number + 2)
RandomAccessFileZIMInputStream.skipFully(is, (offset1 - (extended ? 8 : 4) * (blob_number + 2)));
RandomAccessFileZIMInputStream.readFully(is, entry);
return entry;
}
}