diff --git a/ivy.xml b/ivy.xml
index c5f13b585..61f9ee127 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -98,5 +98,6 @@
+
diff --git a/source/org/openzim/RandomAccessFileZIMInputStream.java b/source/org/openzim/RandomAccessFileZIMInputStream.java
index 4d81768c8..3b4f4d1ed 100644
--- a/source/org/openzim/RandomAccessFileZIMInputStream.java
+++ b/source/org/openzim/RandomAccessFileZIMInputStream.java
@@ -18,6 +18,7 @@
package org.openzim;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
@@ -70,7 +71,7 @@ public static int toFourLittleEndianInteger(final byte[] buffer) { // TODO: make
| ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24));
}
- private static long toEightLittleEndianLong(final byte[] buffer) {
+ public static long toEightLittleEndianLong(final byte[] buffer) {
return // cast to long required otherwise this is again an integer
((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8)
| ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24)
@@ -85,13 +86,12 @@ public static void skipFully(final InputStream stream, final long bytes) throws
// Reads characters from the current position into a String and stops when a
// '\0' is encountered
public String readZeroTerminatedString() throws IOException {
- final StringBuilder sb = new StringBuilder();
- int b = this.mRAFReader.read();
- while (b != '\0') {
- sb.append((char) b);
- b = this.mRAFReader.read();
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ int b;
+ while ((b = this.mRAFReader.read()) != '\0' && b != -1) {
+ buffer.write(b);
}
- return sb.toString();
+ return buffer.toString("UTF-8");
}
@Override
diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java
index 4cecccacd..cf11d6342 100644
--- a/source/org/openzim/ZIMReader.java
+++ b/source/org/openzim/ZIMReader.java
@@ -18,14 +18,15 @@
package org.openzim;
-import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;
import org.tukaani.xz.SingleXZInputStream;
+import com.github.luben.zstd.ZstdInputStream;
/**
* @author Arunesh Mathur
@@ -36,7 +37,9 @@
* naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
* change of Exception handling,
* extension to more attributes as defined in spec (bugfix for mime type loading)
- * bugfix to long parsing (prevented reading of large files)
+ * bugfix to long parsing (prevented reading of large files),
+ * added extended cluster size parsing
+ * added ZStandard compression parsing (cluster type 5)
*/
public class ZIMReader {
@@ -110,91 +113,53 @@ public ZIMFile getZIMFile() {
return this.mFile;
}
- // get a URL list that is sorted by the urls
- public List getURLListByURL() throws IOException {
-
- int i = 0, mimeType;
-
- // The list that will eventually return the list of URL's
- final ArrayList returnList = new ArrayList<>();
+ public String getURLByURLOrder(int entryNumber) throws IOException {
// Move to the spot where URL's are listed
- this.mReader.seek(this.mFile.header_urlPtrPos);
-
- for (i = 0; i < this.mFile.header_entryCount; i++) {
-
- // The position of URL i
- long pos = this.mReader.readEightLittleEndianBytesLong();
-
- // Mark the current position that we need to return to
- this.mReader.mark();
+ this.mReader.seek(this.mFile.header_urlPtrPos + 8L * entryNumber);
- // Move to the position of URL i
- this.mReader.seek(pos);
+ // The position of URL i
+ long pos = this.mReader.readEightLittleEndianBytesLong();
- // Article or Redirect entry?
- mimeType = this.mReader.readTwoLittleEndianBytesInt();
+ // Move to the position of URL i
+ this.mReader.seek(pos);
- if (mimeType == 65535) {
- this.mReader.seek(pos + 12);
- returnList.add(this.mReader.readZeroTerminatedString());
- } else {
- this.mReader.seek(pos + 16);
- returnList.add(this.mReader.readZeroTerminatedString());
- }
+ // Article or Redirect entry?
+ int mimeType = this.mReader.readTwoLittleEndianBytesInt();
- this.mReader.reset();
+ if (mimeType == 65535) {
+ this.mReader.seek(pos + 12);
+ return this.mReader.readZeroTerminatedString();
+ } else {
+ this.mReader.seek(pos + 16);
+ return this.mReader.readZeroTerminatedString();
}
-
- return returnList;
}
- // get a URL list that is sorted by the entry titles
- public List getURLListByTitle() throws IOException {
-
- int i = 0, mimeType, articleNumber;
-
- // The list that will eventually return the list of URL's
- final ArrayList returnList = new ArrayList<>();
-
- // Get the UrlPtrPos or one time storage
- long urlPtrPos = this.mFile.header_urlPtrPos;
+ public String getURLByTitleOrder(int entryNumber) throws IOException {
// Move to the spot where URL's are listed
- this.mReader.seek(this.mFile.header_titlePtrPos);
-
- for (i = 0; i < this.mFile.header_entryCount; i++) {
+ this.mReader.seek(this.mFile.header_titlePtrPos + 4L * entryNumber); // title pointer list entries are 4 bytes wide (read below as a 4-byte int)
- // The articleNumber of the position of URL i
- articleNumber = this.mReader.readFourLittleEndianBytesInt();
+ // The articleNumber of the position of URL i
+ int articleNumber = this.mReader.readFourLittleEndianBytesInt();
- // Mark the current position that we need to return to
- this.mReader.mark();
+ this.mReader.seek(this.mFile.header_urlPtrPos + (8L * (articleNumber)));
- this.mReader.seek(urlPtrPos + (8L * (articleNumber)));
+ // The position of URL i
+ long pos = this.mReader.readEightLittleEndianBytesLong();
+ this.mReader.seek(pos);
- // The position of URL i
- long pos = this.mReader.readEightLittleEndianBytesLong();
- this.mReader.seek(pos);
+ // Article or Redirect entry?
+ int mimeType = this.mReader.readTwoLittleEndianBytesInt();
- // Article or Redirect entry?
- mimeType = this.mReader.readTwoLittleEndianBytesInt();
-
- if (mimeType == 65535) {
- this.mReader.seek(pos + 12);
- final String url = this.mReader.readZeroTerminatedString();
- returnList.add(url);
- } else {
- this.mReader.seek(pos + 16);
- final String url = this.mReader.readZeroTerminatedString();
- returnList.add(url);
- }
-
- // Return to the marked position
- this.mReader.reset();
+ if (mimeType == 65535) {
+ this.mReader.seek(pos + 12);
+ return this.mReader.readZeroTerminatedString();
+ } else {
+ this.mReader.seek(pos + 16);
+ return this.mReader.readZeroTerminatedString();
}
-
- return returnList;
}
// position must be the seek position for the title in the Title Pointer List
@@ -291,7 +256,7 @@ public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOExcept
final ArticleEntry article = (ArticleEntry) directoryInfo;
// Move to the cluster entry in the clusterPtrPos
- this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8);
+ this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8L);
// Read the location of the cluster
final long clusterPos = this.mReader.readEightLittleEndianBytesLong();
@@ -302,78 +267,74 @@ public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOExcept
// Read the first byte, for compression information
final int compressionType = this.mReader.read();
- // Reference declaration
- int firstOffset, numberOfBlobs, offset1, offset2, location, differenceOffset;
-
// Check the compression type that was read
- if (compressionType == 1) {
-
- // The first four bytes are the offset of the zeroth blob
- firstOffset = this.mReader.readFourLittleEndianBytesInt();
-
- // The number of blobs
- numberOfBlobs = firstOffset / 4;
-
- // The blobNumber has to be lesser than the numberOfBlobs
- assert article.blob_number < numberOfBlobs;
- if (article.blob_number == 0) {
- // The first offset is what we read earlier
- offset1 = firstOffset;
- } else {
- location = (article.blob_number - 1) * 4;
- RandomAccessFileZIMInputStream.skipFully(this.mReader, location);
- offset1 = this.mReader.readFourLittleEndianBytesInt();
- }
-
- offset2 = this.mReader.readFourLittleEndianBytesInt();
- differenceOffset = offset2 - offset1;
- byte[] entry = new byte[differenceOffset];
- RandomAccessFileZIMInputStream.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
- this.mReader.read(entry, 0, differenceOffset);
-
- return entry;
+ // low four bits select the compression (0 or 1 = uncompressed); bit 0x10 marks extended 8-byte offsets
+ if ((compressionType & 0x0F) <= 1) {
+ boolean extended = (compressionType & 0x10) != 0;
+ return readClusterEntry(this.mReader, article.blob_number, extended);
}
// 2 for zlib and 3 for bzip2 (removed)
// LZMA2 compressed data
- if (compressionType == 4) {
+ if ((compressionType & 0x0F) == 4) {
+ boolean extended = (compressionType & 0x10) != 0;
+ // NOTE(review): the second argument is a memory usage limit (in KiB per the XZ for Java docs), not a dictionary size; the original note says zimlib creates with 40MiB
+ SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 41943040);
+ return readClusterEntry(xzReader, article.blob_number, extended);
+ }
- // Read the first 4 bytes to find out the number of artciles
- byte[] buffer = new byte[4];
+ // Zstandard compressed data
+ if ((compressionType & 0x0F) == 5) {
+ boolean extended = (compressionType & 0x10) != 0;
+ ZstdInputStream zReader = new ZstdInputStream(this.mReader);
+ return readClusterEntry(zReader, article.blob_number, extended);
+ }
- // Create a dictionary with size 40MiB, the zimlib uses this size while creating
- SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 4194304);
+ return null;
+ }
- // The first four bytes are the offset of the zeroth blob
- firstOffset = this.mReader.readFourLittleEndianBytesInt();
+ private static byte[] readClusterEntry(InputStream is, int blob_number, boolean extended) throws IOException {
- // The number of blobs
- numberOfBlobs = firstOffset / 4;
+ // Read the first 4(8) bytes to find out the number of articles
+ byte[] buffer = new byte[extended ? 8 : 4];
- // The blobNumber has to be lesser than the numberOfBlobs
- assert article.blob_number < numberOfBlobs;
- if (article.blob_number == 0) {
- // The first offset is what we read earlier
- offset1 = firstOffset;
- } else {
- location = (article.blob_number - 1) * 4;
- RandomAccessFileZIMInputStream.skipFully(xzReader, location);
- xzReader.read(buffer);
- offset1 = RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
- }
+ // The first four (eight) bytes are the offset of the zeroth blob; readFully is used because a plain read() may return fewer bytes than requested
+ new java.io.DataInputStream(is).readFully(buffer);
+ long firstOffset = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
- xzReader.read(buffer);
- offset2 = RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
- differenceOffset = offset2 - offset1;
- byte[] entry = new byte[differenceOffset];
- RandomAccessFileZIMInputStream.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
- xzReader.read(entry, 0, differenceOffset);
+ // The number of blobs can be computed by the offset
+ // the actual number is one less because there is one more offset entry than the actual number
+ // to identify the end of the last blob.
+ long numberOfBlobs1 = extended ? firstOffset / 8 : firstOffset / 4;
- return entry;
+ // The blobNumber has to be lesser than the numberOfBlobs - 1
+ // the blob numbers start with 0 even if the documentation states it is "the first blob".
+ assert blob_number < numberOfBlobs1 - 1;
+ long offset1;
+ if (blob_number == 0) {
+ // The first offset is what we read earlier
+ offset1 = firstOffset;
+ } else {
+ // skip one less than required to get to the offset entry because the first entry is already read
+ RandomAccessFileZIMInputStream.skipFully(is, (blob_number - 1) * (extended ? 8 : 4));
+ new java.io.DataInputStream(is).readFully(buffer);
+ offset1 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
}
-
- // case 5: zstd compressed (missing!)
- return null;
+ new java.io.DataInputStream(is).readFully(buffer);
+ long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
+ long blob_size = offset2 - offset1;
+ byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT
+ // we must do two skip steps: first to the end of the offset list and second to the start of the blob
+ // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset
+ // - the number of offset entries that we already read now is article.blob_number + 2 (in any case at least 2)
+ // - the remaining number of offset entries to skip is therefore numberOfBlobs1 - (article.blob_number + 2)
+ // - the addon skip of number of bytes to the start of the entry is offset1 - firstoffset with firstoffset = 4 * numberOfBlobs1
+ // - the full skip length is 4 * (numberOfBlobs1 - (article.blob_number + 2)) + offset1 - 4 * numberOfBlobs1
+ // = offset1 - 4 * (article.blob_number + 2)
+ RandomAccessFileZIMInputStream.skipFully(is, (offset1 - (extended ? 8 : 4) * (blob_number + 2)));
+ new java.io.DataInputStream(is).readFully(entry); // fill the whole blob or fail with EOFException on truncated data
+
+ return entry;
}
}
diff --git a/source/org/openzim/ZIMTest.java b/source/org/openzim/ZIMTest.java
index ea77c3b90..cb8a28499 100644
--- a/source/org/openzim/ZIMTest.java
+++ b/source/org/openzim/ZIMTest.java
@@ -20,7 +20,6 @@
import java.io.IOException;
import java.nio.charset.StandardCharsets;
-import java.util.List;
import org.openzim.ZIMReader.DirectoryEntry;
@@ -40,12 +39,10 @@ public static void main(final String[] args) {
final ZIMReader zReader = new ZIMReader(file);
// print a list of urls and titles
- final List urls = zReader.getURLListByURL();
- final List titles = zReader.getURLListByTitle();
- int c = Math.min(10, titles.size());
+ int c = Math.min(10, file.header_entryCount);
for (int i = 0; i < c; i++) {
- System.out.println("URL by URL " + i + ": " + urls.get(i));
- System.out.println("URL by Title " + i + ": " + titles.get(i));
+ System.out.println("URL by URL " + i + ": " + zReader.getURLByURLOrder(i));
+ System.out.println("URL by Title " + i + ": " + zReader.getURLByTitleOrder(i));
DirectoryEntry entry = zReader.getDirectoryInfo(i);
System.out.println("URL by Pos " + i + ": " + entry.url);
System.out.println("Title by Pos " + i + ": " + entry.title);