added Zstandard compressed data decompression for ZIM files type 5

also: more generalization and performance enhancements
yacy · Oct 28, 2023 · b0ae660 · b0ae660
1 parent ad8ee3a
commit b0ae660
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 143 deletions.
diff --git a/ivy.xml b/ivy.xml
@@ -98,5 +98,6 @@
       <dependency org="org.hamcrest" name="hamcrest" rev="2.2" conf="test->default"/>
       <dependency org="org.hamcrest" name="hamcrest-core" rev="2.2" conf="test->default"/>
       <dependency org="org.hamcrest" name="hamcrest-library" rev="2.2" conf="test->default"/>
+      <dependency org="com.github.luben" name="zstd-jni" rev="1.5.5-6"></dependency>
     </dependencies>
 </ivy-module>
diff --git a/source/org/openzim/RandomAccessFileZIMInputStream.java b/source/org/openzim/RandomAccessFileZIMInputStream.java
@@ -18,6 +18,7 @@
 
 package org.openzim;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.RandomAccessFile;
@@ -70,7 +71,7 @@ public static int toFourLittleEndianInteger(final byte[] buffer) { // TODO: make
             | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24));
     }
 
-    private static long toEightLittleEndianLong(final byte[] buffer) {
+    public static long toEightLittleEndianLong(final byte[] buffer) {
         return // cast to long required otherwise this is again an integer
               ((long)(buffer[0] & 0xFF)        | ((long)(buffer[1] & 0xFF) << 8)
             | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24)
@@ -85,13 +86,12 @@ public static void skipFully(final InputStream stream, final long bytes) throws
     // Reads characters from the current position into a String and stops when a
     // '\0' is encountered
     public String readZeroTerminatedString() throws IOException {
-        final StringBuilder sb = new StringBuilder();
-        int b = this.mRAFReader.read();
-        while (b != '\0') {
-            sb.append((char) b);
-            b = this.mRAFReader.read();
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        int b;
+        while ((b = this.mRAFReader.read()) != '\0' && b != -1) {
+            buffer.write(b);
         }
-        return sb.toString();
+        return buffer.toString("UTF-8");
     }
 
     @Override

diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java
@@ -18,14 +18,15 @@
 
 package org.openzim;
 
-import java.io.ByteArrayOutputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.RandomAccessFile;
 import java.util.ArrayList;
 import java.util.List;
 
 import org.tukaani.xz.SingleXZInputStream;
+import com.github.luben.zstd.ZstdInputStream;
 
 /**
  * @author Arunesh Mathur
@@ -36,7 +37,9 @@
  *         naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
  *         change of Exception handling, 
  *         extension to more attributes as defined in spec (bugfix for mime type loading)
- *         bugfix to long parsing (prevented reading of large files)
+ *         bugfix to long parsing (prevented reading of large files),
+ *         added extended cluster size parsing
+ *         added ZStandard compression parsing (cluster type 5)
  */
 public class ZIMReader {
 
@@ -110,91 +113,53 @@ public ZIMFile getZIMFile() {
         return this.mFile;
     }
 
-    // get a URL list that is sorted by the urls
-    public List<String> getURLListByURL() throws IOException {
-
-        int i = 0, mimeType;
-
-        // The list that will eventually return the list of URL's
-        final ArrayList<String> returnList = new ArrayList<>();
+    public String getURLByURLOrder(int entryNumber) throws IOException {
 
         // Move to the spot where URL's are listed
-        this.mReader.seek(this.mFile.header_urlPtrPos);
-
-        for (i = 0; i < this.mFile.header_entryCount; i++) {
-
-            // The position of URL i
-            long pos = this.mReader.readEightLittleEndianBytesLong();
-
-            // Mark the current position that we need to return to
-            this.mReader.mark();
+        this.mReader.seek(this.mFile.header_urlPtrPos + 8L * entryNumber);
 
-            // Move to the position of URL i
-            this.mReader.seek(pos);
+        // The position of URL i
+        long pos = this.mReader.readEightLittleEndianBytesLong();
 
-            // Article or Redirect entry?
-            mimeType = this.mReader.readTwoLittleEndianBytesInt();
+        // Move to the position of URL i
+        this.mReader.seek(pos);
 
-            if (mimeType == 65535) {
-                this.mReader.seek(pos + 12);
-                returnList.add(this.mReader.readZeroTerminatedString());
-            } else {
-                this.mReader.seek(pos + 16);
-                returnList.add(this.mReader.readZeroTerminatedString());
-            }
+        // Article or Redirect entry?
+        int mimeType = this.mReader.readTwoLittleEndianBytesInt();
 
-            this.mReader.reset();
+        if (mimeType == 65535) {
+            this.mReader.seek(pos + 12);
+            return this.mReader.readZeroTerminatedString();
+        } else {
+            this.mReader.seek(pos + 16);
+            return this.mReader.readZeroTerminatedString();
         }
-
-        return returnList;
     }
 
-    // get a URL list that is sorted by the entry titles
-    public List<String> getURLListByTitle() throws IOException {
-
-        int i = 0, mimeType, articleNumber;
-
-        // The list that will eventually return the list of URL's
-        final ArrayList<String> returnList = new ArrayList<>();
-
-        // Get the UrlPtrPos or one time storage
-        long urlPtrPos = this.mFile.header_urlPtrPos;
+    public String getURLByTitleOrder(int entryNumber) throws IOException {
 
         // Move to the spot where URL's are listed
-        this.mReader.seek(this.mFile.header_titlePtrPos);
-
-        for (i = 0; i < this.mFile.header_entryCount; i++) {
+        this.mReader.seek(this.mFile.header_titlePtrPos + 8L * entryNumber);
 
-            // The articleNumber of the position of URL i
-            articleNumber = this.mReader.readFourLittleEndianBytesInt();
+        // The articleNumber of the position of URL i
+        int articleNumber = this.mReader.readFourLittleEndianBytesInt();
 
-            // Mark the current position that we need to return to
-            this.mReader.mark();
+        this.mReader.seek(this.mFile.header_urlPtrPos + (8L * (articleNumber)));
 
-            this.mReader.seek(urlPtrPos + (8L * (articleNumber)));
+        // The position of URL i
+        long pos = this.mReader.readEightLittleEndianBytesLong();
+        this.mReader.seek(pos);
 
-            // The position of URL i
-            long pos = this.mReader.readEightLittleEndianBytesLong();
-            this.mReader.seek(pos);
+        // Article or Redirect entry?
+        int mimeType = this.mReader.readTwoLittleEndianBytesInt();
 
-            // Article or Redirect entry?
-            mimeType = this.mReader.readTwoLittleEndianBytesInt();
-
-            if (mimeType == 65535) {
-                this.mReader.seek(pos + 12);
-                final String url = this.mReader.readZeroTerminatedString();
-                returnList.add(url);
-            } else {
-                this.mReader.seek(pos + 16);
-                final String url = this.mReader.readZeroTerminatedString();
-                returnList.add(url);
-            }
-
-            // Return to the marked position
-            this.mReader.reset();
+        if (mimeType == 65535) {
+            this.mReader.seek(pos + 12);
+            return this.mReader.readZeroTerminatedString();
+        } else {
+            this.mReader.seek(pos + 16);
+            return this.mReader.readZeroTerminatedString();
         }
-
-        return returnList;
     }
 
     // position must be the seek position for the title in the Title Pointer List
@@ -291,7 +256,7 @@ public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOExcept
         final ArticleEntry article = (ArticleEntry) directoryInfo;
 
         // Move to the cluster entry in the clusterPtrPos
-        this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8);
+        this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8L);
 
         // Read the location of the cluster
         final long clusterPos = this.mReader.readEightLittleEndianBytesLong();
@@ -302,78 +267,74 @@ public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOExcept
         // Read the first byte, for compression information
         final int compressionType = this.mReader.read();
 
-        // Reference declaration
-        int firstOffset, numberOfBlobs, offset1, offset2, location, differenceOffset;
-
         // Check the compression type that was read
-        if (compressionType == 1) {
-
-            // The first four bytes are the offset of the zeroth blob
-            firstOffset = this.mReader.readFourLittleEndianBytesInt();
-
-            // The number of blobs
-            numberOfBlobs = firstOffset / 4;
-
-            // The blobNumber has to be lesser than the numberOfBlobs
-            assert article.blob_number < numberOfBlobs;
-            if (article.blob_number == 0) {
-                // The first offset is what we read earlier
-                offset1 = firstOffset;
-            } else {
-                location = (article.blob_number - 1) * 4;
-                RandomAccessFileZIMInputStream.skipFully(this.mReader, location);
-                offset1 = this.mReader.readFourLittleEndianBytesInt();
-            }
-
-            offset2 = this.mReader.readFourLittleEndianBytesInt();
-            differenceOffset = offset2 - offset1;
-            byte[] entry = new byte[differenceOffset];
-            RandomAccessFileZIMInputStream.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2)));
-            this.mReader.read(entry, 0, differenceOffset);
-
-            return entry;
+        // type = 1 uncompressed
+        if (compressionType <= 1 || compressionType == 8 || compressionType == 9) {
+            boolean extended = compressionType > 1;
+            return readClusterEntry(this.mReader, article.blob_number, extended);
         }
         // 2 for zlib and 3 for bzip2 (removed)
 
         // LZMA2 compressed data
-        if (compressionType == 4) {
+        if (compressionType == 4 || compressionType == 12) {
+            boolean extended = compressionType == 12;
+            // Create a dictionary with size 40MiB, the zimlib uses this size while creating
+            SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 41943040);
+            return readClusterEntry(xzReader, article.blob_number, extended);
+        }
 
-            // Read the first 4 bytes to find out the number of artciles
-            byte[] buffer = new byte[4];
+        // Zstandard compressed data
+        if (compressionType == 5 || compressionType == 13) {
+            boolean extended = compressionType == 13;
+            ZstdInputStream zReader = new ZstdInputStream(this.mReader);
+            return readClusterEntry(zReader, article.blob_number, extended);
+        }
 
-            // Create a dictionary with size 40MiB, the zimlib uses this size while creating
-            SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 4194304);
+        return null;
+    }
 
-            // The first four bytes are the offset of the zeroth blob
-            firstOffset = this.mReader.readFourLittleEndianBytesInt();
+    private static byte[] readClusterEntry(InputStream is, int blob_number, boolean extended) throws IOException {
 
-            // The number of blobs
-            numberOfBlobs = firstOffset / 4;
+        // Read the first 4(8) bytes to find out the number of articles
+        byte[] buffer = new byte[extended ? 8 : 4];
 
-            // The blobNumber has to be lesser than the numberOfBlobs
-            assert article.blob_number < numberOfBlobs;
-            if (article.blob_number == 0) {
-                // The first offset is what we read earlier
-                offset1 = firstOffset;
-            } else {
-                location = (article.blob_number - 1) * 4;
-                RandomAccessFileZIMInputStream.skipFully(xzReader, location);
-                xzReader.read(buffer);
-                offset1 = RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
-            }
+        // The first four (eight) bytes are the offset of the zeroth blob
+        is.read(buffer);
+        long firstOffset = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
 
-            xzReader.read(buffer);
-            offset2 = RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
-            differenceOffset = offset2 - offset1;
-            byte[] entry = new byte[differenceOffset];
-            RandomAccessFileZIMInputStream.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2)));
-            xzReader.read(entry, 0, differenceOffset);
+        // The number of blobs can be computed by the offset
+        // the actual number is one less because there is one more offset entry than the actual number
+        // to identify the end of the last blob.
+        long numberOfBlobs1 = extended ? firstOffset / 8 : firstOffset / 4;
 
-            return entry;
+        // The blobNumber has to be lesser than the numberOfBlobs - 1
+        // the blob numbers start with 0 even if the documentation states it is "the first blob".
+        assert blob_number < numberOfBlobs1 - 1;
+        long offset1;
+        if (blob_number == 0) {
+            // The first offset is what we read earlier
+            offset1 = firstOffset;
+        } else {
+            // skip one less than required to get to the offset entry because the first entry is already read
+            RandomAccessFileZIMInputStream.skipFully(is, (blob_number - 1) * (extended ? 8 : 4));
+            is.read(buffer);
+            offset1 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
         }
-
-        // case 5: zstd compressed (missing!)
-        return null;
+        is.read(buffer);
+        long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
+        long blob_size = offset2 - offset1;
+        byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT
+        // we must do two skip steps: first to the end of the offset list and second to the start of the blob
+        // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset
+        // - the number of offset entries that we alreay read now is article.blob_number + 2 (in any case at least 2)
+        // - the remaining number of offset entries to skip is therefore numberOfBlobs1 - (article.blob_number + 2)
+        // - the addon skip of number of bytes to the start of the entry is offset1 - firstoffset with firstoffset = 4 * numberOfBlobs1
+        // - the full skip length is 4 * (numberOfBlobs1 - (article.blob_number + 2)) + offset1 - 4 * numberOfBlobs1
+        //   = offset1 - 4 * (article.blob_number + 2)
+        RandomAccessFileZIMInputStream.skipFully(is, (offset1 - (extended ? 8 : 4) * (blob_number + 2)));
+        is.read(entry, 0, entry.length);
+
+        return entry;
     }
 
 }
diff --git a/source/org/openzim/ZIMTest.java b/source/org/openzim/ZIMTest.java
@@ -20,7 +20,6 @@
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
-import java.util.List;
 
 import org.openzim.ZIMReader.DirectoryEntry;
 
@@ -40,12 +39,10 @@ public static void main(final String[] args) {
             final ZIMReader zReader = new ZIMReader(file);
 
             // print a list of urls and titles
-            final List<String> urls = zReader.getURLListByURL();
-            final List<String> titles = zReader.getURLListByTitle();
-            int c = Math.min(10, titles.size());
+            int c = Math.min(10, file.header_entryCount);
             for (int i = 0; i < c; i++) {
-                System.out.println("URL by URL   " + i + ": " + urls.get(i));
-                System.out.println("URL by Title " + i + ": " + titles.get(i));
+                System.out.println("URL by URL   " + i + ": " + zReader.getURLByURLOrder(i));
+                System.out.println("URL by Title " + i + ": " + zReader.getURLByTitleOrder(i));
                 DirectoryEntry entry = zReader.getDirectoryInfo(i);
                 System.out.println("URL   by Pos " + i + ": " + entry.url);
                 System.out.println("Title by Pos " + i + ": " + entry.title);