diff --git a/ivy.xml b/ivy.xml index c5f13b585..61f9ee127 100644 --- a/ivy.xml +++ b/ivy.xml @@ -98,5 +98,6 @@ + diff --git a/source/org/openzim/RandomAccessFileZIMInputStream.java b/source/org/openzim/RandomAccessFileZIMInputStream.java index 4d81768c8..3b4f4d1ed 100644 --- a/source/org/openzim/RandomAccessFileZIMInputStream.java +++ b/source/org/openzim/RandomAccessFileZIMInputStream.java @@ -18,6 +18,7 @@ package org.openzim; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; @@ -70,7 +71,7 @@ public static int toFourLittleEndianInteger(final byte[] buffer) { // TODO: make | ((buffer[2] & 0xFF) << 16) | ((buffer[3] & 0xFF) << 24)); } - private static long toEightLittleEndianLong(final byte[] buffer) { + public static long toEightLittleEndianLong(final byte[] buffer) { return // cast to long required otherwise this is again an integer ((long)(buffer[0] & 0xFF) | ((long)(buffer[1] & 0xFF) << 8) | ((long)(buffer[2] & 0xFF) << 16) | ((long)(buffer[3] & 0xFF) << 24) @@ -85,13 +86,12 @@ public static void skipFully(final InputStream stream, final long bytes) throws // Reads characters from the current position into a String and stops when a // '\0' is encountered public String readZeroTerminatedString() throws IOException { - final StringBuilder sb = new StringBuilder(); - int b = this.mRAFReader.read(); - while (b != '\0') { - sb.append((char) b); - b = this.mRAFReader.read(); + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + int b; + while ((b = this.mRAFReader.read()) != '\0' && b != -1) { + buffer.write(b); } - return sb.toString(); + return buffer.toString("UTF-8"); } @Override diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index 4cecccacd..cf11d6342 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -18,14 +18,15 @@ package org.openzim; -import java.io.ByteArrayOutputStream; import 
java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.List; import org.tukaani.xz.SingleXZInputStream; +import com.github.luben.zstd.ZstdInputStream; /** * @author Arunesh Mathur @@ -36,7 +37,9 @@ * naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format, * change of Exception handling, * extension to more attributes as defined in spec (bugfix for mime type loading) - * bugfix to long parsing (prevented reading of large files) + * bugfix to long parsing (prevented reading of large files), + * added extended cluster size parsing + * added ZStandard compression parsing (cluster type 5) */ public class ZIMReader { @@ -110,91 +113,53 @@ public ZIMFile getZIMFile() { return this.mFile; } - // get a URL list that is sorted by the urls - public List getURLListByURL() throws IOException { - - int i = 0, mimeType; - - // The list that will eventually return the list of URL's - final ArrayList returnList = new ArrayList<>(); + public String getURLByURLOrder(int entryNumber) throws IOException { // Move to the spot where URL's are listed - this.mReader.seek(this.mFile.header_urlPtrPos); - - for (i = 0; i < this.mFile.header_entryCount; i++) { - - // The position of URL i - long pos = this.mReader.readEightLittleEndianBytesLong(); - - // Mark the current position that we need to return to - this.mReader.mark(); + this.mReader.seek(this.mFile.header_urlPtrPos + 8L * entryNumber); - // Move to the position of URL i - this.mReader.seek(pos); + // The position of URL i + long pos = this.mReader.readEightLittleEndianBytesLong(); - // Article or Redirect entry? 
- mimeType = this.mReader.readTwoLittleEndianBytesInt(); + // Move to the position of URL i + this.mReader.seek(pos); - if (mimeType == 65535) { - this.mReader.seek(pos + 12); - returnList.add(this.mReader.readZeroTerminatedString()); - } else { - this.mReader.seek(pos + 16); - returnList.add(this.mReader.readZeroTerminatedString()); - } + // Article or Redirect entry? + int mimeType = this.mReader.readTwoLittleEndianBytesInt(); - this.mReader.reset(); + if (mimeType == 65535) { + this.mReader.seek(pos + 12); + return this.mReader.readZeroTerminatedString(); + } else { + this.mReader.seek(pos + 16); + return this.mReader.readZeroTerminatedString(); } - - return returnList; } - // get a URL list that is sorted by the entry titles - public List getURLListByTitle() throws IOException { - - int i = 0, mimeType, articleNumber; - - // The list that will eventually return the list of URL's - final ArrayList returnList = new ArrayList<>(); - - // Get the UrlPtrPos or one time storage - long urlPtrPos = this.mFile.header_urlPtrPos; + public String getURLByTitleOrder(int entryNumber) throws IOException { // Move to the spot where URL's are listed - this.mReader.seek(this.mFile.header_titlePtrPos); - - for (i = 0; i < this.mFile.header_entryCount; i++) { + this.mReader.seek(this.mFile.header_titlePtrPos + 8L * entryNumber); - // The articleNumber of the position of URL i - articleNumber = this.mReader.readFourLittleEndianBytesInt(); + // The articleNumber of the position of URL i + int articleNumber = this.mReader.readFourLittleEndianBytesInt(); - // Mark the current position that we need to return to - this.mReader.mark(); + this.mReader.seek(this.mFile.header_urlPtrPos + (8L * (articleNumber))); - this.mReader.seek(urlPtrPos + (8L * (articleNumber))); + // The position of URL i + long pos = this.mReader.readEightLittleEndianBytesLong(); + this.mReader.seek(pos); - // The position of URL i - long pos = this.mReader.readEightLittleEndianBytesLong(); - this.mReader.seek(pos); 
+ // Article or Redirect entry? + int mimeType = this.mReader.readTwoLittleEndianBytesInt(); - // Article or Redirect entry? - mimeType = this.mReader.readTwoLittleEndianBytesInt(); - - if (mimeType == 65535) { - this.mReader.seek(pos + 12); - final String url = this.mReader.readZeroTerminatedString(); - returnList.add(url); - } else { - this.mReader.seek(pos + 16); - final String url = this.mReader.readZeroTerminatedString(); - returnList.add(url); - } - - // Return to the marked position - this.mReader.reset(); + if (mimeType == 65535) { + this.mReader.seek(pos + 12); + return this.mReader.readZeroTerminatedString(); + } else { + this.mReader.seek(pos + 16); + return this.mReader.readZeroTerminatedString(); } - - return returnList; } // position must be the seek position for the title in the Title Pointer List @@ -291,7 +256,7 @@ public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOExcept final ArticleEntry article = (ArticleEntry) directoryInfo; // Move to the cluster entry in the clusterPtrPos - this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8); + this.mReader.seek(this.mFile.header_clusterPtrPos + article.cluster_number * 8L); // Read the location of the cluster final long clusterPos = this.mReader.readEightLittleEndianBytesLong(); @@ -302,78 +267,74 @@ public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOExcept // Read the first byte, for compression information final int compressionType = this.mReader.read(); - // Reference declaration - int firstOffset, numberOfBlobs, offset1, offset2, location, differenceOffset; - // Check the compression type that was read - if (compressionType == 1) { - - // The first four bytes are the offset of the zeroth blob - firstOffset = this.mReader.readFourLittleEndianBytesInt(); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert article.blob_number < numberOfBlobs; - if 
(article.blob_number == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - location = (article.blob_number - 1) * 4; - RandomAccessFileZIMInputStream.skipFully(this.mReader, location); - offset1 = this.mReader.readFourLittleEndianBytesInt(); - } - - offset2 = this.mReader.readFourLittleEndianBytesInt(); - differenceOffset = offset2 - offset1; - byte[] entry = new byte[differenceOffset]; - RandomAccessFileZIMInputStream.skipFully(this.mReader, (offset1 - 4 * (article.blob_number + 2))); - this.mReader.read(entry, 0, differenceOffset); - - return entry; + // type = 1 uncompressed + if (compressionType <= 1 || compressionType == 8 || compressionType == 9) { + boolean extended = compressionType > 1; + return readClusterEntry(this.mReader, article.blob_number, extended); } // 2 for zlib and 3 for bzip2 (removed) // LZMA2 compressed data - if (compressionType == 4) { + if (compressionType == 4 || compressionType == 12) { + boolean extended = compressionType == 12; + // Create a dictionary with size 40MiB, the zimlib uses this size while creating + SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 41943040); + return readClusterEntry(xzReader, article.blob_number, extended); + } - // Read the first 4 bytes to find out the number of artciles - byte[] buffer = new byte[4]; + // Zstandard compressed data + if (compressionType == 5 || compressionType == 13) { + boolean extended = compressionType == 13; + ZstdInputStream zReader = new ZstdInputStream(this.mReader); + return readClusterEntry(zReader, article.blob_number, extended); + } - // Create a dictionary with size 40MiB, the zimlib uses this size while creating - SingleXZInputStream xzReader= new SingleXZInputStream(this.mReader, 4194304); + return null; + } - // The first four bytes are the offset of the zeroth blob - firstOffset = this.mReader.readFourLittleEndianBytesInt(); + private static byte[] readClusterEntry(InputStream is, int blob_number, boolean extended) 
throws IOException { - // The number of blobs - numberOfBlobs = firstOffset / 4; + // Read the first 4(8) bytes to find out the number of articles + byte[] buffer = new byte[extended ? 8 : 4]; - // The blobNumber has to be lesser than the numberOfBlobs - assert article.blob_number < numberOfBlobs; - if (article.blob_number == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - location = (article.blob_number - 1) * 4; - RandomAccessFileZIMInputStream.skipFully(xzReader, location); - xzReader.read(buffer); - offset1 = RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); - } + // The first four (eight) bytes are the offset of the zeroth blob + is.read(buffer); + long firstOffset = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); - xzReader.read(buffer); - offset2 = RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); - differenceOffset = offset2 - offset1; - byte[] entry = new byte[differenceOffset]; - RandomAccessFileZIMInputStream.skipFully(xzReader, (offset1 - 4 * (article.blob_number + 2))); - xzReader.read(entry, 0, differenceOffset); + // The number of blobs can be computed by the offset + // the actual number is one less because there is one more offset entry than the actual number + // to identify the end of the last blob. + long numberOfBlobs1 = extended ? firstOffset / 8 : firstOffset / 4; - return entry; + // The blobNumber has to be less than numberOfBlobs1 - 1 + // the blob numbers start with 0 even if the documentation states it is "the first blob". + assert blob_number < numberOfBlobs1 - 1; + long offset1; + if (blob_number == 0) { + // The first offset is what we read earlier + offset1 = firstOffset; + } else { + // skip one less than required to get to the offset entry because the first entry is already read + RandomAccessFileZIMInputStream.skipFully(is, (blob_number - 1) * (extended ? 
8 : 4)); + is.read(buffer); + offset1 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); } - - // case 5: zstd compressed (missing!) - return null; + is.read(buffer); + long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); + long blob_size = offset2 - offset1; + byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT + // we must do two skip steps: first to the end of the offset list and second to the start of the blob + // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset + // - the number of offset entries that we already read now is blob_number + 2 (in any case at least 2) + // - the remaining number of offset entries to skip is therefore numberOfBlobs1 - (blob_number + 2) + // - the addon skip of number of bytes to the start of the entry is offset1 - firstOffset with firstOffset = 4 * numberOfBlobs1 + // - the full skip length is 4 * (numberOfBlobs1 - (blob_number + 2)) + offset1 - 4 * numberOfBlobs1 + // = offset1 - 4 * (blob_number + 2) + RandomAccessFileZIMInputStream.skipFully(is, (offset1 - (extended ? 
8 : 4) * (blob_number + 2))); + is.read(entry, 0, entry.length); + + return entry; } } diff --git a/source/org/openzim/ZIMTest.java b/source/org/openzim/ZIMTest.java index ea77c3b90..cb8a28499 100644 --- a/source/org/openzim/ZIMTest.java +++ b/source/org/openzim/ZIMTest.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.util.List; import org.openzim.ZIMReader.DirectoryEntry; @@ -40,12 +39,10 @@ public static void main(final String[] args) { final ZIMReader zReader = new ZIMReader(file); // print a list of urls and titles - final List urls = zReader.getURLListByURL(); - final List titles = zReader.getURLListByTitle(); - int c = Math.min(10, titles.size()); + int c = Math.min(10, file.header_entryCount); for (int i = 0; i < c; i++) { - System.out.println("URL by URL " + i + ": " + urls.get(i)); - System.out.println("URL by Title " + i + ": " + titles.get(i)); + System.out.println("URL by URL " + i + ": " + zReader.getURLByURLOrder(i)); + System.out.println("URL by Title " + i + ": " + zReader.getURLByTitleOrder(i)); DirectoryEntry entry = zReader.getDirectoryInfo(i); System.out.println("URL by Pos " + i + ": " + entry.url); System.out.println("Title by Pos " + i + ": " + entry.title);