From 675acc15d2e85032d72e1d851e15fb7a05ecb5f4 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Fri, 19 Jan 2024 00:19:49 +0100 Subject: [PATCH 1/6] Add XZ (LZMA) checksum repair utility --- dissect/util/compression/xz.py | 101 +++++++++++++++++++++++++++++++++ tests/test_compression.py | 16 ++++++ 2 files changed, 117 insertions(+) create mode 100644 dissect/util/compression/xz.py diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py new file mode 100644 index 0000000..f09a7be --- /dev/null +++ b/dissect/util/compression/xz.py @@ -0,0 +1,101 @@ +import io +from binascii import crc32 +from typing import BinaryIO + +from dissect.util.stream import OverlayStream + + +def repair_checksum(fh: BinaryIO) -> BinaryIO: + """Repair CRC32 checksums for all headers in an XZ stream. + + FortiOS XZ files have (on purpose) corrupt streams which they read using a modified ``xz`` binary. + The only thing changed are the CRC32 checksums, so partially parse the XZ file and fix all of them. + + References: + - https://tukaani.org/xz/xz-file-format-1.1.0.txt + - https://github.com/Rogdham/python-xz + + Args: + fh: A file-like object of an LZMA stream to repair. + """ + size = fh.seek(0, io.SEEK_END) + repaired = OverlayStream(fh, size) + fh.seek(0) + + header = fh.read(12) + # Check header magic + if header[:6] != b"\xfd7zXZ\x00": + raise ValueError("Not an XZ file") + + # Add correct header CRC32 + repaired.add(8, _crc32(header[6:8])) + + fh.seek(-12, io.SEEK_END) + footer = fh.read(12) + + # Check footer magic + if footer[10:12] != b"YZ": + raise ValueError("Not an XZ file") + + # Add correct footer CRC32 + repaired.add(fh.tell() - 12, _crc32(footer[4:10])) + + backward_size = (int.from_bytes(footer[4:8], "little") + 1) * 4 + fh.seek(-12 - backward_size, io.SEEK_END) + index = fh.read(backward_size) + + # Add correct index CRC32 + repaired.add(fh.tell() - 4, _crc32(index[:-4])) + + # Parse the index + isize, nb_records = _mbi(index[1:]) + index = index[1 + isize : -4] + records = [] + for _ in range(nb_records): + if not index: + raise ValueError("index size") + + isize, unpadded_size = _mbi(index) + if not unpadded_size: + raise ValueError("index record unpadded size") + + index = index[isize:] + if not index: + raise ValueError("index size") + + isize, uncompressed_size = _mbi(index) + if not uncompressed_size: + raise ValueError("index record uncompressed size") + + index = index[isize:] + records.append((unpadded_size, uncompressed_size)) + + block_start = size - 12 - backward_size + blocks_len = sum((unpadded_size + 3) & ~3 for unpadded_size, _ in records) + block_start -= blocks_len + + # Iterate over all blocks and add the correct block header CRC32 + for unpadded_size, _ in records: + fh.seek(block_start) + + block_header = fh.read(1) + block_header_size = (block_header[0] + 1) * 4 + block_header += fh.read(block_header_size - 1) + repaired.add(fh.tell() - 4, _crc32(block_header[:-4])) + + block_start += (unpadded_size + 3) & ~3 + + return repaired + + +def _mbi(data: bytes) -> tuple[int, int]: + value = 0 + for size, byte in enumerate(data): + value |= (byte & 0x7F) << (size * 7) + if not byte & 0x80: + return size + 1, value + raise ValueError("Invalid mbi") + + +def _crc32(data: bytes) -> bytes: + return int.to_bytes(crc32(data), 4, "little") diff --git a/tests/test_compression.py b/tests/test_compression.py index d602dbe..2480d44 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -1,4 +1,6 @@ import hashlib +import lzma +from io import BytesIO from dissect.util.compression import ( lz4, @@ -7,6 +9,7 @@ lzxpress, lzxpress_huffman, sevenbit, + xz, ) @@ -254,3 +257,16 @@ def test_sevenbit_decompress_wide(): result = sevenbit.decompress(bytes.fromhex("b796384d078ddf6db8bc3c9fa7df6e10bd3ca783e67479da7d06"), wide=True) target = "7-bit compression test string".encode("utf-16-le") assert result == target + + +def test_xz_repair_checksum(): + buf = BytesIO( + bytes.fromhex( + "fd377a585a000004deadbeef0200210116000000deadbeefe00fff001e5d003a" + "194ace2b0f238ce989a29cfeb182a4e814985366b771770233ca314836000000" + "2972e8fd62b18ee300013a8020000000deadbeefdeadbeef020000000004595a" + ) + ) + repaired = xz.repair_checksum(buf) + + assert lzma.decompress(repaired.read()) == b"test" * 1024 From 471c5dc5c30e4d6d50d792e50d5410d620cfdc85 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 22 Jan 2024 12:41:01 +0100 Subject: [PATCH 2/6] Process comments --- dissect/util/compression/xz.py | 41 +++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py index f09a7be..9c1bc44 100644 --- a/dissect/util/compression/xz.py +++ b/dissect/util/compression/xz.py @@ -4,6 +4,9 @@ from dissect.util.stream import OverlayStream +HEADER_FOOTER_SIZE = 12 +CRC_SIZE = 4 + def repair_checksum(fh: BinaryIO) -> BinaryIO: """Repair CRC32 checksums for all headers in an XZ stream. @@ -22,50 +25,52 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: repaired = OverlayStream(fh, size) fh.seek(0) - header = fh.read(12) + header = fh.read(HEADER_FOOTER_SIZE) # Check header magic - if header[:6] != b"\xfd7zXZ\x00": + magic = b"\xfd7zXZ\x00" + if header[: len(magic)] != magic: raise ValueError("Not an XZ file") # Add correct header CRC32 repaired.add(8, _crc32(header[6:8])) - fh.seek(-12, io.SEEK_END) - footer = fh.read(12) + footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END) + footer = fh.read(HEADER_FOOTER_SIZE) # Check footer magic - if footer[10:12] != b"YZ": + footer_magic = b"YZ" + if footer[HEADER_FOOTER_SIZE - len(footer_magic) : HEADER_FOOTER_SIZE] != footer_magic: raise ValueError("Not an XZ file") # Add correct footer CRC32 - repaired.add(fh.tell() - 12, _crc32(footer[4:10])) + repaired.add(footer_offset, _crc32(footer[CRC_SIZE : HEADER_FOOTER_SIZE - len(footer_magic)])) backward_size = (int.from_bytes(footer[4:8], "little") + 1) * 4 - fh.seek(-12 - backward_size, io.SEEK_END) + fh.seek(-HEADER_FOOTER_SIZE - backward_size, io.SEEK_END) index = fh.read(backward_size) # Add correct index CRC32 - repaired.add(fh.tell() - 4, _crc32(index[:-4])) + repaired.add(fh.tell() - CRC_SIZE, _crc32(index[:-CRC_SIZE])) # Parse the index - isize, nb_records = _mbi(index[1:]) + isize, num_records = _mbi(index[1:]) index = index[1 + isize : -4] records = [] - for _ in range(nb_records): + for _ in range(num_records): if not index: - raise ValueError("index size") + raise ValueError("Missing index size") isize, unpadded_size = _mbi(index) if not unpadded_size: - raise ValueError("index record unpadded size") + raise ValueError("Missing index record unpadded size") index = index[isize:] if not index: - raise ValueError("index size") + raise ValueError("Missing index size") isize, uncompressed_size = _mbi(index) if not uncompressed_size: - raise ValueError("index record uncompressed size") + raise ValueError("Missing index record uncompressed size") index = index[isize:] records.append((unpadded_size, uncompressed_size)) @@ -81,7 +86,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: block_header = fh.read(1) block_header_size = (block_header[0] + 1) * 4 block_header += fh.read(block_header_size - 1) - repaired.add(fh.tell() - 4, _crc32(block_header[:-4])) + repaired.add(fh.tell() - CRC_SIZE, _crc32(block_header[:-4])) block_start += (unpadded_size + 3) & ~3 @@ -89,6 +94,12 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: def _mbi(data: bytes) -> tuple[int, int]: + """Decode a multibyte integer. + + The encoding is similar to most other "varint" encodings. For each byte, the 7 least significant bits are used for + the integer value. The most significant bit is used to indicate if the integer continues in the next byte. + Bytes are ordered in little endian byte order, meaning the least significant byte comes first. + """ value = 0 for size, byte in enumerate(data): value |= (byte & 0x7F) << (size * 7) From c6dfc6f21e40d155046b528c8ba45762d2f3ab02 Mon Sep 17 00:00:00 2001 From: Erik Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 22 Jan 2024 16:02:03 +0100 Subject: [PATCH 3/6] Apply suggestions from code review Co-authored-by: pyrco <105293448+pyrco@users.noreply.github.com> --- dissect/util/compression/xz.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py index 9c1bc44..eeacd8b 100644 --- a/dissect/util/compression/xz.py +++ b/dissect/util/compression/xz.py @@ -21,7 +21,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: Args: fh: A file-like object of an LZMA stream to repair. """ - size = fh.seek(0, io.SEEK_END) + file_size = fh.seek(0, io.SEEK_END) repaired = OverlayStream(fh, size) fh.seek(0) @@ -75,7 +75,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: index = index[isize:] records.append((unpadded_size, uncompressed_size)) - block_start = size - 12 - backward_size + block_start = file_size - HEADER_FOOTER_SIZE - backward_size blocks_len = sum((unpadded_size + 3) & ~3 for unpadded_size, _ in records) block_start -= blocks_len @@ -86,7 +86,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: block_header = fh.read(1) block_header_size = (block_header[0] + 1) * 4 block_header += fh.read(block_header_size - 1) - repaired.add(fh.tell() - CRC_SIZE, _crc32(block_header[:-4])) + repaired.add(fh.tell() - CRC_SIZE, _crc32(block_header[:-CRC_SIZE])) block_start += (unpadded_size + 3) & ~3 @@ -109,4 +109,4 @@ def _mbi(data: bytes) -> tuple[int, int]: def _crc32(data: bytes) -> bytes: - return int.to_bytes(crc32(data), 4, "little") + return int.to_bytes(crc32(data), CRC_SIZE, "little") From cad8549a1c222bd12f6e2d8431ca5fa0c13cf68c Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 22 Jan 2024 16:03:24 +0100 Subject: [PATCH 4/6] Update --- dissect/util/compression/xz.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py index eeacd8b..d6fb049 100644 --- a/dissect/util/compression/xz.py +++ b/dissect/util/compression/xz.py @@ -22,7 +22,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: fh: A file-like object of an LZMA stream to repair. """ file_size = fh.seek(0, io.SEEK_END) - repaired = OverlayStream(fh, size) + repaired = OverlayStream(fh, file_size) fh.seek(0) header = fh.read(HEADER_FOOTER_SIZE) @@ -32,7 +32,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: raise ValueError("Not an XZ file") # Add correct header CRC32 - repaired.add(8, _crc32(header[6:8])) + repaired.add(HEADER_FOOTER_SIZE - CRC_SIZE, _crc32(header[6:8])) footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END) footer = fh.read(HEADER_FOOTER_SIZE) From 4e7a609c5b16f48f819b6ff4df40bb414f1047bb Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 22 Jan 2024 16:05:57 +0100 Subject: [PATCH 5/6] Update --- dissect/util/compression/xz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py index d6fb049..4862445 100644 --- a/dissect/util/compression/xz.py +++ b/dissect/util/compression/xz.py @@ -32,7 +32,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: raise ValueError("Not an XZ file") # Add correct header CRC32 - repaired.add(HEADER_FOOTER_SIZE - CRC_SIZE, _crc32(header[6:8])) + repaired.add(HEADER_FOOTER_SIZE - CRC_SIZE, _crc32(header[len(magic) : HEADER_FOOTER_SIZE - CRC_SIZE])) footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END) footer = fh.read(HEADER_FOOTER_SIZE) From a027734e02cc75718e63a7cb7855f30a9db2fd34 Mon Sep 17 00:00:00 2001 From: Erik Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 22 Jan 2024 16:11:26 +0100 Subject: [PATCH 6/6] Update dissect/util/compression/xz.py Co-authored-by: pyrco <105293448+pyrco@users.noreply.github.com> --- dissect/util/compression/xz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py index 4862445..826288b 100644 --- a/dissect/util/compression/xz.py +++ b/dissect/util/compression/xz.py @@ -32,7 +32,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO: raise ValueError("Not an XZ file") # Add correct header CRC32 - repaired.add(HEADER_FOOTER_SIZE - CRC_SIZE, _crc32(header[len(magic) : HEADER_FOOTER_SIZE - CRC_SIZE])) + repaired.add(fh.tell() - CRC_SIZE, _crc32(header[len(magic) : HEADER_FOOTER_SIZE - CRC_SIZE])) footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END) footer = fh.read(HEADER_FOOTER_SIZE)