From 675acc15d2e85032d72e1d851e15fb7a05ecb5f4 Mon Sep 17 00:00:00 2001
From: Schamper <1254028+Schamper@users.noreply.github.com>
Date: Fri, 19 Jan 2024 00:19:49 +0100
Subject: [PATCH 1/6] Add XZ (LZMA) checksum repair utility

---
 dissect/util/compression/xz.py | 101 +++++++++++++++++++++++++++++++++
 tests/test_compression.py      |  16 ++++++
 2 files changed, 117 insertions(+)
 create mode 100644 dissect/util/compression/xz.py

diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py
new file mode 100644
index 0000000..f09a7be
--- /dev/null
+++ b/dissect/util/compression/xz.py
@@ -0,0 +1,101 @@
+import io
+from binascii import crc32
+from typing import BinaryIO
+
+from dissect.util.stream import OverlayStream
+
+
+def repair_checksum(fh: BinaryIO) -> BinaryIO:
+    """Repair CRC32 checksums for all headers in an XZ stream.
+
+    FortiOS XZ files have (on purpose) corrupt streams which they read using a modified ``xz`` binary.
+    The only things changed are the CRC32 checksums, so partially parse the XZ file and fix all of them.
+
+    References:
+        - https://tukaani.org/xz/xz-file-format-1.1.0.txt
+        - https://github.com/Rogdham/python-xz
+
+    Args:
+        fh: A file-like object of an LZMA stream to repair.
+    """
+    size = fh.seek(0, io.SEEK_END)
+    repaired = OverlayStream(fh, size)
+    fh.seek(0)
+
+    header = fh.read(12)
+    # Check header magic
+    if header[:6] != b"\xfd7zXZ\x00":
+        raise ValueError("Not an XZ file")
+
+    # Add correct header CRC32
+    repaired.add(8, _crc32(header[6:8]))
+
+    fh.seek(-12, io.SEEK_END)
+    footer = fh.read(12)
+
+    # Check footer magic
+    if footer[10:12] != b"YZ":
+        raise ValueError("Not an XZ file")
+
+    # Add correct footer CRC32
+    repaired.add(fh.tell() - 12, _crc32(footer[4:10]))
+
+    backward_size = (int.from_bytes(footer[4:8], "little") + 1) * 4
+    fh.seek(-12 - backward_size, io.SEEK_END)
+    index = fh.read(backward_size)
+
+    # Add correct index CRC32
+    repaired.add(fh.tell() - 4, _crc32(index[:-4]))
+
+    # Parse the index
+    isize, nb_records = _mbi(index[1:])
+    index = index[1 + isize : -4]
+    records = []
+    for _ in range(nb_records):
+        if not index:
+            raise ValueError("index size")
+
+        isize, unpadded_size = _mbi(index)
+        if not unpadded_size:
+            raise ValueError("index record unpadded size")
+
+        index = index[isize:]
+        if not index:
+            raise ValueError("index size")
+
+        isize, uncompressed_size = _mbi(index)
+        if not uncompressed_size:
+            raise ValueError("index record uncompressed size")
+
+        index = index[isize:]
+        records.append((unpadded_size, uncompressed_size))
+
+    block_start = size - 12 - backward_size
+    blocks_len = sum((unpadded_size + 3) & ~3 for unpadded_size, _ in records)
+    block_start -= blocks_len
+
+    # Iterate over all blocks and add the correct block header CRC32
+    for unpadded_size, _ in records:
+        fh.seek(block_start)
+
+        block_header = fh.read(1)
+        block_header_size = (block_header[0] + 1) * 4
+        block_header += fh.read(block_header_size - 1)
+        repaired.add(fh.tell() - 4, _crc32(block_header[:-4]))
+
+        block_start += (unpadded_size + 3) & ~3
+
+    return repaired
+
+
+def _mbi(data: bytes) -> tuple[int, int]:
+    value = 0
+    for size, byte in enumerate(data):
+        value |= (byte & 0x7F) << (size * 7)
+        if not byte & 0x80:
+            return size + 1, value
+    raise ValueError("Invalid mbi")
+
+
+def _crc32(data: bytes) -> bytes:
+    return int.to_bytes(crc32(data), 4, "little")
diff --git a/tests/test_compression.py b/tests/test_compression.py
index d602dbe..2480d44 100644
--- a/tests/test_compression.py
+++ b/tests/test_compression.py
@@ -1,4 +1,6 @@
 import hashlib
+import lzma
+from io import BytesIO
 
 from dissect.util.compression import (
     lz4,
@@ -7,6 +9,7 @@
     lzxpress,
     lzxpress_huffman,
     sevenbit,
+    xz,
 )
 
 
@@ -254,3 +257,16 @@ def test_sevenbit_decompress_wide():
     result = sevenbit.decompress(bytes.fromhex("b796384d078ddf6db8bc3c9fa7df6e10bd3ca783e67479da7d06"), wide=True)
     target = "7-bit compression test string".encode("utf-16-le")
     assert result == target
+
+
+def test_xz_repair_checksum():
+    buf = BytesIO(
+        bytes.fromhex(
+            "fd377a585a000004deadbeef0200210116000000deadbeefe00fff001e5d003a"
+            "194ace2b0f238ce989a29cfeb182a4e814985366b771770233ca314836000000"
+            "2972e8fd62b18ee300013a8020000000deadbeefdeadbeef020000000004595a"
+        )
+    )
+    repaired = xz.repair_checksum(buf)
+
+    assert lzma.decompress(repaired.read()) == b"test" * 1024

From 471c5dc5c30e4d6d50d792e50d5410d620cfdc85 Mon Sep 17 00:00:00 2001
From: Schamper <1254028+Schamper@users.noreply.github.com>
Date: Mon, 22 Jan 2024 12:41:01 +0100
Subject: [PATCH 2/6] Process review comments

---
 dissect/util/compression/xz.py | 41 +++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py
index f09a7be..9c1bc44 100644
--- a/dissect/util/compression/xz.py
+++ b/dissect/util/compression/xz.py
@@ -4,6 +4,9 @@
 
 from dissect.util.stream import OverlayStream
 
+HEADER_FOOTER_SIZE = 12
+CRC_SIZE = 4
+
 
 def repair_checksum(fh: BinaryIO) -> BinaryIO:
     """Repair CRC32 checksums for all headers in an XZ stream.
@@ -22,50 +25,52 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
     repaired = OverlayStream(fh, size)
     fh.seek(0)
 
-    header = fh.read(12)
+    header = fh.read(HEADER_FOOTER_SIZE)
     # Check header magic
-    if header[:6] != b"\xfd7zXZ\x00":
+    magic = b"\xfd7zXZ\x00"
+    if header[: len(magic)] != magic:
         raise ValueError("Not an XZ file")
 
     # Add correct header CRC32
     repaired.add(8, _crc32(header[6:8]))
 
-    fh.seek(-12, io.SEEK_END)
-    footer = fh.read(12)
+    footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END)
+    footer = fh.read(HEADER_FOOTER_SIZE)
 
     # Check footer magic
-    if footer[10:12] != b"YZ":
+    footer_magic = b"YZ"
+    if footer[HEADER_FOOTER_SIZE - len(footer_magic) : HEADER_FOOTER_SIZE] != footer_magic:
         raise ValueError("Not an XZ file")
 
     # Add correct footer CRC32
-    repaired.add(fh.tell() - 12, _crc32(footer[4:10]))
+    repaired.add(footer_offset, _crc32(footer[CRC_SIZE : HEADER_FOOTER_SIZE - len(footer_magic)]))
 
     backward_size = (int.from_bytes(footer[4:8], "little") + 1) * 4
-    fh.seek(-12 - backward_size, io.SEEK_END)
+    fh.seek(-HEADER_FOOTER_SIZE - backward_size, io.SEEK_END)
     index = fh.read(backward_size)
 
     # Add correct index CRC32
-    repaired.add(fh.tell() - 4, _crc32(index[:-4]))
+    repaired.add(fh.tell() - CRC_SIZE, _crc32(index[:-CRC_SIZE]))
 
     # Parse the index
-    isize, nb_records = _mbi(index[1:])
+    isize, num_records = _mbi(index[1:])
     index = index[1 + isize : -4]
     records = []
-    for _ in range(nb_records):
+    for _ in range(num_records):
         if not index:
-            raise ValueError("index size")
+            raise ValueError("Missing index size")
 
         isize, unpadded_size = _mbi(index)
         if not unpadded_size:
-            raise ValueError("index record unpadded size")
+            raise ValueError("Missing index record unpadded size")
 
         index = index[isize:]
         if not index:
-            raise ValueError("index size")
+            raise ValueError("Missing index size")
 
         isize, uncompressed_size = _mbi(index)
         if not uncompressed_size:
-            raise ValueError("index record uncompressed size")
+            raise ValueError("Missing index record uncompressed size")
 
         index = index[isize:]
         records.append((unpadded_size, uncompressed_size))
@@ -81,7 +86,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
         block_header = fh.read(1)
         block_header_size = (block_header[0] + 1) * 4
         block_header += fh.read(block_header_size - 1)
-        repaired.add(fh.tell() - 4, _crc32(block_header[:-4]))
+        repaired.add(fh.tell() - CRC_SIZE, _crc32(block_header[:-4]))
 
         block_start += (unpadded_size + 3) & ~3
 
@@ -89,6 +94,12 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
 
 
 def _mbi(data: bytes) -> tuple[int, int]:
+    """Decode a multibyte integer.
+
+    The encoding is similar to most other "varint" encodings. For each byte, the 7 least significant bits are used for
+    the integer value. The most significant bit is used to indicate if the integer continues in the next byte.
+    Bytes are ordered in little endian byte order, meaning the least significant byte comes first.
+    """
     value = 0
     for size, byte in enumerate(data):
         value |= (byte & 0x7F) << (size * 7)

From c6dfc6f21e40d155046b528c8ba45762d2f3ab02 Mon Sep 17 00:00:00 2001
From: Erik Schamper <1254028+Schamper@users.noreply.github.com>
Date: Mon, 22 Jan 2024 16:02:03 +0100
Subject: [PATCH 3/6] Apply suggestions from code review

Co-authored-by: pyrco <105293448+pyrco@users.noreply.github.com>
---
 dissect/util/compression/xz.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py
index 9c1bc44..eeacd8b 100644
--- a/dissect/util/compression/xz.py
+++ b/dissect/util/compression/xz.py
@@ -21,7 +21,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
     Args:
         fh: A file-like object of an LZMA stream to repair.
     """
-    size = fh.seek(0, io.SEEK_END)
+    file_size = fh.seek(0, io.SEEK_END)
     repaired = OverlayStream(fh, size)
     fh.seek(0)
 
@@ -75,7 +75,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
         index = index[isize:]
         records.append((unpadded_size, uncompressed_size))
 
-    block_start = size - 12 - backward_size
+    block_start = file_size - HEADER_FOOTER_SIZE - backward_size
     blocks_len = sum((unpadded_size + 3) & ~3 for unpadded_size, _ in records)
     block_start -= blocks_len
 
@@ -86,7 +86,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
         block_header = fh.read(1)
         block_header_size = (block_header[0] + 1) * 4
         block_header += fh.read(block_header_size - 1)
-        repaired.add(fh.tell() - CRC_SIZE, _crc32(block_header[:-4]))
+        repaired.add(fh.tell() - CRC_SIZE, _crc32(block_header[:-CRC_SIZE]))
 
         block_start += (unpadded_size + 3) & ~3
 
@@ -109,4 +109,4 @@ def _mbi(data: bytes) -> tuple[int, int]:
 
 
 def _crc32(data: bytes) -> bytes:
-    return int.to_bytes(crc32(data), 4, "little")
+    return int.to_bytes(crc32(data), CRC_SIZE, "little")

From cad8549a1c222bd12f6e2d8431ca5fa0c13cf68c Mon Sep 17 00:00:00 2001
From: Schamper <1254028+Schamper@users.noreply.github.com>
Date: Mon, 22 Jan 2024 16:03:24 +0100
Subject: [PATCH 4/6] Fix stale `size` reference and use constants for header CRC32 offset

---
 dissect/util/compression/xz.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py
index eeacd8b..d6fb049 100644
--- a/dissect/util/compression/xz.py
+++ b/dissect/util/compression/xz.py
@@ -22,7 +22,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
         fh: A file-like object of an LZMA stream to repair.
     """
     file_size = fh.seek(0, io.SEEK_END)
-    repaired = OverlayStream(fh, size)
+    repaired = OverlayStream(fh, file_size)
     fh.seek(0)
 
     header = fh.read(HEADER_FOOTER_SIZE)
@@ -32,7 +32,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
         raise ValueError("Not an XZ file")
 
     # Add correct header CRC32
-    repaired.add(8, _crc32(header[6:8]))
+    repaired.add(HEADER_FOOTER_SIZE - CRC_SIZE, _crc32(header[6:8]))
 
     footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END)
     footer = fh.read(HEADER_FOOTER_SIZE)

From 4e7a609c5b16f48f819b6ff4df40bb414f1047bb Mon Sep 17 00:00:00 2001
From: Schamper <1254028+Schamper@users.noreply.github.com>
Date: Mon, 22 Jan 2024 16:05:57 +0100
Subject: [PATCH 5/6] Derive header CRC32 input slice from magic length and size constants

---
 dissect/util/compression/xz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py
index d6fb049..4862445 100644
--- a/dissect/util/compression/xz.py
+++ b/dissect/util/compression/xz.py
@@ -32,7 +32,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
         raise ValueError("Not an XZ file")
 
     # Add correct header CRC32
-    repaired.add(HEADER_FOOTER_SIZE - CRC_SIZE, _crc32(header[6:8]))
+    repaired.add(HEADER_FOOTER_SIZE - CRC_SIZE, _crc32(header[len(magic) : HEADER_FOOTER_SIZE - CRC_SIZE]))
 
     footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END)
     footer = fh.read(HEADER_FOOTER_SIZE)

From a027734e02cc75718e63a7cb7855f30a9db2fd34 Mon Sep 17 00:00:00 2001
From: Erik Schamper <1254028+Schamper@users.noreply.github.com>
Date: Mon, 22 Jan 2024 16:11:26 +0100
Subject: [PATCH 6/6] Use fh.tell() for the header CRC32 offset in xz.py

Co-authored-by: pyrco <105293448+pyrco@users.noreply.github.com>
---
 dissect/util/compression/xz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dissect/util/compression/xz.py b/dissect/util/compression/xz.py
index 4862445..826288b 100644
--- a/dissect/util/compression/xz.py
+++ b/dissect/util/compression/xz.py
@@ -32,7 +32,7 @@ def repair_checksum(fh: BinaryIO) -> BinaryIO:
         raise ValueError("Not an XZ file")
 
     # Add correct header CRC32
-    repaired.add(HEADER_FOOTER_SIZE - CRC_SIZE, _crc32(header[len(magic) : HEADER_FOOTER_SIZE - CRC_SIZE]))
+    repaired.add(fh.tell() - CRC_SIZE, _crc32(header[len(magic) : HEADER_FOOTER_SIZE - CRC_SIZE]))
 
     footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END)
     footer = fh.read(HEADER_FOOTER_SIZE)