Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding zstd compression and decompression support for workload corpora #410

Merged
merged 2 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion osbenchmark/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import tarfile
import zipfile
from contextlib import suppress
import zstandard as zstd

import mmap

Expand Down Expand Up @@ -249,7 +250,7 @@ def is_archive(name):
:return: True iff the given file name is an archive that is also recognized for decompression by Benchmark.
"""
_, ext = splitext(name)
return ext in [".zip", ".bz2", ".gz", ".tar", ".tar.gz", ".tgz", ".tar.bz2"]
return ext in [".zip", ".bz2", ".gz", ".tar", ".tar.gz", ".tgz", ".tar.bz2", ".zst"]


def is_executable(name):
Expand All @@ -272,6 +273,28 @@ def compress(source_directory, archive_name):
_zipdir(source_directory, archive)


def compress_zstd(source_directory, archive_name):
"""
Compress a directory tree using Zstandard compression.
:param source_directory: The source directory to compress. Must be readable.
:param archive_name: The absolute path including the file name of the archive. Must have the extension .zst.
"""
zstc = zstd.ZstdCompressor()

with open(archive_name, "wb") as archive_file:
with zstc.stream_writer(archive_file) as compressor:
for root, _, files in os.walk(source_directory):
for file in files:
file_path = os.path.join(root, file)
rel_path = os.path.relpath(file_path, source_directory)
# Write the file path (relative) to the archive to recreate the directory structure
compressor.write(rel_path.encode("utf-8"))
with open(file_path, "rb") as source_file:
# Write the content of the file to the archive
for chunk in source_file:
compressor.write(chunk)


def decompress(zip_name, target_directory):
"""
Decompresses the provided archive to the target directory. The following file extensions are supported:
Expand All @@ -283,6 +306,7 @@ def decompress(zip_name, target_directory):
* tar.gz
* tgz
* tar.bz2
* zst

The decompression method is chosen based on the file extension.

Expand All @@ -303,6 +327,8 @@ def decompress(zip_name, target_directory):
_do_decompress_manually(target_directory, zip_name, decompressor_args, decompressor_lib)
elif extension in [".tar", ".tar.gz", ".tgz", ".tar.bz2"]:
_do_decompress(target_directory, tarfile.open(zip_name))
elif extension == ".zst":
_do_decompress_zstd(target_directory, zip_name)
else:
raise RuntimeError("Unsupported file extension [%s]. Cannot decompress [%s]" % (extension, zip_name))

Expand Down Expand Up @@ -344,6 +370,18 @@ def _do_decompress_manually_with_lib(target_directory, filename, compressed_file
compressed_file.close()


def _do_decompress_zstd(target_directory, filename):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of creating a separate function for this, is it possible to implement this through the _do_decompress_manually as is done with other decompressors like bz2? This would remove boilerplate code

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I'll be working on that, thanks

path_without_extension = os.path.splitext(os.path.basename(filename))[0]
try:
with open(filename, 'rb') as compressed_file:
zstd_decompressor = zstd.ZstdDecompressor()
with open(os.path.join(target_directory, path_without_extension), "wb") as new_file:
for chunk in zstd_decompressor.read_to_iter(compressed_file.read):
new_file.write(chunk)
except Exception as e:
logging.getLogger(__name__).warning("Failed to decompress [%s] with Zstandard. Error: %s.", filename, str(e))


def _do_decompress(target_directory, compressed_file):
try:
compressed_file.extractall(path=target_directory)
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ def str_from_file(name):
# jmespath: MIT
# s3transfer: Apache 2.0
"boto3==1.28.62",
# Licence: BSD-3-Clause
"zstandard==0.22.0",
]

tests_require = [
Expand Down
4 changes: 2 additions & 2 deletions tests/utils/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_has_extension(self):

class TestDecompression:
def test_decompresses_supported_file_formats(self):
for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz"]:
for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz", "zst"]:
tmp_dir = tempfile.mkdtemp()
archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt.{ext}")
decompressed_path = os.path.join(tmp_dir, "test.txt")
Expand All @@ -90,7 +90,7 @@ def test_decompresses_supported_file_formats(self):

@mock.patch.object(io, "is_executable", return_value=False)
def test_decompresses_supported_file_formats_with_lib_as_failover(self, mocked_is_executable):
for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz"]:
for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz", "zst"]:
tmp_dir = tempfile.mkdtemp()
archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt.{ext}")
decompressed_path = os.path.join(tmp_dir, "test.txt")
Expand Down
Loading