Skip to content

Commit

Permalink
Add support for zst-compression (#1786)
Browse files Browse the repository at this point in the history
With this commit we add support for zstd compressed corpora. Compared to
bzip, the zstd format produces compressed files that are roughly 40%
smaller and took around a third of the time to decompress in our tests.

Closes #1781
  • Loading branch information
danielmitterdorfer authored Sep 27, 2023
1 parent ba701e5 commit a230317
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 9 deletions.
1 change: 1 addition & 0 deletions create-notice.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ function main {
add_license "ijson" "https://raw.githubusercontent.com/ICRAR/ijson/master/LICENSE.txt"
add_license "google-resumable-media" "https://raw.githubusercontent.com/googleapis/google-resumable-media-python/main/LICENSE"
add_license "google-auth" "https://raw.githubusercontent.com/googleapis/google-auth-library-python/main/LICENSE"
add_license "zstandard" "https://github.com/indygreg/python-zstandard/blob/main/LICENSE"

# transitive dependencies
# Jinja2 dependencies
Expand Down
5 changes: 3 additions & 2 deletions docs/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,12 @@ In all other cases, Rally requires ``git 1.9`` or better. Verify with ``git --ve

``git`` is already installed on macOS.

pbzip2
~~~~~~
pbzip2, pigz, zstd
~~~~~~~~~~~~~~~~~~

It is strongly recommended to install ``pbzip2`` to speed up decompressing the corpora of Rally `standard tracks <https://github.com/elastic/rally-tracks>`_.
If you have created :doc:`custom tracks </adding_tracks>` using corpora compressed with ``gzip`` instead of ``bzip2``, it's also advisable to install ``pigz`` to speed up the process.
Rally also supports ``zst`` compressed corpora out of the box, installing ``zstd`` speeds up the process.

**Debian / Ubuntu**

Expand Down
2 changes: 1 addition & 1 deletion docs/track.rst
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ Each entry in the ``documents`` list consists of the following properties:
* S3 according to `docs <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#configuration>`_.
* Google Storage: Either using `client library authentication <https://cloud.google.com/storage/docs/reference/libraries#setting_up_authentication>`_ or by presenting an `oauth2 token <https://cloud.google.com/storage/docs/authentication>`_ via the ``GOOGLE_AUTH_TOKEN`` environment variable, typically done using: ``export GOOGLE_AUTH_TOKEN=$(gcloud auth print-access-token)``.
* ``source-format`` (optional, default: ``bulk``): Defines in which format Rally should interpret the data file specified by ``source-file``. Currently, only ``bulk`` is supported.
* ``source-file`` (mandatory): File name of the corresponding documents. For local use, this file can be a ``.json`` file. If you provide a ``base-url`` we recommend that you provide a compressed file here. The following extensions are supported: ``.zip``, ``.bz2``, ``.gz``, ``.tar``, ``.tar.gz``, ``.tgz`` or ``.tar.bz2``. It must contain exactly one JSON file with the same name. The preferred file extension for our official tracks is ``.bz2``.
* ``source-file`` (mandatory): File name of the corresponding documents. For local use, this file can be a ``.json`` file. If you provide a ``base-url`` we recommend that you provide a compressed file here. The following extensions are supported: ``.zip``, ``.bz2``, ``.gz``, ``.tar``, ``.tar.gz``, ``.tgz``, ``.tar.bz2`` or ``zst``. It must contain exactly one JSON file with the same name. The preferred file extension for our official tracks is ``.bz2``.
* ``includes-action-and-meta-data`` (optional, defaults to ``false``): Defines whether the documents file contains already an `action and meta-data <https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html#docs-bulk-api-desc>`_ line (``true``) or only documents (``false``).

.. note::
Expand Down
29 changes: 28 additions & 1 deletion esrally/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,12 @@
import tarfile
import zipfile

import zstandard

from esrally.utils import console

SUPPORTED_ARCHIVE_FORMATS = [".zip", ".bz2", ".gz", ".tar", ".tar.gz", ".tgz", ".tar.bz2", ".zst"]


class FileSource:
"""
Expand Down Expand Up @@ -218,6 +222,24 @@ def __str__(self, *args, **kwargs):
return "StringAsFileSource"


class ZstAdapter:
"""
Adapter class to make the zstandard API work with Rally's decompression abstractions
"""

def __init__(self, path):
self.fh = open(path, "rb")
self.dctx = zstandard.ZstdDecompressor()
self.reader = self.dctx.stream_reader(self.fh)

def read(self, size):
return self.reader.read(size)

def close(self):
self.reader.close()
self.fh.close()


def ensure_dir(directory, mode=0o777):
"""
Ensure that the provided directory and all of its parent directories exist.
Expand Down Expand Up @@ -245,7 +267,7 @@ def is_archive(name):
:return: True iff the given file name is an archive that is also recognized for decompression by Rally.
"""
_, ext = splitext(name)
return ext in [".zip", ".bz2", ".gz", ".tar", ".tar.gz", ".tgz", ".tar.bz2"]
return ext in SUPPORTED_ARCHIVE_FORMATS


def is_executable(name):
Expand Down Expand Up @@ -279,6 +301,7 @@ def decompress(zip_name, target_directory):
* tar.gz
* tgz
* tar.bz2
* zst
The decompression method is chosen based on the file extension.
Expand All @@ -293,6 +316,10 @@ def decompress(zip_name, target_directory):
decompressor_args = ["pbzip2", "-d", "-k", "-m10000", "-c"]
decompressor_lib = bz2.open
_do_decompress_manually(target_directory, zip_name, decompressor_args, decompressor_lib)
elif extension == ".zst":
decompressor_args = ["pzstd", "-f", "-d", "-c"]
decompressor_lib = ZstAdapter
_do_decompress_manually(target_directory, zip_name, decompressor_args, decompressor_lib)
elif extension == ".gz":
decompressor_args = ["pigz", "-d", "-k", "-c"]
decompressor_lib = gzip.open
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ dependencies = [
"google-resumable-media[requests]==1.1.0",
# License: Apache 2.0
"google-auth==1.22.1",
# License: BSD
"zstandard==0.21.0"
]

[project.optional-dependencies]
Expand Down
11 changes: 6 additions & 5 deletions tests/utils/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def test_normalize_path(self):
def test_archive(self):
assert io.is_archive("/tmp/some-archive.tar.gz")
assert io.is_archive("/tmp/some-archive.tgz")
assert io.is_archive("/tmp/some-archive.zst")
# Rally does not recognize .7z
assert not io.is_archive("/tmp/some-archive.7z")
assert not io.is_archive("/tmp/some.log")
Expand All @@ -68,9 +69,9 @@ def test_has_extension(self):

class TestDecompression:
def test_decompresses_supported_file_formats(self):
for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz"]:
for ext in io.SUPPORTED_ARCHIVE_FORMATS:
tmp_dir = tempfile.mkdtemp()
archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt.{ext}")
archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt{ext}")
decompressed_path = os.path.join(tmp_dir, "test.txt")

io.decompress(archive_path, target_directory=tmp_dir)
Expand All @@ -84,9 +85,9 @@ def test_decompresses_supported_file_formats(self):

@mock.patch.object(io, "is_executable", return_value=False)
def test_decompresses_supported_file_formats_with_lib_as_failover(self, mocked_is_executable):
for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz"]:
for ext in io.SUPPORTED_ARCHIVE_FORMATS:
tmp_dir = tempfile.mkdtemp()
archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt.{ext}")
archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt{ext}")
decompressed_path = os.path.join(tmp_dir, "test.txt")

logger = logging.getLogger("esrally.utils.io")
Expand All @@ -100,7 +101,7 @@ def test_decompresses_supported_file_formats_with_lib_as_failover(self, mocked_i
self.read(decompressed_path) == "Sample text for DecompressionTests\n"
), f"Could not decompress [{archive_path}] to [{decompressed_path}] (target file is corrupt)"

if ext in ["bz2", "gz"]:
if ext in ["bz2", "gz", "zst"]:
assert "not found in PATH. Using standard library, decompression will take longer." in mocked_console_warn.call_args[0][0]

@mock.patch("subprocess.run")
Expand Down
Binary file added tests/utils/resources/test.txt.zst
Binary file not shown.

0 comments on commit a230317

Please sign in to comment.