diff --git a/create-notice.sh b/create-notice.sh index 8b0df3b46..b74ec17f3 100755 --- a/create-notice.sh +++ b/create-notice.sh @@ -54,6 +54,7 @@ function main { add_license "ijson" "https://raw.githubusercontent.com/ICRAR/ijson/master/LICENSE.txt" add_license "google-resumable-media" "https://raw.githubusercontent.com/googleapis/google-resumable-media-python/main/LICENSE" add_license "google-auth" "https://raw.githubusercontent.com/googleapis/google-auth-library-python/main/LICENSE" + add_license "zstandard" "https://raw.githubusercontent.com/indygreg/python-zstandard/main/LICENSE" # transitive dependencies # Jinja2 dependencies diff --git a/docs/install.rst b/docs/install.rst index fe4d79eb5..b79da408a 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -68,11 +68,12 @@ In all other cases, Rally requires ``git 1.9`` or better. Verify with ``git --ve ``git`` is already installed on macOS. -pbzip2 -~~~~~~ +pbzip2, pigz, zstd +~~~~~~~~~~~~~~~~~~ It is strongly recommended to install ``pbzip2`` to speed up decompressing the corpora of Rally `standard tracks `_. If you have created :doc:`custom tracks ` using corpora compressed with ``gzip`` instead of ``bzip2``, it's also advisable to install ``pigz`` to speed up the process. +Rally also supports ``zst`` compressed corpora out of the box; installing ``zstd`` speeds up the process. **Debian / Ubuntu** diff --git a/docs/track.rst b/docs/track.rst index ef307fee9..49e631093 100644 --- a/docs/track.rst +++ b/docs/track.rst @@ -343,7 +343,7 @@ Each entry in the ``documents`` list consists of the following properties: * S3 according to `docs `_. * Google Storage: Either using `client library authentication `_ or by presenting an `oauth2 token `_ via the ``GOOGLE_AUTH_TOKEN`` environment variable, typically done using: ``export GOOGLE_AUTH_TOKEN=$(gcloud auth print-access-token)``. * ``source-format`` (optional, default: ``bulk``): Defines in which format Rally should interpret the data file specified by ``source-file``. 
Currently, only ``bulk`` is supported. -* ``source-file`` (mandatory): File name of the corresponding documents. For local use, this file can be a ``.json`` file. If you provide a ``base-url`` we recommend that you provide a compressed file here. The following extensions are supported: ``.zip``, ``.bz2``, ``.gz``, ``.tar``, ``.tar.gz``, ``.tgz`` or ``.tar.bz2``. It must contain exactly one JSON file with the same name. The preferred file extension for our official tracks is ``.bz2``. +* ``source-file`` (mandatory): File name of the corresponding documents. For local use, this file can be a ``.json`` file. If you provide a ``base-url`` we recommend that you provide a compressed file here. The following extensions are supported: ``.zip``, ``.bz2``, ``.gz``, ``.tar``, ``.tar.gz``, ``.tgz``, ``.tar.bz2`` or ``.zst``. It must contain exactly one JSON file with the same name. The preferred file extension for our official tracks is ``.bz2``. * ``includes-action-and-meta-data`` (optional, defaults to ``false``): Defines whether the documents file contains already an `action and meta-data `_ line (``true``) or only documents (``false``). .. 
note:: diff --git a/esrally/utils/io.py b/esrally/utils/io.py index be8350217..7452e2b71 100644 --- a/esrally/utils/io.py +++ b/esrally/utils/io.py @@ -27,8 +27,12 @@ import tarfile import zipfile +import zstandard + from esrally.utils import console +SUPPORTED_ARCHIVE_FORMATS = [".zip", ".bz2", ".gz", ".tar", ".tar.gz", ".tgz", ".tar.bz2", ".zst"] + class FileSource: """ @@ -218,6 +222,24 @@ def __str__(self, *args, **kwargs): return "StringAsFileSource" +class ZstAdapter: + """ + Adapter class to make the zstandard API work with Rally's decompression abstractions + """ + + def __init__(self, path): + self.fh = open(path, "rb") + self.dctx = zstandard.ZstdDecompressor() + self.reader = self.dctx.stream_reader(self.fh, read_across_frames=True)  # read every frame; pzstd output is multi-frame + + def read(self, size): + return self.reader.read(size) + + def close(self): + self.reader.close() + self.fh.close() + + def ensure_dir(directory, mode=0o777): """ Ensure that the provided directory and all of its parent directories exist. @@ -245,7 +267,7 @@ def is_archive(name): :return: True iff the given file name is an archive that is also recognized for decompression by Rally. """ _, ext = splitext(name) - return ext in [".zip", ".bz2", ".gz", ".tar", ".tar.gz", ".tgz", ".tar.bz2"] + return ext in SUPPORTED_ARCHIVE_FORMATS def is_executable(name): @@ -279,6 +301,7 @@ def decompress(zip_name, target_directory): * tar.gz * tgz * tar.bz2 + * zst The decompression method is chosen based on the file extension. 
@@ -293,6 +316,10 @@ def decompress(zip_name, target_directory): decompressor_args = ["pbzip2", "-d", "-k", "-m10000", "-c"] decompressor_lib = bz2.open _do_decompress_manually(target_directory, zip_name, decompressor_args, decompressor_lib) + elif extension == ".zst": + decompressor_args = ["pzstd", "-f", "-d", "-c"] + decompressor_lib = ZstAdapter + _do_decompress_manually(target_directory, zip_name, decompressor_args, decompressor_lib) elif extension == ".gz": decompressor_args = ["pigz", "-d", "-k", "-c"] decompressor_lib = gzip.open diff --git a/pyproject.toml b/pyproject.toml index 9a277d933..ab6f91915 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,8 @@ dependencies = [ "google-resumable-media[requests]==1.1.0", # License: Apache 2.0 "google-auth==1.22.1", + # License: BSD + "zstandard==0.21.0" ] [project.optional-dependencies] diff --git a/tests/utils/io_test.py b/tests/utils/io_test.py index 91879f61f..e6a48f8ef 100644 --- a/tests/utils/io_test.py +++ b/tests/utils/io_test.py @@ -53,6 +53,7 @@ def test_normalize_path(self): def test_archive(self): assert io.is_archive("/tmp/some-archive.tar.gz") assert io.is_archive("/tmp/some-archive.tgz") + assert io.is_archive("/tmp/some-archive.zst") # Rally does not recognize .7z assert not io.is_archive("/tmp/some-archive.7z") assert not io.is_archive("/tmp/some.log") @@ -68,9 +69,9 @@ def test_has_extension(self): class TestDecompression: def test_decompresses_supported_file_formats(self): - for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz"]: + for ext in io.SUPPORTED_ARCHIVE_FORMATS: tmp_dir = tempfile.mkdtemp() - archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt.{ext}") + archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt{ext}") decompressed_path = os.path.join(tmp_dir, "test.txt") io.decompress(archive_path, target_directory=tmp_dir) @@ -84,9 +85,9 @@ def 
test_decompresses_supported_file_formats(self): @mock.patch.object(io, "is_executable", return_value=False) def test_decompresses_supported_file_formats_with_lib_as_failover(self, mocked_is_executable): - for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz"]: + for ext in io.SUPPORTED_ARCHIVE_FORMATS: tmp_dir = tempfile.mkdtemp() - archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt.{ext}") + archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt{ext}") decompressed_path = os.path.join(tmp_dir, "test.txt") logger = logging.getLogger("esrally.utils.io") @@ -100,7 +101,7 @@ def test_decompresses_supported_file_formats_with_lib_as_failover(self, mocked_i self.read(decompressed_path) == "Sample text for DecompressionTests\n" ), f"Could not decompress [{archive_path}] to [{decompressed_path}] (target file is corrupt)" - if ext in ["bz2", "gz"]: + if ext in [".bz2", ".gz", ".zst"]: assert "not found in PATH. Using standard library, decompression will take longer." in mocked_console_warn.call_args[0][0] @mock.patch("subprocess.run") diff --git a/tests/utils/resources/test.txt.zst b/tests/utils/resources/test.txt.zst new file mode 100644 index 000000000..40900f41c Binary files /dev/null and b/tests/utils/resources/test.txt.zst differ