From 39fa2ad546a95d540b846147346b91daac6a8091 Mon Sep 17 00:00:00 2001 From: Grzegorz Banasiak Date: Wed, 17 Jan 2024 11:40:31 +0100 Subject: [PATCH] Simplify test mode with small document sets (#1792) Modifies test mode to use original document set instead of `-1k` variant if it has 1000 or fewer documents. This makes the creation of `-1k` document set files unnecessary for small corpora. --- docs/adding_tracks.rst | 9 +++++-- esrally/track/loader.py | 50 ++++++++++++++++++++++++-------------- tests/track/loader_test.py | 28 ++++++++++++++++++--- 3 files changed, 64 insertions(+), 23 deletions(-) diff --git a/docs/adding_tracks.rst b/docs/adding_tracks.rst index 98d885349..6c23c8f2e 100644 --- a/docs/adding_tracks.rst +++ b/docs/adding_tracks.rst @@ -299,12 +299,17 @@ Congratulations, you have created your first track! You can test it with ``esral Adding support for test mode ---------------------------- -You can check your track very quickly for syntax errors when you invoke Rally with ``--test-mode``. Rally postprocesses its internal track representation as follows: +You can check your track very quickly for syntax errors when you invoke Rally with ``--test-mode``. + +In test mode Rally postprocesses its internal track representation as follows: * Iteration-based tasks run at most one warmup iteration and one measurement iteration. * Time-period-based tasks run at most for 10 seconds without warmup. -Rally also postprocesses all data file names. Instead of ``documents.json``, Rally expects ``documents-1k.json`` and assumes the file contains 1.000 documents. You need to prepare these data files though. Pick 1.000 documents for every data file in your track and store them in a file with the suffix ``-1k``. We choose the first 1.000 with ``head -n 1000 documents.json > documents-1k.json``. +In test mode Rally also postprocesses all data file names: + +* If ``documents.json`` has 1000 documents or fewer, Rally uses it (no modifications). 
+* If ``documents.json`` has more than 1000 documents, Rally assumes an additional ``documents-1k.json`` file is present and uses it. You need to prepare these additional files manually. Pick 1000 documents for every data file in your track and store them in a file with the ``-1k`` suffix. On Linux you can do it as follows: ``head -n 1000 documents.json > documents-1k.json``. Challenges ---------- diff --git a/esrally/track/loader.py b/esrally/track/loader.py index 8da4ffb7b..56a299d13 100644 --- a/esrally/track/loader.py +++ b/esrally/track/loader.py @@ -965,30 +965,44 @@ def on_after_load_track(self, track): return track self.logger.info("Preparing track [%s] for test mode.", str(track)) for corpus in track.corpora: - if self.logger.isEnabledFor(logging.DEBUG): - self.logger.debug("Reducing corpus size to 1000 documents for [%s]", corpus.name) for document_set in corpus.documents: # TODO #341: Should we allow this for snapshots too? if document_set.is_bulk: - document_set.number_of_documents = 1000 + if document_set.number_of_documents > 1000: + if self.logger.isEnabledFor(logging.DEBUG): + self.logger.debug( + "Reducing corpus size to 1000 documents in corpus [%s], uncompressed source file [%s]", + corpus.name, + document_set.document_file, + ) - if document_set.has_compressed_corpus(): - path, ext = io.splitext(document_set.document_archive) - path_2, ext_2 = io.splitext(path) + document_set.number_of_documents = 1000 - document_set.document_archive = f"{path_2}-1k{ext_2}{ext}" - document_set.document_file = f"{path_2}-1k{ext_2}" - elif document_set.has_uncompressed_corpus(): - path, ext = io.splitext(document_set.document_file) - document_set.document_file = f"{path}-1k{ext}" - else: - raise exceptions.RallyAssertionError( - f"Document corpus [{corpus.name}] has neither compressed nor uncompressed corpus." 
- ) + if document_set.has_compressed_corpus(): + path, ext = io.splitext(document_set.document_archive) + path_2, ext_2 = io.splitext(path) - # we don't want to check sizes - document_set.compressed_size_in_bytes = None - document_set.uncompressed_size_in_bytes = None + document_set.document_archive = f"{path_2}-1k{ext_2}{ext}" + document_set.document_file = f"{path_2}-1k{ext_2}" + elif document_set.has_uncompressed_corpus(): + path, ext = io.splitext(document_set.document_file) + document_set.document_file = f"{path}-1k{ext}" + else: + raise exceptions.RallyAssertionError( + f"Document corpus [{corpus.name}] has neither compressed nor uncompressed corpus." + ) + + # we don't want to check sizes + document_set.compressed_size_in_bytes = None + document_set.uncompressed_size_in_bytes = None + else: + if self.logger.isEnabledFor(logging.DEBUG): + self.logger.debug( + "Maintaining existing size of %d documents in corpus [%s], uncompressed source file [%s]", + document_set.number_of_documents, + corpus.name, + document_set.document_file, + ) for challenge in track.challenges: for task in challenge.schedule: diff --git a/tests/track/loader_test.py b/tests/track/loader_test.py index fe01de437..20b9f64a3 100644 --- a/tests/track/loader_test.py +++ b/tests/track/loader_test.py @@ -1259,7 +1259,18 @@ def test_post_processes_track_spec(self): ], "corpora": [ { - "name": "unittest", + "name": "unittest-reduce-to-1k-documents", + "documents": [ + { + "source-file": "documents.json.bz2", + "document-count": 1001, + "compressed-bytes": 100, + "uncompressed-bytes": 10000, + } + ], + }, + { + "name": "unittest-keep-less-than-1k-documents", "documents": [ { "source-file": "documents.json.bz2", @@ -1268,7 +1279,7 @@ def test_post_processes_track_spec(self): "uncompressed-bytes": 10000, } ], - } + }, ], "operations": [ { @@ -1335,11 +1346,22 @@ def test_post_processes_track_spec(self): ], "corpora": [ { - "name": "unittest", + "name": "unittest-reduce-to-1k-documents", "documents": [ 
{"source-file": "documents-1k.json.bz2", "document-count": 1000}, ], }, + { + "name": "unittest-keep-less-than-1k-documents", + "documents": [ + { + "source-file": "documents.json.bz2", + "document-count": 10, + "compressed-bytes": 100, + "uncompressed-bytes": 10000, + }, + ], + }, ], "operations": [ {