Skip to content

Commit

Permalink
Simplify test mode with small document sets (#1792)
Browse files Browse the repository at this point in the history
Modifies test mode to use the original document set instead of the `-1k`
variant if it has 1000 or fewer documents. This makes the creation of `-1k`
document set files unnecessary for small corpora.
  • Loading branch information
gbanasiak authored Jan 17, 2024
1 parent 3833d53 commit 39fa2ad
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 23 deletions.
9 changes: 7 additions & 2 deletions docs/adding_tracks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -299,12 +299,17 @@ Congratulations, you have created your first track! You can test it with ``esral
Adding support for test mode
----------------------------

You can check your track very quickly for syntax errors when you invoke Rally with ``--test-mode``. Rally postprocesses its internal track representation as follows:
You can check your track very quickly for syntax errors when you invoke Rally with ``--test-mode``.

In test mode Rally postprocesses its internal track representation as follows:

* Iteration-based tasks run at most one warmup iteration and one measurement iteration.
* Time-period-based tasks run at most for 10 seconds without warmup.

Rally also postprocesses all data file names. Instead of ``documents.json``, Rally expects ``documents-1k.json`` and assumes the file contains 1,000 documents. You need to prepare these data files though. Pick 1,000 documents for every data file in your track and store them in a file with the suffix ``-1k``. For example, pick the first 1,000 documents with ``head -n 1000 documents.json > documents-1k.json``.
In test mode Rally also postprocesses all data file names:

* If ``documents.json`` has 1000 documents or fewer, Rally uses it (no modifications).
* If ``documents.json`` has more than 1000 documents, Rally assumes an additional ``documents-1k.json`` file is present and uses it. You need to prepare these additional files manually. Pick 1000 documents for every data file in your track and store them in a file with the ``-1k`` suffix. On Linux you can do it as follows: ``head -n 1000 documents.json > documents-1k.json``.

Challenges
----------
Expand Down
50 changes: 32 additions & 18 deletions esrally/track/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,30 +965,44 @@ def on_after_load_track(self, track):
return track
self.logger.info("Preparing track [%s] for test mode.", str(track))
for corpus in track.corpora:
if self.logger.isEnabledFor(logging.DEBUG):
self.logger.debug("Reducing corpus size to 1000 documents for [%s]", corpus.name)
for document_set in corpus.documents:
# TODO #341: Should we allow this for snapshots too?
if document_set.is_bulk:
document_set.number_of_documents = 1000
if document_set.number_of_documents > 1000:
if self.logger.isEnabledFor(logging.DEBUG):
self.logger.debug(
"Reducing corpus size to 1000 documents in corpus [%s], uncompressed source file [%s]",
corpus.name,
document_set.document_file,
)

if document_set.has_compressed_corpus():
path, ext = io.splitext(document_set.document_archive)
path_2, ext_2 = io.splitext(path)
document_set.number_of_documents = 1000

document_set.document_archive = f"{path_2}-1k{ext_2}{ext}"
document_set.document_file = f"{path_2}-1k{ext_2}"
elif document_set.has_uncompressed_corpus():
path, ext = io.splitext(document_set.document_file)
document_set.document_file = f"{path}-1k{ext}"
else:
raise exceptions.RallyAssertionError(
f"Document corpus [{corpus.name}] has neither compressed nor uncompressed corpus."
)
if document_set.has_compressed_corpus():
path, ext = io.splitext(document_set.document_archive)
path_2, ext_2 = io.splitext(path)

# we don't want to check sizes
document_set.compressed_size_in_bytes = None
document_set.uncompressed_size_in_bytes = None
document_set.document_archive = f"{path_2}-1k{ext_2}{ext}"
document_set.document_file = f"{path_2}-1k{ext_2}"
elif document_set.has_uncompressed_corpus():
path, ext = io.splitext(document_set.document_file)
document_set.document_file = f"{path}-1k{ext}"
else:
raise exceptions.RallyAssertionError(
f"Document corpus [{corpus.name}] has neither compressed nor uncompressed corpus."
)

# we don't want to check sizes
document_set.compressed_size_in_bytes = None
document_set.uncompressed_size_in_bytes = None
else:
if self.logger.isEnabledFor(logging.DEBUG):
self.logger.debug(
"Maintaining existing size of %d documents in corpus [%s], uncompressed source file [%s]",
document_set.number_of_documents,
corpus.name,
document_set.document_file,
)

for challenge in track.challenges:
for task in challenge.schedule:
Expand Down
28 changes: 25 additions & 3 deletions tests/track/loader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1259,7 +1259,18 @@ def test_post_processes_track_spec(self):
],
"corpora": [
{
"name": "unittest",
"name": "unittest-reduce-to-1k-documents",
"documents": [
{
"source-file": "documents.json.bz2",
"document-count": 1001,
"compressed-bytes": 100,
"uncompressed-bytes": 10000,
}
],
},
{
"name": "unittest-keep-less-than-1k-documents",
"documents": [
{
"source-file": "documents.json.bz2",
Expand All @@ -1268,7 +1279,7 @@ def test_post_processes_track_spec(self):
"uncompressed-bytes": 10000,
}
],
}
},
],
"operations": [
{
Expand Down Expand Up @@ -1335,11 +1346,22 @@ def test_post_processes_track_spec(self):
],
"corpora": [
{
"name": "unittest",
"name": "unittest-reduce-to-1k-documents",
"documents": [
{"source-file": "documents-1k.json.bz2", "document-count": 1000},
],
},
{
"name": "unittest-keep-less-than-1k-documents",
"documents": [
{
"source-file": "documents.json.bz2",
"document-count": 10,
"compressed-bytes": 100,
"uncompressed-bytes": 10000,
},
],
},
],
"operations": [
{
Expand Down

0 comments on commit 39fa2ad

Please sign in to comment.