From 695f4fae29e98962a8e76274542832f00a3f92cf Mon Sep 17 00:00:00 2001 From: Govind Kamat Date: Mon, 24 Apr 2023 11:27:37 -0700 Subject: [PATCH] Added a feature to increase the size of the data corpus for the http_logs workload. Signed-off-by: Govind Kamat --- http_logs/README.md | 16 +++++++++ http_logs/workload.json | 77 +++++++++++++++++++++++------------------ 2 files changed, 60 insertions(+), 33 deletions(-) diff --git a/http_logs/README.md b/http_logs/README.md index 91344de6..ffb15147 100644 --- a/http_logs/README.md +++ b/http_logs/README.md @@ -49,6 +49,22 @@ node pipeline to run. Valid options are `'baseline'` (default), `'grok'` and `' * `target_throughput` (default: default values for each operation): Number of requests per second, `none` for no limit. * `search_clients`: Number of clients that issues search requests. + +### Beta Feature: Increasing the size of the data corpus + +This workload provides a feature to use a generated data corpus in lieu of the provided corpora files (which currently total ~31 GB). The generated corpus could, for instance, be 100 GB or more. For more details on generating such a corpus, run the following command: + +``` + expand-data-corpus.py -h +``` + +Once a corpus has been generated, it can be used for a test by supplying the following parameter via `--workload-params`: + +* `generated_corpus:t`: Use the generated data corpus instead of the corpora files packaged with this workload + +If there are multiple generated corpora files, they are all used concurrently. Ingestion of the generated and the default corpora are mutually exclusive in any single OSB run. Once ingested, however, queries packaged with this workload will operate on the entire loaded data set. 
+ + ### License Original license text: diff --git a/http_logs/workload.json b/http_logs/workload.json index 30dcfe78..96b7c234 100644 --- a/http_logs/workload.json +++ b/http_logs/workload.json @@ -8,41 +8,52 @@ "description": "HTTP server log data", "#TODO": "Replace index definitions with a template after setting the workload version to 2. Explicit index definitions are not necessary anymore.", "indices": [ - { - "name": "logs-181998", - "body": "{{ index_body }}" - }, - { - "name": "logs-191998", - "body": "{{ index_body }}" - }, - { - "name": "logs-201998", - "body": "{{ index_body }}" - }, - { - "name": "logs-211998", - "body": "{{ index_body }}" - }, - { - "name": "logs-221998", - "body": "{{ index_body }}" - }, - { - "name": "logs-231998", - "body": "{{ index_body }}" - }, - { - "name": "logs-241998", - "body": "{{ index_body }}" - }, - { - "name": "reindexed-logs", - "body": "{{ index_body }}" - } + {%- if generated_corpus is defined %} + {{ benchmark.collect(parts="gen-idx-*.json") }} + {%- else %} + { + "name": "logs-181998", + "body": "{{ index_body }}" + }, + { + "name": "logs-191998", + "body": "{{ index_body }}" + }, + { + "name": "logs-201998", + "body": "{{ index_body }}" + }, + { + "name": "logs-211998", + "body": "{{ index_body }}" + }, + { + "name": "logs-221998", + "body": "{{ index_body }}" + }, + { + "name": "logs-231998", + "body": "{{ index_body }}" + }, + { + "name": "logs-241998", + "body": "{{ index_body }}" + }, + { + "name": "reindexed-logs", + "body": "{{ index_body }}" + } + {%- endif %} ], "corpora": [ - {%- if ingest_pipeline is defined and ingest_pipeline == "grok" %} + {%- if generated_corpus is defined %} + { + "name": "http_logs", + "documents": [ + {{ benchmark.collect(parts="gen-docs-*.json") }} + ] + } + {%- elif ingest_pipeline is defined and ingest_pipeline == "grok" %} { "name": "http_logs_unparsed", "base-url": "https://opensearch-benchmark-workloads.s3.amazonaws.com/corpora/http_logs",