diff --git a/soperator/.gitignore b/soperator/.gitignore index e79eb231..a2de300f 100644 --- a/soperator/.gitignore +++ b/soperator/.gitignore @@ -1 +1,2 @@ .terraform* +installations/alexkim \ No newline at end of file diff --git a/soperator/README.md b/soperator/README.md index d4245ebf..4f49e3bb 100644 --- a/soperator/README.md +++ b/soperator/README.md @@ -19,7 +19,6 @@ Before starting, ensure you have these tools installed: - [Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) - [Nebius CLI ](https://nebius.com/docs/cli/quickstart) - [kubectl](https://kubernetes.io/docs/tasks/tools/) -- AWS CLI: `python -m pip install awscli` - [jq](https://jqlang.github.io/jq/download/) - coreutils: - macOS: `brew install coreutils` @@ -31,18 +30,17 @@ Before starting, ensure you have these tools installed: 1. **Create Your Installation Directory** ```bash -mkdir -p installations/ -cd installations/ +export INSTALLATION_NAME= # e.g. customer name +mkdir -p installations/$INSTALLATION_NAME +cd installations/$INSTALLATION_NAME cp -r ../example/ ./ ``` 2. **Set Up Your Environment** -```bash -# Set your Nebius project details -export NEBIUS_TENANT_ID='' -export NEBIUS_PROJECT_ID='' -# Load environment variables +Set your NEBIUS_TENANT_ID and NEBIUS_PROJECT_ID in the `.envrc` file, then run: + +```bash source .envrc ``` @@ -166,4 +164,4 @@ tail -f outputs/nccl.out # Container test sbatch enroot.sh tail -f outputs/enroot.out -``` \ No newline at end of file +``` diff --git a/soperator/VERSION b/soperator/VERSION index d3243490..15b989e3 100644 --- a/soperator/VERSION +++ b/soperator/VERSION @@ -1 +1 @@ -1.15.5 +1.16.0 diff --git a/soperator/installations/example/.envrc b/soperator/installations/example/.envrc index f78439fa..a5e78248 100644 --- a/soperator/installations/example/.envrc +++ b/soperator/installations/example/.envrc @@ -1,11 +1,14 @@ +NEBIUS_TENANT_ID='tenant-...' +NEBIUS_PROJECT_ID='project-...' 
+ if [ -z "${NEBIUS_TENANT_ID}" ]; then - echo "Error: NEBIUS_TENANT_ID is not set" - return 1 + echo "Error: NEBIUS_TENANT_ID is not set" + return 1 fi if [ -z "${NEBIUS_PROJECT_ID}" ]; then - echo "Error: NEBIUS_PROJECT_ID is not set" - return 1 + echo "Error: NEBIUS_PROJECT_ID is not set" + return 1 fi # Separate declaration and assignment for IAM token @@ -14,9 +17,6 @@ nebius iam whoami > /dev/null nebius iam get-access-token > /dev/null NEBIUS_IAM_TOKEN=$(nebius iam get-access-token) export NEBIUS_IAM_TOKEN -export TF_VAR_iam_token="${NEBIUS_IAM_TOKEN}" -export TF_VAR_iam_tenant_id="${NEBIUS_TENANT_ID}" -export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}" # Separate declaration and assignment for VPC subnet NEBIUS_VPC_SUBNET_ID=$(nebius vpc subnet list \ @@ -24,14 +24,19 @@ NEBIUS_VPC_SUBNET_ID=$(nebius vpc subnet list \ --format json \ | jq -r '.items[0].metadata.id') export NEBIUS_VPC_SUBNET_ID + +# Export Nebius Cloud metadata to Terraform variables +export TF_VAR_iam_token="${NEBIUS_IAM_TOKEN}" +export TF_VAR_iam_tenant_id="${NEBIUS_TENANT_ID}" +export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}" export TF_VAR_vpc_subnet_id="${NEBIUS_VPC_SUBNET_ID}" # Separate declaration and assignment for group editors NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \ - --parent-id "${NEBIUS_TENANT_ID}" \ - --name 'editors' \ - --format json \ - | jq -r '.metadata.id') + --parent-id "${NEBIUS_TENANT_ID}" \ + --name 'editors' \ + --format json \ + | jq -r '.metadata.id') # Separate declaration and assignment for service account NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account list \ @@ -52,35 +57,33 @@ fi # Check if service account is already a member of editors group IS_MEMBER=$(nebius iam group-membership list-members \ - --parent-id "${NEBIUS_GROUP_EDITORS_ID}" \ - --format json \ - | jq -r --arg SAID "${NEBIUS_SA_TERRAFORM_ID}" '.memberships[] | select(.spec.member_id == $SAID) | .spec.member_id') + --parent-id "${NEBIUS_GROUP_EDITORS_ID}" \ + --page-size 
1000 \ + --format json \ + | jq -r --arg SAID "${NEBIUS_SA_TERRAFORM_ID}" '.memberships[] | select(.spec.member_id == $SAID) | .spec.member_id') + # Add service account to group editors only if not already a member if [ -z "${IS_MEMBER}" ]; then - nebius iam group-membership create \ - --parent-id "${NEBIUS_GROUP_EDITORS_ID}" \ - --member-id "${NEBIUS_SA_TERRAFORM_ID}" - echo "Added service account to editors group" + nebius iam group-membership create \ + --parent-id "${NEBIUS_GROUP_EDITORS_ID}" \ + --member-id "${NEBIUS_SA_TERRAFORM_ID}" + echo "Added service account to editors group" else - echo "Service account is already a member of editors group" + echo "Service account is already a member of editors group" fi # Separate declaration and assignment for access key -NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key list \ +echo 'Creating new access key for Object Storage' +NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key create \ --parent-id "${NEBIUS_PROJECT_ID}" \ + --name "slurm-tf-ak-$(date +%s)" \ + --account-service-account-id "${NEBIUS_SA_TERRAFORM_ID}" \ + --description 'Temporary S3 Access' \ + --expires-at "$(date -u -v+1d '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || date -u -d '+1 day' '+%Y-%m-%dT%H:%M:%SZ')" \ --format json \ - | jq -r '.items // [] | map(select(.metadata.name == "slurm-terraform-sa-access-key")) | .[0].metadata.id // empty') - -if [ -z "${NEBIUS_SA_ACCESS_KEY_ID}" ]; then - NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key create \ - --parent-id "${NEBIUS_PROJECT_ID}" \ - --name 'slurm-terraform-sa-access-key' \ - --account-service-account-id "${NEBIUS_SA_TERRAFORM_ID}" \ - --description 'AWS CLI key' \ - --format json \ - | jq -r '.resource_id') -fi + | jq -r '.resource_id') +echo "Created new access key: ${NEBIUS_SA_ACCESS_KEY_ID}" # Separate declaration and assignment for AWS access key AWS_ACCESS_KEY_ID=$(nebius iam access-key get-by-id \ @@ -88,9 +91,11 @@ AWS_ACCESS_KEY_ID=$(nebius iam access-key get-by-id \ --format json | jq -r 
'.status.aws_access_key_id') export AWS_ACCESS_KEY_ID -if [ -f 
'.aws_secret_access_key' ] && [ -s '.aws_secret_access_key' ]; then +if [ -f '.aws_secret_access_key' ] && [ -n "$(tr -d '[:space:]' < '.aws_secret_access_key')" ]; then + echo "Using existing AWS_SECRET_ACCESS_KEY from .aws_secret_access_key" AWS_SECRET_ACCESS_KEY="$(cat '.aws_secret_access_key')" else + echo "Generating new AWS_SECRET_ACCESS_KEY" AWS_SECRET_ACCESS_KEY="$(nebius iam access-key get-secret-once \ --id "${NEBIUS_SA_ACCESS_KEY_ID}" \ --format json \ @@ -99,32 +104,31 @@ else fi if [ -z "${AWS_SECRET_ACCESS_KEY}" ]; then - echo "Error: AWS_SECRET_ACCESS_KEY is empty" >&2 - exit 1 + echo "!!! ERROR: AWS_SECRET_ACCESS_KEY is empty !!!" >&2 + echo " To generate a new key, first delete the existing key:" >&2 + echo " nebius iam access-key delete --id ${NEBIUS_SA_ACCESS_KEY_ID}" >&2 + echo " Then rerun 'source .envrc' again." >&2 + return 1 fi export AWS_SECRET_ACCESS_KEY -aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}" -aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}" -aws configure set region 'eu-north1' -aws configure set endpoint_url 'https://storage.eu-north1.nebius.cloud:443' - -export NEBIUS_BUCKET_NAME="tfstate-slurm-k8s-$(echo -n "${NEBIUS_TENANT_ID}-${NEBIUS_PROJECT_ID}" | md5sum | awk '$0=$1')" +NEBIUS_BUCKET_NAME="tfstate-slurm-k8s-$(echo -n "${NEBIUS_TENANT_ID}-${NEBIUS_PROJECT_ID}" | md5sum | awk '$0=$1')" +export NEBIUS_BUCKET_NAME # Check if bucket exists EXISTING_BUCKET=$(nebius storage bucket list \ - --parent-id "${NEBIUS_PROJECT_ID}" \ - --format json \ - | jq -r --arg BUCKET "${NEBIUS_BUCKET_NAME}" '.items[] | select(.metadata.name == $BUCKET) | .metadata.name') + --parent-id "${NEBIUS_PROJECT_ID}" \ + --format json \ + | jq -r --arg BUCKET "${NEBIUS_BUCKET_NAME}" '.items[] | select(.metadata.name == $BUCKET) | .metadata.name') if [ -z "${EXISTING_BUCKET}" ]; then - nebius storage bucket create \ - --name "${NEBIUS_BUCKET_NAME}" \ - --parent-id "${NEBIUS_PROJECT_ID}" \ - --versioning-policy 'enabled' - echo "Created 
bucket: ${NEBIUS_BUCKET_NAME}" + nebius 
storage bucket create \ + --name "${NEBIUS_BUCKET_NAME}" \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --versioning-policy 'enabled' + echo "Created bucket: ${NEBIUS_BUCKET_NAME}" else - echo "Using existing bucket: ${NEBIUS_BUCKET_NAME}" + echo "Using existing bucket: ${NEBIUS_BUCKET_NAME}" fi export TFE_PARALLELISM=20 diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf index 59c57207..7637d89d 100644 --- a/soperator/installations/example/main.tf +++ b/soperator/installations/example/main.tf @@ -190,30 +190,30 @@ module "slurm" { system = { cpu_cores = local.resources.system.cpu_cores memory_gibibytes = local.resources.system.memory_gibibytes - ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_system.boot_disk.size_gibibytes / 2) + ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_system.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes) } controller = { cpu_cores = local.resources.controller.cpu_cores memory_gibibytes = local.resources.controller.memory_gibibytes - ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_controller.boot_disk.size_gibibytes / 2) + ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_controller.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes) } worker = [for i, worker in var.slurm_nodeset_workers : { cpu_cores = local.resources.workers[i].cpu_cores memory_gibibytes = local.resources.workers[i].memory_gibibytes - ephemeral_storage_gibibytes = ceil(worker.boot_disk.size_gibibytes / 2) + ephemeral_storage_gibibytes = ceil(worker.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes) gpus = local.resources.workers[i].gpus } ] login = { cpu_cores = local.resources.login.cpu_cores memory_gibibytes = local.resources.login.memory_gibibytes - ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_login.boot_disk.size_gibibytes / 2) + ephemeral_storage_gibibytes = 
ceil(var.slurm_nodeset_login.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes) } accounting = var.accounting_enabled ? { cpu_cores = local.resources.accounting.cpu_cores memory_gibibytes = local.resources.accounting.memory_gibibytes - ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_accounting.boot_disk.size_gibibytes / 2) + ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_accounting.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes) } : null } diff --git a/soperator/installations/example/terraform.tf b/soperator/installations/example/terraform.tf index 014e6175..011b7419 100644 --- a/soperator/installations/example/terraform.tf +++ b/soperator/installations/example/terraform.tf @@ -4,7 +4,7 @@ terraform { required_providers { nebius = { source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" - version = "0.4.4" + version = ">=0.4" } units = { diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index 2915f083..234fb12a 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -140,7 +140,7 @@ slurm_cluster_name = "soperator" # Version of soperator. # --- -slurm_operator_version = "1.15.5" +slurm_operator_version = "1.16.0" # Type of the Slurm partition config. Could be either `default` or `custom`. # By default, "default". 
diff --git a/soperator/modules/available_resources/main.tf b/soperator/modules/available_resources/main.tf index b432fde1..cd60bbf8 100644 --- a/soperator/modules/available_resources/main.tf +++ b/soperator/modules/available_resources/main.tf @@ -249,3 +249,7 @@ locals { }) }) } + +data "units_data_size" "k8s_ephemeral_storage_reserve" { + gibibytes = 64 +} diff --git a/soperator/modules/available_resources/outputs.tf b/soperator/modules/available_resources/outputs.tf index 59cd6cf6..2437aaca 100644 --- a/soperator/modules/available_resources/outputs.tf +++ b/soperator/modules/available_resources/outputs.tf @@ -2,3 +2,8 @@ output "this" { description = "Map of available node resources grouped by platform -> preset." value = local.resources } + +output "k8s_ephemeral_storage_reserve" { + description = "Data size held back on a node boot disk and excluded from the ephemeral storage exposed to workloads." + value = data.units_data_size.k8s_ephemeral_storage_reserve +} diff --git a/soperator/modules/available_resources/terraform.tf b/soperator/modules/available_resources/terraform.tf index 300033b1..7e276c5c 100644 --- a/soperator/modules/available_resources/terraform.tf +++ b/soperator/modules/available_resources/terraform.tf @@ -1,3 +1,11 @@ +terraform { + required_providers { + units = { + source = "dstaroff/units" + } + } +} + module "labels" { source = "../labels" } diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf index d52d99e6..9b6c6abb 100644 --- a/soperator/modules/slurm/main.tf +++ b/soperator/modules/slurm/main.tf @@ -127,6 +127,10 @@ resource "helm_release" "slurm_operator" { name = "controllerManager.manager.env.isMariadbCrdInstalled" value = var.accounting_enabled } + set { + name = "certManager.enabled" + value = var.telemetry_enabled + } wait = true wait_for_jobs = true @@ -163,15 +167,16 @@ resource "helm_release" "slurm_cluster" { nccl_topology_type = var.nccl_topology_type nccl_benchmark = { - enable = var.nccl_benchmark_enable - schedule = var.nccl_benchmark_schedule - min_threshold = var.nccl_benchmark_min_threshold - use_infiniband = 
var.nccl_use_infiniband + enable = var.nccl_benchmark_enable + schedule = var.nccl_benchmark_schedule + min_threshold = var.nccl_benchmark_min_threshold + use_infiniband = var.nccl_use_infiniband } nodes = { accounting = { - enabled = var.accounting_enabled + enabled = var.accounting_enabled + use_protected_secret = var.use_protected_secret mariadb_operator = var.accounting_enabled ? { enabled = var.accounting_enabled storage_size = var.accounting_enabled ? var.filestores.accounting.size_gibibytes : 0 diff --git a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl index c420d949..99a8b015 100644 --- a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl @@ -138,6 +138,7 @@ slurmNodes: mariadbOperator: enabled: ${nodes.accounting.mariadb_operator.enabled} %{~ if nodes.accounting.mariadb_operator.enabled ~} + protectedSecret: ${nodes.accounting.use_protected_secret} resources: cpu: ${nodes.accounting.mariadb_operator.resources.cpu * 1000}m memory: ${nodes.accounting.mariadb_operator.resources.memory}Gi diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf index c9db7d15..ad850a12 100644 --- a/soperator/modules/slurm/variables.tf +++ b/soperator/modules/slurm/variables.tf @@ -248,6 +248,12 @@ variable "accounting_enabled" { default = false } +variable "use_protected_secret" { + description = "If true, the MariaDB user secret is protected and will not be deleted after the MariaDB CR is deleted." + type = bool + default = false +} + variable "slurmdbd_config" { description = "Slurmdbd.conf configuration. See https://slurm.schedmd.com/slurmdbd.conf.html.Not all options are supported." type = map(any)