From 877f28c5ef83a565335ec7bbad1afe5126538143 Mon Sep 17 00:00:00 2001 From: Uburro Date: Wed, 13 Nov 2024 12:22:35 +0100 Subject: [PATCH 1/9] NoTASK: increase default resources vmagent --- soperator/modules/monitoring/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/soperator/modules/monitoring/variables.tf b/soperator/modules/monitoring/variables.tf index 7a935fb9..c95ee6c7 100644 --- a/soperator/modules/monitoring/variables.tf +++ b/soperator/modules/monitoring/variables.tf @@ -54,8 +54,8 @@ variable "resources_vm_agent" { cpu = string }) default = { - memory = "384Mi" - cpu = "250m" + memory = "1Gi" + cpu = "500m" } } From 5add12896f252dca51266208a6cb3d67ee143cf0 Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Wed, 13 Nov 2024 14:56:29 +0100 Subject: [PATCH 2/9] [FIX] Filter ServiceAccount by name with jq `filter` field is not implemented --- soperator/installations/example/.envrc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/soperator/installations/example/.envrc b/soperator/installations/example/.envrc index 0b8797c1..bbc24d82 100644 --- a/soperator/installations/example/.envrc +++ b/soperator/installations/example/.envrc @@ -36,9 +36,8 @@ NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \ # Separate declaration and assignment for service account NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account list \ --parent-id "${NEBIUS_PROJECT_ID}" \ - --filter "name=slurm-terraform-sa" \ --format json \ | jq -r '.items[0].metadata.id') + | jq -r '.items[] | select(.metadata.name == "slurm-terraform-sa").metadata.id') if [ -z "$NEBIUS_SA_TERRAFORM_ID" ]; then NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account create \ From 397666f44aac401f7796725e6b6ed6b67d9ad960 Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Wed, 13 Nov 2024 14:57:18 +0100 Subject: [PATCH 3/9] [FIX] Division validation for worker nodeset --- soperator/installations/example/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf index aca59ab6..b6fa95f5 100644 --- a/soperator/installations/example/variables.tf +++ b/soperator/installations/example/variables.tf @@ -331,7 +331,7 @@ variable "slurm_nodeset_workers" { validation { condition = length([for worker in var.slurm_nodeset_workers : 1 if worker.size % worker.split_factor != 0 - ]) == 1 + ]) == 0 error_message = "Worker count must be divisible by split_factor." } } From bad1ad16628773c0591e7da02f72495468f7ab57 Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Fri, 15 Nov 2024 12:42:36 +0100 Subject: [PATCH 4/9] [ADD] Support for new platforms --- soperator/modules/available_resources/main.tf | 407 +++++++++++------- 1 file changed, 243 insertions(+), 164 deletions(-) diff --git a/soperator/modules/available_resources/main.tf b/soperator/modules/available_resources/main.tf index f62eb54f..b432fde1 100644 --- a/soperator/modules/available_resources/main.tf +++ b/soperator/modules/available_resources/main.tf @@ -1,172 +1,251 @@ locals { + cpu = { + c-2vcpu-8gb = { + cpu_cores = 2 - 1 + memory_gibibytes = 8 - 2 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = false + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = false + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-4vcpu-32gb = { + cpu_cores = 4 - 1 + memory_gibibytes = 16 - 2 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = false + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-8vcpu-32gb = { + cpu_cores = 8 - 2 + memory_gibibytes = 32 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + 
(module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-16vcpu-64gb = { + cpu_cores = 16 - 2 + memory_gibibytes = 64 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-32vcpu-128gb = { + cpu_cores = 32 - 2 + memory_gibibytes = 128 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-48vcpu-192gb = { + cpu_cores = 48 - 2 + memory_gibibytes = 192 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-64vcpu-256gb = { + cpu_cores = 64 - 2 + memory_gibibytes = 256 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-80vcpu-320gb = { + cpu_cores = 80 - 2 + memory_gibibytes = 320 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true 
+ (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-96vcpu-384gb = { + cpu_cores = 96 - 2 + memory_gibibytes = 384 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-128vcpu-512gb = { + cpu_cores = 128 - 2 + memory_gibibytes = 512 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-160vcpu-640gb = { + cpu_cores = 160 - 2 + memory_gibibytes = 640 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-192vcpu-768gb = { + cpu_cores = 192 - 2 + memory_gibibytes = 768 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-224vcpu-896gb = { + cpu_cores = 224 - 2 + memory_gibibytes = 896 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + c-256vcpu-1024gb = { + 
cpu_cores = 256 - 2 + memory_gibibytes = 1024 - 10 + gpus = 0 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + } + + gpu = { + g-1gpu-16vcpu-200gb = { + cpu_cores = 16 - 2 + memory_gibibytes = 200 - 15 + gpus = 1 + gpu_cluster_compatible = false + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + g-8gpu-128vcpu-1600gb = { + cpu_cores = 128 - 2 + memory_gibibytes = 1600 - 350 + gpus = 8 + gpu_cluster_compatible = true + sufficient = { + (module.labels.name_nodeset_system) = true + (module.labels.name_nodeset_controller) = true + (module.labels.name_nodeset_worker) = true + (module.labels.name_nodeset_login) = true + (module.labels.name_nodeset_accounting) = true + } + } + } + resources = tomap({ "cpu-e2" = tomap({ - "2vcpu-8gb" = { - cpu_cores = 2 - 1 - memory_gibibytes = 8 - 2 - gpus = 0 - gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = false - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = false - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } - "4vcpu-16gb" = { - cpu_cores = 4 - 1 - memory_gibibytes = 16 - 2 - gpus = 0 - gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = false - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } - "8vcpu-32gb" = { - cpu_cores = 8 - 2 - memory_gibibytes = 32 - 10 - gpus = 0 - 
gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = true - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } - "16vcpu-64gb" = { - cpu_cores = 16 - 2 - memory_gibibytes = 64 - 10 - gpus = 0 - gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = true - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } - "32vcpu-128gb" = { - cpu_cores = 32 - 2 - memory_gibibytes = 128 - 10 - gpus = 0 - gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = true - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } - "48vcpu-192gb" = { - cpu_cores = 48 - 2 - memory_gibibytes = 192 - 10 - gpus = 0 - gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = true - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } - "64vcpu-256gb" = { - cpu_cores = 64 - 2 - memory_gibibytes = 256 - 10 - gpus = 0 - gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = true - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } - "80vcpu-320gb" = { - cpu_cores = 80 - 2 - memory_gibibytes = 320 - 10 - gpus = 0 - gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = 
true - (module.labels.name_nodeset_worker) = true - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } + "2vcpu-8gb" = local.cpu.c-2vcpu-8gb + "4vcpu-16gb" = local.cpu.c-4vcpu-32gb + "8vcpu-32gb" = local.cpu.c-8vcpu-32gb + "16vcpu-64gb" = local.cpu.c-16vcpu-64gb + "32vcpu-128gb" = local.cpu.c-32vcpu-128gb + "48vcpu-192gb" = local.cpu.c-48vcpu-192gb + "64vcpu-256gb" = local.cpu.c-64vcpu-256gb + "80vcpu-320gb" = local.cpu.c-80vcpu-320gb + }) + "cpu-d3" = tomap({ + "2vcpu-8gb" = local.cpu.c-2vcpu-8gb + "4vcpu-16gb" = local.cpu.c-4vcpu-32gb + "8vcpu-32gb" = local.cpu.c-8vcpu-32gb + "16vcpu-64gb" = local.cpu.c-16vcpu-64gb + "32vcpu-128gb" = local.cpu.c-32vcpu-128gb + "48vcpu-192gb" = local.cpu.c-48vcpu-192gb + "64vcpu-256gb" = local.cpu.c-64vcpu-256gb + "96vcpu-384gb" = local.cpu.c-96vcpu-384gb + "128vcpu-512gb" = local.cpu.c-128vcpu-512gb + "160vcpu-640gb" = local.cpu.c-160vcpu-640gb + "192vcpu-768gb" = local.cpu.c-192vcpu-768gb + "224vcpu-896gb" = local.cpu.c-224vcpu-896gb + "256vcpu-1024gb" = local.cpu.c-256vcpu-1024gb }) "gpu-h100-sxm" = tomap({ - "1gpu-16vcpu-200gb" = { - cpu_cores = 16 - 2 - memory_gibibytes = 200 - 15 - gpus = 1 - gpu_cluster_compatible = false - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = true - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } - "8gpu-128vcpu-1600gb" = { - cpu_cores = 128 - 2 - memory_gibibytes = 1600 - 350 - gpus = 8 - gpu_cluster_compatible = true - sufficient = { - (module.labels.name_nodeset_system) = true - (module.labels.name_nodeset_controller) = true - (module.labels.name_nodeset_worker) = true - (module.labels.name_nodeset_login) = true - (module.labels.name_nodeset_accounting) = true - } - } + "1gpu-16vcpu-200gb" = local.gpu.g-1gpu-16vcpu-200gb + "8gpu-128vcpu-1600gb" = local.gpu.g-8gpu-128vcpu-1600gb + }) + 
"gpu-h200-sxm" = tomap({ + "1gpu-16vcpu-200gb" = local.gpu.g-1gpu-16vcpu-200gb + "8gpu-128vcpu-1600gb" = local.gpu.g-8gpu-128vcpu-1600gb }) - - # gpu-l40s-a is not supported - # "gpu-l40s-a" = tomap({ - # "1gpu-8vcpu-32gb" = { - # cpu_cores = 8 - 2 - # memory_gibibytes = 32 - 10 - # gpus = 1 - # gpu_cluster_compatible = false - # } - # "1gpu-16vcpu-64gb" = { - # cpu_cores = 16 - 2 - # memory_gibibytes = 64 - 10 - # gpus = 1 - # gpu_cluster_compatible = false - # } - # "1gpu-24vcpu-96gb" = { - # cpu_cores = 24 - 2 - # memory_gibibytes = 96 - 10 - # gpus = 1 - # gpu_cluster_compatible = false - # } - # "1gpu-32vcpu-128gb" = { - # cpu_cores = 32 - 2 - # memory_gibibytes = 128 - 10 - # gpus = 1 - # gpu_cluster_compatible = false - # } - # "1gpu-40vcpu-160gb" = { - # cpu_cores = 40 - 2 - # memory_gibibytes = 160 - 10 - # gpus = 1 - # gpu_cluster_compatible = false - # } - # }) }) } From d2fa5f3dc223ea93ca86b6e30147d6f0282a7355 Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Fri, 15 Nov 2024 13:24:45 +0100 Subject: [PATCH 5/9] [FIX] Remove K8s version from node groups --- soperator/modules/k8s/k8s_ng_accounting.tf | 1 - soperator/modules/k8s/k8s_ng_controller.tf | 1 - soperator/modules/k8s/k8s_ng_login.tf | 1 - soperator/modules/k8s/k8s_ng_system.tf | 1 - soperator/modules/k8s/k8s_ng_workers.tf | 1 - 5 files changed, 5 deletions(-) diff --git a/soperator/modules/k8s/k8s_ng_accounting.tf b/soperator/modules/k8s/k8s_ng_accounting.tf index 047d4366..fc03529e 100644 --- a/soperator/modules/k8s/k8s_ng_accounting.tf +++ b/soperator/modules/k8s/k8s_ng_accounting.tf @@ -14,7 +14,6 @@ resource "nebius_mk8s_v1_node_group" "accounting" { module.labels.label_workload_cpu, ) - version = var.k8s_version fixed_node_count = 1 template = { diff --git a/soperator/modules/k8s/k8s_ng_controller.tf b/soperator/modules/k8s/k8s_ng_controller.tf index 2ba31f80..da85cdfe 100644 --- a/soperator/modules/k8s/k8s_ng_controller.tf +++ b/soperator/modules/k8s/k8s_ng_controller.tf @@ -12,7 +12,6 @@ 
resource "nebius_mk8s_v1_node_group" "controller" { module.labels.label_workload_cpu, ) - version = var.k8s_version fixed_node_count = var.node_group_controller.size template = { diff --git a/soperator/modules/k8s/k8s_ng_login.tf b/soperator/modules/k8s/k8s_ng_login.tf index 890fc9ab..664944de 100644 --- a/soperator/modules/k8s/k8s_ng_login.tf +++ b/soperator/modules/k8s/k8s_ng_login.tf @@ -12,7 +12,6 @@ resource "nebius_mk8s_v1_node_group" "login" { module.labels.label_workload_cpu, ) - version = var.k8s_version fixed_node_count = var.node_group_login.size template = { diff --git a/soperator/modules/k8s/k8s_ng_system.tf b/soperator/modules/k8s/k8s_ng_system.tf index 3c3565a6..018cefc2 100644 --- a/soperator/modules/k8s/k8s_ng_system.tf +++ b/soperator/modules/k8s/k8s_ng_system.tf @@ -12,7 +12,6 @@ resource "nebius_mk8s_v1_node_group" "system" { module.labels.label_workload_cpu, ) - version = var.k8s_version fixed_node_count = var.node_group_system.size template = { diff --git a/soperator/modules/k8s/k8s_ng_workers.tf b/soperator/modules/k8s/k8s_ng_workers.tf index fc4e1d26..e197e6f0 100644 --- a/soperator/modules/k8s/k8s_ng_workers.tf +++ b/soperator/modules/k8s/k8s_ng_workers.tf @@ -52,7 +52,6 @@ resource "nebius_mk8s_v1_node_group" "worker" { local.node_group_workload_label.worker[count.index], ) - version = var.k8s_version fixed_node_count = var.node_group_workers[count.index].size strategy = { max_unavailable = { From 5663cac8581fb8fcfbcf1b3de77f96a8a6383011 Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Fri, 15 Nov 2024 13:26:05 +0100 Subject: [PATCH 6/9] [FIX] Omit accounting resources when it's disabled --- soperator/installations/example/main.tf | 6 +++--- soperator/modules/slurm/main.tf | 8 ++++---- .../slurm/templates/helm_values/slurm_cluster.yaml.tftpl | 6 ++++-- soperator/modules/slurm/variables.tf | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/soperator/installations/example/main.tf 
b/soperator/installations/example/main.tf index 43932fa8..a6d3993a 100644 --- a/soperator/installations/example/main.tf +++ b/soperator/installations/example/main.tf @@ -4,7 +4,7 @@ locals { controller = module.resources.this[var.slurm_nodeset_controller.resource.platform][var.slurm_nodeset_controller.resource.preset] workers = [for worker in var.slurm_nodeset_workers : module.resources.this[worker.resource.platform][worker.resource.preset]] login = module.resources.this[var.slurm_nodeset_login.resource.platform][var.slurm_nodeset_login.resource.preset] - accounting = module.resources.this[var.slurm_nodeset_accounting.resource.platform][var.slurm_nodeset_accounting.resource.preset] + accounting = var.slurm_nodeset_accounting != null ? module.resources.this[var.slurm_nodeset_accounting.resource.platform][var.slurm_nodeset_accounting.resource.preset] : null } use_node_port = var.slurm_login_service_type == "NodePort" @@ -210,11 +210,11 @@ module "slurm" { memory_gibibytes = local.resources.login.memory_gibibytes ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_login.boot_disk.size_gibibytes / 2) } - accounting = { + accounting = var.accounting_enabled ? { cpu_cores = local.resources.accounting.cpu_cores memory_gibibytes = local.resources.accounting.memory_gibibytes ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_accounting.boot_disk.size_gibibytes / 2) - } + } : null } login_service_type = var.slurm_login_service_type diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf index e4cc76d6..02670d78 100644 --- a/soperator/modules/slurm/main.tf +++ b/soperator/modules/slurm/main.tf @@ -171,19 +171,19 @@ resource "helm_release" "slurm_cluster" { nodes = { accounting = { enabled = var.accounting_enabled - mariadb_operator = { + mariadb_operator = var.accounting_enabled ? { enabled = var.accounting_enabled storage_size = var.accounting_enabled ? 
var.filestores.accounting.size_gibibytes : 0 metrics_enabled = var.telemetry_enabled resources = local.resources.mariadb - } + } : null slurmdbd_config = var.slurmdbd_config slurm_config = var.slurm_accounting_config - resources = { + resources = var.accounting_enabled ? { cpu = var.resources.accounting.cpu_cores - local.resources.munge.cpu - local.resources.mariadb.cpu memory = var.resources.accounting.memory_gibibytes - local.resources.munge.memory - local.resources.mariadb.memory ephemeral_storage = var.resources.accounting.ephemeral_storage_gibibytes - local.resources.munge.ephemeral_storage - local.resources.mariadb.ephemeral_storage - } + } : null } controller = { diff --git a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl index fcadb47a..66457052 100644 --- a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl @@ -133,6 +133,7 @@ slurmNodes: accounting: enabled: ${nodes.accounting.enabled} k8sNodeFilterName: ${k8s_node_filters.accounting.name} + %{~ if nodes.accounting.enabled ~} mariadbOperator: enabled: ${nodes.accounting.mariadb_operator.enabled} %{~ if nodes.accounting.mariadb_operator.enabled ~} @@ -154,13 +155,13 @@ slurmNodes: storageClassName: slurm-local-pv storageClassName: slurm-local-pv %{~ endif ~} - %{~ if nodes.accounting.enabled && length(nodes.accounting.slurmdbd_config) > 0 ~} + %{~ if length(nodes.accounting.slurmdbd_config) > 0 ~} slurmdbdConfig: %{~ for key, value in nodes.accounting.slurmdbd_config ~} ${key}: "${value}" %{~ endfor ~} %{~ endif ~} - %{~ if nodes.accounting.enabled && length(nodes.accounting.slurm_config) > 0 ~} + %{~ if length(nodes.accounting.slurm_config) > 0 ~} slurmConfig: %{~ for key, value in nodes.accounting.slurm_config ~} ${key}: "${value}" @@ -176,6 +177,7 @@ slurmNodes: cpu: ${nodes.munge.resources.cpu * 1000}m memory: 
${nodes.munge.resources.memory}Gi ephemeralStorage: ${nodes.munge.resources.ephemeral_storage}Gi + %{~ endif ~} controller: size: ${nodes.controller.size} diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf index 12a44df1..2eef573b 100644 --- a/soperator/modules/slurm/variables.tf +++ b/soperator/modules/slurm/variables.tf @@ -65,11 +65,11 @@ variable "resources" { memory_gibibytes = number ephemeral_storage_gibibytes = number }) - accounting = object({ + accounting = optional(object({ cpu_cores = number memory_gibibytes = number ephemeral_storage_gibibytes = number - }) + })) }) validation { From fae40526afff46fb67ffc206e5436b54584fe4c8 Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Wed, 13 Nov 2024 12:28:53 +0100 Subject: [PATCH 7/9] [FIX] Create shared containers directory --- soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub | 1 + soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub b/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub index b577694f..1c3645ef 100755 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub @@ -234,6 +234,7 @@ cleanup_preload_shared() { ######################################################################### if [ -n "${CONTAINER_PRELOAD_SHARED_PATH}" ]; then CONT_FILE="${CONTAINER_PRELOAD_SHARED_PATH}/containers/${SLURM_JOBID}_$(basename ${CONT}).squashfs" + mkdir -p "${CONTAINER_PRELOAD_SHARED_PATH}/containers" # Prepull container image to the shared filesystem srun --ntasks=1 enroot import --output ${CONT_FILE} docker://${CONT} else diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh index 7b66f445..ddf4bc19 100755 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh @@ -137,4 +137,3 @@ sbatch \ run.sub squeue - From 
c917eb4ddada54db5debc1948a9d46b4af74194f Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Fri, 15 Nov 2024 13:29:27 +0100 Subject: [PATCH 8/9] [FIX] Remove registry from version sync --- soperator/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/soperator/Makefile b/soperator/Makefile index b1f99b7d..63d58874 100644 --- a/soperator/Makefile +++ b/soperator/Makefile @@ -4,7 +4,6 @@ SHELL = /usr/bin/env bash -o pipefail SOPERATOR_VERSION = $(shell cat VERSION) SUBVERSION = $(shell cat SUBVERSION) VERSION = $(SOPERATOR_VERSION)-$(SUBVERSION) -DOCKER_REGISTRY_NAME = soperator ifeq ($(shell uname), Darwin) SED_COMMAND = sed -i '' @@ -16,10 +15,6 @@ endif sync-version: ## Sync Soperator version from file @echo 'Soperator version is - $(SOPERATOR_VERSION)' - @# region modules/slurm/locals.tf - @$(SED_COMMAND) "s|\(oci://cr.eu-north1.nebius.cloud/\)[^\"]*|\1$(DOCKER_REGISTRY_NAME)|" modules/slurm/locals.tf - @# endregion modules/slurm/locals.tf - @# region installations/example/terraform.tfvars @echo 'Syncing installations/example/terraform.tfvars' @$(SED_COMMAND) -E 's/slurm_operator_version *= *"[0-9]+.[0-9]+.[0-9]+[^ ]*"/slurm_operator_version = "$(SOPERATOR_VERSION)"/' installations/example/terraform.tfvars From 009c412348452fc559880f044ec6c3f35abb4418 Mon Sep 17 00:00:00 2001 From: Dmitry Starov Date: Fri, 15 Nov 2024 14:20:55 +0100 Subject: [PATCH 9/9] Bump version --- soperator/SUBVERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/soperator/SUBVERSION b/soperator/SUBVERSION index 573541ac..d00491fd 100644 --- a/soperator/SUBVERSION +++ b/soperator/SUBVERSION @@ -1 +1 @@ -0 +1