Skip to content

Commit

Permalink
Merge pull request #45 from nebius/dev/cpu-only-workers
Browse files Browse the repository at this point in the history
Basic support for CPU-only workers
  • Loading branch information
dstaroff authored Oct 22, 2024
2 parents 3b6e5a1 + 13c95e8 commit b165086
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 30 deletions.
29 changes: 13 additions & 16 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
locals {
create_nlb = var.slurm_login_service_type == "NodePort"

worker_resources = module.resources.this[var.k8s_cluster_node_group_gpu.resource.platform][var.k8s_cluster_node_group_gpu.resource.preset]
}

module "filestore" {
Expand Down Expand Up @@ -116,6 +118,8 @@ module "k8s" {
}

module "nvidia_operator_network" {
count = local.worker_resources.gpus > 0 ? 1 : 0

depends_on = [
module.k8s
]
Expand All @@ -131,6 +135,8 @@ module "nvidia_operator_network" {
}

module "nvidia_operator_gpu" {
count = local.worker_resources.gpus > 0 ? 1 : 0

depends_on = [
module.nvidia_operator_network
]
Expand All @@ -149,7 +155,7 @@ module "nvidia_operator_gpu" {

module "slurm" {
depends_on = [
module.k8s
module.k8s,
]

source = "../../modules/slurm"
Expand All @@ -159,20 +165,12 @@ module "slurm" {

node_count = var.slurm_node_count

worker_resources = tomap({
"8gpu-128vcpu-1600gb" = {
cpu_cores = 128 - 48
memory_gibibytes = 1600 - 400
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = 8
}
"1gpu-20vcpu-200gb" = {
cpu_cores = 20 - 4
memory_gibibytes = 200 - 50
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = 1
}
})[var.k8s_cluster_node_group_gpu.resource.preset]
worker_resources = {
cpu_cores = local.worker_resources.cpu_cores
memory_gibibytes = local.worker_resources.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = local.worker_resources.gpus
}

login_service_type = var.slurm_login_service_type
login_node_port = var.slurm_login_node_port
Expand All @@ -184,7 +182,6 @@ module "slurm" {
slurmdbd_config = var.slurmdbd_config
slurm_accounting_config = var.slurm_accounting_config

# TODO: MSP-2817 - use computed values of filestore sizes
filestores = {
controller_spool = {
size_gibibytes = module.filestore.controller_spool.size_gibibytes
Expand Down
4 changes: 4 additions & 0 deletions soperator/installations/example/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ provider "helm" {
token = var.iam_token
}
}

# Shared lookup table of allocatable node resources per platform/preset.
# Consumed in main.tf as module.resources.this[platform][preset]
# (see the worker_resources local).
module "resources" {
  source = "../../modules/available_resources"
}
92 changes: 92 additions & 0 deletions soperator/modules/available_resources/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
locals {
  # Static map of node resources available to workloads, keyed by
  # platform -> preset. Consumers index it as
  # local.resources[platform][preset] (exposed via outputs.tf as `this`).
  #
  # Each entry reports:
  #   cpu_cores / memory_gibibytes - preset capacity minus a fixed deduction.
  #     NOTE(review): the "- 2" CPU and "- 10"/"- 15"/"- 350" memory
  #     deductions presumably reserve capacity for system components;
  #     the exact allocatable amounts are not confirmed — see TODO below.
  #   gpus                   - GPU count of the preset (0 for CPU-only).
  #   gpu_cluster_compatible - whether nodes of this preset may join a GPU
  #     cluster (only the 8-GPU H100 preset is marked compatible here).
  #
  # TODO: Get to know exact amount of allocatable resources
  resources = tomap({
    "cpu-e2" = tomap({
      # Insufficient resource presets
      # 2vcpu-8gb
      # 4vcpu-16gb
      "8vcpu-32gb" = {
        cpu_cores              = 8 - 2
        memory_gibibytes       = 32 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "16vcpu-64gb" = {
        cpu_cores              = 16 - 2
        memory_gibibytes       = 64 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "32vcpu-128gb" = {
        cpu_cores              = 32 - 2
        memory_gibibytes       = 128 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "48vcpu-192gb" = {
        cpu_cores              = 48 - 2
        memory_gibibytes       = 192 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "64vcpu-256gb" = {
        cpu_cores              = 64 - 2
        memory_gibibytes       = 256 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "80vcpu-320gb" = {
        cpu_cores              = 80 - 2
        memory_gibibytes       = 320 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
    })
    "gpu-h100-sxm" = tomap({
      "1gpu-16vcpu-200gb" = {
        cpu_cores              = 16 - 2
        memory_gibibytes       = 200 - 15
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "8gpu-128vcpu-1600gb" = {
        cpu_cores              = 128 - 2
        memory_gibibytes       = 1600 - 350
        gpus                   = 8
        gpu_cluster_compatible = true
      }
    })
    "gpu-l40s-a" = tomap({
      "1gpu-8vcpu-32gb" = {
        cpu_cores              = 8 - 2
        memory_gibibytes       = 32 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "1gpu-16vcpu-64gb" = {
        cpu_cores              = 16 - 2
        memory_gibibytes       = 64 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "1gpu-24vcpu-96gb" = {
        cpu_cores              = 24 - 2
        memory_gibibytes       = 96 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "1gpu-32vcpu-128gb" = {
        cpu_cores              = 32 - 2
        memory_gibibytes       = 128 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "1gpu-40vcpu-160gb" = {
        cpu_cores              = 40 - 2
        memory_gibibytes       = 160 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
    })
  })
}
4 changes: 4 additions & 0 deletions soperator/modules/available_resources/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Single output of the module: the whole resources table from main.tf.
# Callers index it as module.resources.this[platform][preset].
output "this" {
  description = "Map of available node resources grouped by platform -> preset."
  value       = local.resources
}
25 changes: 12 additions & 13 deletions soperator/modules/k8s/k8s_ng_gpu.tf
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
locals {
gpu = {
cluster = {
create = tomap({
"8gpu-128vcpu-1600gb" = true
"1gpu-20vcpu-200gb" = false
})[var.node_group_gpu.resource.preset]

create = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpu_cluster_compatible
name = join("-", [
trimsuffix(
substr(
Expand All @@ -18,11 +14,6 @@ locals {
var.node_group_gpu.gpu_cluster.infiniband_fabric
])
}

count = tomap({
"8gpu-128vcpu-1600gb" = 8
"1gpu-20vcpu-200gb" = 1
})[var.node_group_gpu.resource.preset]
}
}

Expand Down Expand Up @@ -62,11 +53,11 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
metadata = {
labels = module.labels.label_group_name_gpu
}
taints = [{
taints = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpus > 0 ? [{
key = "nvidia.com/gpu",
value = local.gpu.count
value = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpus
effect = "NO_SCHEDULE"
}]
}] : null

resources = {
platform = var.node_group_gpu.resource.platform
Expand Down Expand Up @@ -105,5 +96,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
ignore_changes = [
labels,
]

precondition {
condition = (var.node_group_gpu.resource.platform == "cpu-e2"
? !contains(["2vcpu-8gb", "4vcpu-16gb"], var.node_group_gpu.resource.preset)
: true
)
error_message = "Worker resource preset '${var.node_group_gpu.resource.preset}' is insufficient."
}
}
}
4 changes: 4 additions & 0 deletions soperator/modules/k8s/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ terraform {
module "labels" {
source = "../labels"
}

# Resource lookup table used by this module's node-group definitions
# (e.g. k8s_ng_gpu.tf reads gpus and gpu_cluster_compatible from it).
module "resources" {
  source = "../available_resources"
}
2 changes: 1 addition & 1 deletion soperator/modules/login/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ resource "local_file" "this" {
terraform_data.connection_ip,
]

filename = "${path.root}/login.sh"
filename = "${path.root}/${var.script_name}.sh"
file_permission = "0774"
content = templatefile("${path.module}/templates/login.sh.tftpl", {
address = terraform_data.connection_ip.output
Expand Down
6 changes: 6 additions & 0 deletions soperator/modules/login/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ variable "slurm_cluster_name" {
type = string
nullable = false
}

# Base name (without the ".sh" extension) of the generated connection
# script; main.tf writes it to "${path.root}/${var.script_name}.sh".
# Defaults to "login" to preserve the previous hard-coded "login.sh".
variable "script_name" {
  description = "Name of the script file."
  type        = string
  default     = "login"
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
clusterName: ${name}
clusterType: ${ nodes.worker.resources.gpus > 0 ? "gpu" : "cpu" }

k8sNodeFilters:
- name: ${k8s_node_filters.non_gpu.name}
Expand All @@ -22,10 +23,12 @@ k8sNodeFilters:
operator: In
values:
- ${k8s_node_filters.gpu.affinity.value}
%{~ if nodes.worker.resources.gpus > 0 ~}
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
%{~ endif ~}

volumeSources:
- name: jail
Expand Down

0 comments on commit b165086

Please sign in to comment.