Skip to content

Commit

Permalink
Merge pull request #45 from nebius/dev/cpu-only-workers
Browse files Browse the repository at this point in the history
Basic support for CPU-only workers
  • Loading branch information
dstaroff authored Oct 22, 2024
2 parents 3b6e5a1 + 13c95e8 commit b165086
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 30 deletions.
29 changes: 13 additions & 16 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
locals {
create_nlb = var.slurm_login_service_type == "NodePort"

worker_resources = module.resources.this[var.k8s_cluster_node_group_gpu.resource.platform][var.k8s_cluster_node_group_gpu.resource.preset]
}

module "filestore" {
Expand Down Expand Up @@ -116,6 +118,8 @@ module "k8s" {
}

module "nvidia_operator_network" {
count = local.worker_resources.gpus > 0 ? 1 : 0

depends_on = [
module.k8s
]
Expand All @@ -131,6 +135,8 @@ module "nvidia_operator_network" {
}

module "nvidia_operator_gpu" {
count = local.worker_resources.gpus > 0 ? 1 : 0

depends_on = [
module.nvidia_operator_network
]
Expand All @@ -149,7 +155,7 @@ module "nvidia_operator_gpu" {

module "slurm" {
depends_on = [
module.k8s
module.k8s,
]

source = "../../modules/slurm"
Expand All @@ -159,20 +165,12 @@ module "slurm" {

node_count = var.slurm_node_count

worker_resources = tomap({
"8gpu-128vcpu-1600gb" = {
cpu_cores = 128 - 48
memory_gibibytes = 1600 - 400
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = 8
}
"1gpu-20vcpu-200gb" = {
cpu_cores = 20 - 4
memory_gibibytes = 200 - 50
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = 1
}
})[var.k8s_cluster_node_group_gpu.resource.preset]
worker_resources = {
cpu_cores = local.worker_resources.cpu_cores
memory_gibibytes = local.worker_resources.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = local.worker_resources.gpus
}

login_service_type = var.slurm_login_service_type
login_node_port = var.slurm_login_node_port
Expand All @@ -184,7 +182,6 @@ module "slurm" {
slurmdbd_config = var.slurmdbd_config
slurm_accounting_config = var.slurm_accounting_config

# TODO: MSP-2817 - use computed values of filestore sizes
filestores = {
controller_spool = {
size_gibibytes = module.filestore.controller_spool.size_gibibytes
Expand Down
4 changes: 4 additions & 0 deletions soperator/installations/example/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ provider "helm" {
token = var.iam_token
}
}

# Shared lookup table of allocatable node resources per platform/preset.
# Consumed in main.tf as module.resources.this[platform][preset]
# (see the worker_resources local).
module "resources" {
  source = "../../modules/available_resources"
}
92 changes: 92 additions & 0 deletions soperator/modules/available_resources/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
locals {
  # Static map of node resources available to workloads, keyed by
  # platform -> preset. Consumers index it as
  # local.resources[platform][preset] (exposed via outputs.tf as `this`).
  #
  # Each entry reports:
  #   cpu_cores / memory_gibibytes - preset capacity minus a fixed deduction.
  #     NOTE(review): the "- 2" CPU and "- 10"/"- 15"/"- 350" memory
  #     deductions presumably reserve capacity for system components;
  #     the exact allocatable amounts are not confirmed — see TODO below.
  #   gpus                   - GPU count of the preset (0 for CPU-only).
  #   gpu_cluster_compatible - whether nodes of this preset may join a GPU
  #     cluster (only the 8-GPU H100 preset is marked compatible here).
  #
  # TODO: Get to know exact amount of allocatable resources
  resources = tomap({
    "cpu-e2" = tomap({
      # Insufficient resource presets
      # 2vcpu-8gb
      # 4vcpu-16gb
      "8vcpu-32gb" = {
        cpu_cores              = 8 - 2
        memory_gibibytes       = 32 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "16vcpu-64gb" = {
        cpu_cores              = 16 - 2
        memory_gibibytes       = 64 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "32vcpu-128gb" = {
        cpu_cores              = 32 - 2
        memory_gibibytes       = 128 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "48vcpu-192gb" = {
        cpu_cores              = 48 - 2
        memory_gibibytes       = 192 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "64vcpu-256gb" = {
        cpu_cores              = 64 - 2
        memory_gibibytes       = 256 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
      "80vcpu-320gb" = {
        cpu_cores              = 80 - 2
        memory_gibibytes       = 320 - 10
        gpus                   = 0
        gpu_cluster_compatible = false
      }
    })
    "gpu-h100-sxm" = tomap({
      "1gpu-16vcpu-200gb" = {
        cpu_cores              = 16 - 2
        memory_gibibytes       = 200 - 15
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "8gpu-128vcpu-1600gb" = {
        cpu_cores              = 128 - 2
        memory_gibibytes       = 1600 - 350
        gpus                   = 8
        gpu_cluster_compatible = true
      }
    })
    "gpu-l40s-a" = tomap({
      "1gpu-8vcpu-32gb" = {
        cpu_cores              = 8 - 2
        memory_gibibytes       = 32 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "1gpu-16vcpu-64gb" = {
        cpu_cores              = 16 - 2
        memory_gibibytes       = 64 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "1gpu-24vcpu-96gb" = {
        cpu_cores              = 24 - 2
        memory_gibibytes       = 96 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "1gpu-32vcpu-128gb" = {
        cpu_cores              = 32 - 2
        memory_gibibytes       = 128 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      "1gpu-40vcpu-160gb" = {
        cpu_cores              = 40 - 2
        memory_gibibytes       = 160 - 10
        gpus                   = 1
        gpu_cluster_compatible = false
      }
    })
  })
}
4 changes: 4 additions & 0 deletions soperator/modules/available_resources/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Single output of the module: the whole resources table from main.tf.
# Callers index it as module.resources.this[platform][preset].
output "this" {
  description = "Map of available node resources grouped by platform -> preset."
  value       = local.resources
}
25 changes: 12 additions & 13 deletions soperator/modules/k8s/k8s_ng_gpu.tf
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
locals {
gpu = {
cluster = {
create = tomap({
"8gpu-128vcpu-1600gb" = true
"1gpu-20vcpu-200gb" = false
})[var.node_group_gpu.resource.preset]

create = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpu_cluster_compatible
name = join("-", [
trimsuffix(
substr(
Expand All @@ -18,11 +14,6 @@ locals {
var.node_group_gpu.gpu_cluster.infiniband_fabric
])
}

count = tomap({
"8gpu-128vcpu-1600gb" = 8
"1gpu-20vcpu-200gb" = 1
})[var.node_group_gpu.resource.preset]
}
}

Expand Down Expand Up @@ -62,11 +53,11 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
metadata = {
labels = module.labels.label_group_name_gpu
}
taints = [{
taints = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpus > 0 ? [{
key = "nvidia.com/gpu",
value = local.gpu.count
value = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpus
effect = "NO_SCHEDULE"
}]
}] : null

resources = {
platform = var.node_group_gpu.resource.platform
Expand Down Expand Up @@ -105,5 +96,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
ignore_changes = [
labels,
]

precondition {
condition = (var.node_group_gpu.resource.platform == "cpu-e2"
? !contains(["2vcpu-8gb", "4vcpu-16gb"], var.node_group_gpu.resource.preset)
: true
)
error_message = "Worker resource preset '${var.node_group_gpu.resource.preset}' is insufficient."
}
}
}
4 changes: 4 additions & 0 deletions soperator/modules/k8s/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ terraform {
module "labels" {
source = "../labels"
}

# Resource lookup table used by this module's node-group definitions
# (e.g. k8s_ng_gpu.tf reads gpus and gpu_cluster_compatible from it).
module "resources" {
  source = "../available_resources"
}
2 changes: 1 addition & 1 deletion soperator/modules/login/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ resource "local_file" "this" {
terraform_data.connection_ip,
]

filename = "${path.root}/login.sh"
filename = "${path.root}/${var.script_name}.sh"
file_permission = "0774"
content = templatefile("${path.module}/templates/login.sh.tftpl", {
address = terraform_data.connection_ip.output
Expand Down
6 changes: 6 additions & 0 deletions soperator/modules/login/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ variable "slurm_cluster_name" {
type = string
nullable = false
}

# Base name (without the ".sh" extension) of the generated connection
# script; main.tf writes it to "${path.root}/${var.script_name}.sh".
# Defaults to "login" to preserve the previous hard-coded "login.sh".
variable "script_name" {
  description = "Name of the script file."
  type        = string
  default     = "login"
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
clusterName: ${name}
clusterType: ${ nodes.worker.resources.gpus > 0 ? "gpu" : "cpu" }

k8sNodeFilters:
- name: ${k8s_node_filters.non_gpu.name}
Expand All @@ -22,10 +23,12 @@ k8sNodeFilters:
operator: In
values:
- ${k8s_node_filters.gpu.affinity.value}
%{~ if nodes.worker.resources.gpus > 0 ~}
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
%{~ endif ~}

volumeSources:
- name: jail
Expand Down

0 comments on commit b165086

Please sign in to comment.