From fcb3e1396f6fba7740bd1f5b3d12cead3ce693e3 Mon Sep 17 00:00:00 2001
From: Ilia Kargapolov
Date: Tue, 10 Dec 2024 10:46:24 +0100
Subject: [PATCH 1/4] Added support for NFS server in Soperator

---
Reviewer notes (this area is ignored by `git am`):
  * The `nfs` size validation now also requires size_gibibytes > 0. The
    previous condition (`% 93 == 0 && <= 262074`) accepted 0 and negative
    multiples of 93, which would be multiplied into nfs_size as-is.
  * Blob ids on the `index` lines predate this in-line change.
  * Indentation/alignment of this series was reconstructed by convention;
    apply with whitespace tolerance if a context line does not match.

 soperator/installations/example/main.tf       | 24 +++++++++++++++++++
 .../installations/example/terraform.tfvars    |  9 +++++++
 soperator/installations/example/variables.tf  | 20 ++++++++++++++++
 soperator/modules/slurm/main.tf               |  2 ++
 .../helm_values/slurm_cluster.yaml.tftpl      | 15 +++++++++++-
 soperator/modules/slurm/variables.tf          | 20 ++++++++++++++++
 6 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf
index 59c57207..ec2eb8ff 100644
--- a/soperator/installations/example/main.tf
+++ b/soperator/installations/example/main.tf
@@ -68,6 +68,24 @@ module "filestore" {
   }
 }
 
+module "nfs-server" {
+  count          = var.nfs.enabled ? 1 : 0
+  source         = "../../../modules/nfs-server"
+  parent_id      = data.nebius_iam_v1_project.this.id
+  subnet_id      = data.nebius_vpc_v1_subnet.this.id
+  ssh_user_name  = "soperator"
+  ssh_public_key = var.slurm_login_ssh_root_public_keys[0]
+  nfs_ip_range   = data.nebius_vpc_v1_subnet.this.ipv4_private_pools.pools[0].cidrs[0].cidr
+  nfs_size       = var.nfs.size_gibibytes * 1024 * 1024 * 1024
+  nfs_path       = "/mnt/nfs"
+  platform       = "cpu-e2"
+  preset         = "16vcpu-64gb"
+
+  providers = {
+    nebius = nebius
+  }
+}
+
 module "k8s" {
   depends_on = [
     module.filestore,
@@ -249,6 +267,12 @@ module "slurm" {
     } : null
   }
 
+  nfs = {
+    enabled = var.nfs.enabled
+    path    = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
+    host    = var.nfs.enabled ? module.nfs-server[0].nfs_server_internal_ip : null
+  }
+
   shared_memory_size_gibibytes = var.slurm_shared_memory_size_gibibytes
   nccl_topology_type           = var.slurm_nodeset_workers[0].resource.platform == "gpu-h100-sxm" ? "H100 GPU cluster" : "auto"
 
diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index 2915f083..0fd8cb78 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -94,6 +94,15 @@ filestore_accounting = {
 
 # endregion Storage
 
+# region nfs-server
+
+# nfs = {
+#   enabled        = true
+#   size_gibibytes = 93
+# }
+
+# endregion nfs-server
+
 #----------------------------------------------------------------------------------------------------------------------#
 #                                                                                                                      #
 #                                                                                                                      #
diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf
index b1a9a480..1b475745 100644
--- a/soperator/installations/example/variables.tf
+++ b/soperator/installations/example/variables.tf
@@ -127,6 +127,26 @@ variable "filestore_accounting" {
 
 # endregion Storage
 
+# region nfs-server
+
+variable "nfs" {
+  type = object({
+    enabled = bool
+    size_gibibytes = number
+  })
+  default = {
+    enabled = false
+    size_gibibytes = 93
+  }
+
+  validation {
+    condition = var.nfs.enabled ? var.nfs.size_gibibytes > 0 && var.nfs.size_gibibytes % 93 == 0 && var.nfs.size_gibibytes <= 262074 : true
+    error_message = "NFS size must be a positive multiple of 93 GiB and maximum value is 262074 GiB."
+  }
+}
+
+# endregion nfs-server
+
 # region k8s
 
 variable "k8s_version" {
diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf
index d52d99e6..0dfb4deb 100644
--- a/soperator/modules/slurm/main.tf
+++ b/soperator/modules/slurm/main.tf
@@ -161,6 +161,8 @@ resource "helm_release" "slurm_cluster" {
         mount_path = submount.mount_path
       }]
 
+      nfs = var.nfs
+
       nccl_topology_type = var.nccl_topology_type
       nccl_benchmark = {
         enable = var.nccl_benchmark_enable
diff --git a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
index c420d949..ff942779 100644
--- a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
+++ b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
@@ -108,6 +108,14 @@ volumeSources:
     emptyDir:
       sizeLimit: ${nodes.worker.resources.ephemeral_storage}Gi
 
+  %{~ if nfs.enabled ~}
+  - name: nfs
+    nfs:
+      path: ${nfs.path}
+      readOnly: false
+      server: ${nfs.host}
+  %{~ endif ~}
+
   %{~ for sub_mount in jail_submounts ~}
   - name: jail-submount-${sub_mount.name}
     persistentVolumeClaim:
@@ -213,8 +221,13 @@ slurmNodes:
       spool:
         volumeClaimTemplateSpec: null
         volumeSourceName: worker-spool
-      %{~ if length(jail_submounts) > 0 ~}
+      %{~ if length(jail_submounts) > 0 || nfs.enabled ~}
       jailSubMounts:
+        %{~ if nfs.enabled ~}
+        - mountPath: /nfs
+          name: nfs
+          volumeSourceName: nfs
+        %{~ endif ~}
         %{~ for sub_mount in jail_submounts ~}
         - name: ${sub_mount.name}
           mountPath: ${sub_mount.mount_path}
diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf
index c9db7d15..13c024b3 100644
--- a/soperator/modules/slurm/variables.tf
+++ b/soperator/modules/slurm/variables.tf
@@ -171,6 +171,26 @@ variable "filestores" {
 
 # endregion Filestore
 
+# region nfs-server
+
+variable "nfs" {
+  type = object({
+    enabled = bool
+    path    = optional(string)
+    host    = optional(string)
+  })
+  default = {
+    enabled = false
+  }
+
+  validation {
+    condition     = var.nfs.enabled ? var.nfs.path != null && var.nfs.host != null : true
+    error_message = "NFS path and host must be set."
+  }
+}
+
+# endregion nfs-server
+
 # region Config
 
 variable "shared_memory_size_gibibytes" {

From 4c4d008d59238691d78278928956f8f819131482 Mon Sep 17 00:00:00 2001
From: Ilia Kargapolov
Date: Tue, 10 Dec 2024 23:09:48 +0100
Subject: [PATCH 2/4] Move nfs cloud-init into module folder

---
 modules/{cloud-init => nfs-server/files}/nfs-cloud-init.tftpl | 0
 modules/nfs-server/main.tf                                    | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename modules/{cloud-init => nfs-server/files}/nfs-cloud-init.tftpl (100%)

diff --git a/modules/cloud-init/nfs-cloud-init.tftpl b/modules/nfs-server/files/nfs-cloud-init.tftpl
similarity index 100%
rename from modules/cloud-init/nfs-cloud-init.tftpl
rename to modules/nfs-server/files/nfs-cloud-init.tftpl
diff --git a/modules/nfs-server/main.tf b/modules/nfs-server/main.tf
index 0f116fd2..233e8483 100644
--- a/modules/nfs-server/main.tf
+++ b/modules/nfs-server/main.tf
@@ -28,7 +28,7 @@ resource "nebius_compute_v1_instance" "nfs_server" {
     }
   ]
 
-  cloud_init_user_data = templatefile("../modules/cloud-init/nfs-cloud-init.tftpl", {
+  cloud_init_user_data = templatefile("${path.module}/files/nfs-cloud-init.tftpl", {
     ssh_user_name  = var.ssh_user_name,
     ssh_public_key = var.ssh_public_key,
     nfs_ip_range   = var.nfs_ip_range,

From b7ed082dfbc1cbe6c089bc9b52141188f5c7f813 Mon Sep 17 00:00:00 2001
From: Ilia Kargapolov
Date: Tue, 10 Dec 2024 23:11:34 +0100
Subject: [PATCH 3/4] Added variables for nfs server presets in soperator

---
Reviewer note (ignored by `git am`): the removed/added `condition` lines and
the `error_message` context line carry the positive-size check introduced in
PATCH 1/4 so that this hunk still applies on top of it.

 soperator/installations/example/main.tf          |  4 ++--
 soperator/installations/example/terraform.tfvars |  4 ++++
 soperator/installations/example/variables.tf     | 14 +++++++++++---
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf
index ec2eb8ff..39689e9d 100644
--- a/soperator/installations/example/main.tf
+++ b/soperator/installations/example/main.tf
@@ -78,8 +78,8 @@ module "nfs-server" {
   nfs_ip_range   = data.nebius_vpc_v1_subnet.this.ipv4_private_pools.pools[0].cidrs[0].cidr
   nfs_size       = var.nfs.size_gibibytes * 1024 * 1024 * 1024
   nfs_path       = "/mnt/nfs"
-  platform       = "cpu-e2"
-  preset         = "16vcpu-64gb"
+  platform       = var.nfs.resource.platform
+  preset         = var.nfs.resource.preset
 
   providers = {
     nebius = nebius
diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index 0fd8cb78..6b465149 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -99,6 +99,10 @@ filestore_accounting = {
 # nfs = {
 #   enabled        = true
 #   size_gibibytes = 93
+#   resource = {
+#     platform = "cpu-e2"
+#     preset   = "16vcpu-64gb"
+#   }
 # }
 
 # endregion nfs-server
diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf
index 1b475745..efab82ba 100644
--- a/soperator/installations/example/variables.tf
+++ b/soperator/installations/example/variables.tf
@@ -131,16 +131,24 @@ variable "filestore_accounting" {
 
 variable "nfs" {
   type = object({
-    enabled = bool
+    enabled        = bool
     size_gibibytes = number
+    resource = object({
+      platform = string
+      preset   = string
+    })
   })
   default = {
-    enabled = false
+    enabled        = false
     size_gibibytes = 93
+    resource = {
+      platform = "cpu-e2"
+      preset   = "16vcpu-64gb"
+    }
   }
 
   validation {
-    condition = var.nfs.enabled ? var.nfs.size_gibibytes > 0 && var.nfs.size_gibibytes % 93 == 0 && var.nfs.size_gibibytes <= 262074 : true
+    condition     = var.nfs.enabled ? var.nfs.size_gibibytes > 0 && var.nfs.size_gibibytes % 93 == 0 && var.nfs.size_gibibytes <= 262074 : true
     error_message = "NFS size must be a positive multiple of 93 GiB and maximum value is 262074 GiB."
   }
 }

From 6a37ba471c1428ef15bcacfe23077f31454cbf4e Mon Sep 17 00:00:00 2001
From: Ilia Kargapolov
Date: Thu, 19 Dec 2024 14:33:31 +0100
Subject: [PATCH 4/4] Customize mount path for nfs server

---
 soperator/installations/example/main.tf                  | 7 ++++---
 soperator/installations/example/terraform.tfvars         | 1 +
 soperator/installations/example/variables.tf             | 1 +
 .../slurm/templates/helm_values/slurm_cluster.yaml.tftpl | 9 +++++++--
 soperator/modules/slurm/variables.tf                     | 7 ++++---
 5 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf
index 39689e9d..e1518263 100644
--- a/soperator/installations/example/main.tf
+++ b/soperator/installations/example/main.tf
@@ -268,9 +268,10 @@ module "slurm" {
   }
 
   nfs = {
-    enabled = var.nfs.enabled
-    path    = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
-    host    = var.nfs.enabled ? module.nfs-server[0].nfs_server_internal_ip : null
+    enabled    = var.nfs.enabled
+    path       = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
+    host       = var.nfs.enabled ? module.nfs-server[0].nfs_server_internal_ip : null
+    mount_path = var.nfs.enabled ? var.nfs.mount_path : null
   }
 
   shared_memory_size_gibibytes = var.slurm_shared_memory_size_gibibytes
diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index 6b465149..9b4dc51e 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -99,6 +99,7 @@ filestore_accounting = {
 # nfs = {
 #   enabled        = true
 #   size_gibibytes = 93
+#   mount_path     = "/mnt/nfs"
 #   resource = {
 #     platform = "cpu-e2"
 #     preset   = "16vcpu-64gb"
diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf
index efab82ba..2714a7d1 100644
--- a/soperator/installations/example/variables.tf
+++ b/soperator/installations/example/variables.tf
@@ -133,6 +133,7 @@ variable "nfs" {
   type = object({
     enabled        = bool
     size_gibibytes = number
+    mount_path     = optional(string, "/mnt/nfs")
     resource = object({
       platform = string
       preset   = string
diff --git a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
index ff942779..be73ea38 100644
--- a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
+++ b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
@@ -224,7 +224,7 @@ slurmNodes:
       %{~ if length(jail_submounts) > 0 || nfs.enabled ~}
       jailSubMounts:
         %{~ if nfs.enabled ~}
-        - mountPath: /nfs
+        - mountPath: ${nfs.mount_path}
           name: nfs
           volumeSourceName: nfs
         %{~ endif ~}
@@ -253,7 +253,6 @@ slurmNodes:
       - ${key}
       %{~ endfor ~}
     %{~ endif ~}
-    %{~ if length(jail_submounts) > 0 ~}
     sshd:
       resources:
         cpu: ${nodes.login.resources.cpu * 1000}m
@@ -264,8 +263,14 @@ slurmNodes:
         cpu: ${nodes.munge.resources.cpu * 1000}m
         memory: ${nodes.munge.resources.memory}Gi
         ephemeralStorage: ${nodes.munge.resources.ephemeral_storage}Gi
+    %{~ if length(jail_submounts) > 0 || nfs.enabled ~}
     volumes:
       jailSubMounts:
+        %{~ if nfs.enabled ~}
+        - mountPath: ${nfs.mount_path}
+          name: nfs
+          volumeSourceName: nfs
+        %{~ endif ~}
         %{~ for sub_mount in jail_submounts ~}
         - name: ${sub_mount.name}
           mountPath: ${sub_mount.mount_path}
diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf
index 13c024b3..ab13f4f6 100644
--- a/soperator/modules/slurm/variables.tf
+++ b/soperator/modules/slurm/variables.tf
@@ -175,9 +175,10 @@ variable "filestores" {
 
 variable "nfs" {
   type = object({
-    enabled = bool
-    path    = optional(string)
-    host    = optional(string)
+    enabled    = bool
+    mount_path = optional(string, "/mnt/nfs")
+    path       = optional(string)
+    host       = optional(string)
   })
   default = {
     enabled = false