Patch summary: moves nfs-cloud-init.tftpl into modules/nfs-server/files and loads it via path.module;
adds an optional "nfs-server" module to the soperator example installation; passes an "nfs" object to the
slurm module; and renders an NFS volume source plus jailSubMounts entries in slurm_cluster.yaml.tftpl.
NOTE(review): line breaks, blank context lines and indentation were reconstructed from a
whitespace-collapsed copy of this patch; verify context lines against the target tree before applying.

diff --git a/modules/cloud-init/nfs-cloud-init.tftpl b/modules/nfs-server/files/nfs-cloud-init.tftpl
similarity index 100%
rename from modules/cloud-init/nfs-cloud-init.tftpl
rename to modules/nfs-server/files/nfs-cloud-init.tftpl
diff --git a/modules/nfs-server/main.tf b/modules/nfs-server/main.tf
index 0f116fd2..233e8483 100644
--- a/modules/nfs-server/main.tf
+++ b/modules/nfs-server/main.tf
@@ -28,7 +28,7 @@ resource "nebius_compute_v1_instance" "nfs_server" {
     }
   ]
 
-  cloud_init_user_data = templatefile("../modules/cloud-init/nfs-cloud-init.tftpl", {
+  cloud_init_user_data = templatefile("${path.module}/files/nfs-cloud-init.tftpl", {
     ssh_user_name  = var.ssh_user_name,
     ssh_public_key = var.ssh_public_key,
     nfs_ip_range   = var.nfs_ip_range,
diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf
index 59c57207..e1518263 100644
--- a/soperator/installations/example/main.tf
+++ b/soperator/installations/example/main.tf
@@ -68,6 +68,24 @@ module "filestore" {
   }
 }
 
+module "nfs-server" {
+  count          = var.nfs.enabled ? 1 : 0
+  source         = "../../../modules/nfs-server"
+  parent_id      = data.nebius_iam_v1_project.this.id
+  subnet_id      = data.nebius_vpc_v1_subnet.this.id
+  ssh_user_name  = "soperator"
+  ssh_public_key = var.slurm_login_ssh_root_public_keys[0]
+  nfs_ip_range   = data.nebius_vpc_v1_subnet.this.ipv4_private_pools.pools[0].cidrs[0].cidr
+  nfs_size       = var.nfs.size_gibibytes * 1024 * 1024 * 1024
+  nfs_path       = "/mnt/nfs"
+  platform       = var.nfs.resource.platform
+  preset         = var.nfs.resource.preset
+
+  providers = {
+    nebius = nebius
+  }
+}
+
 module "k8s" {
   depends_on = [
     module.filestore,
@@ -249,6 +267,13 @@ module "slurm" {
     } : null
   }
 
+  nfs = {
+    enabled    = var.nfs.enabled
+    path       = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
+    host       = var.nfs.enabled ? module.nfs-server[0].nfs_server_internal_ip : null
+    mount_path = var.nfs.enabled ? var.nfs.mount_path : null
+  }
+
   shared_memory_size_gibibytes = var.slurm_shared_memory_size_gibibytes
 
   nccl_topology_type = var.slurm_nodeset_workers[0].resource.platform == "gpu-h100-sxm" ? "H100 GPU cluster" : "auto"
diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index 2915f083..9b4dc51e 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -94,6 +94,20 @@ filestore_accounting = {
 
 # endregion Storage
 
+# region nfs-server
+
+# nfs = {
+#   enabled        = true
+#   size_gibibytes = 93
+#   mount_path     = "/mnt/nfs"
+#   resource = {
+#     platform = "cpu-e2"
+#     preset   = "16vcpu-64gb"
+#   }
+# }
+
+# endregion nfs-server
+
 #----------------------------------------------------------------------------------------------------------------------#
 #                                                                                                                      #
 #                                                                                                                      #
diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf
index b1a9a480..2714a7d1 100644
--- a/soperator/installations/example/variables.tf
+++ b/soperator/installations/example/variables.tf
@@ -127,6 +127,36 @@ variable "filestore_accounting" {
 
 # endregion Storage
 
+# region nfs-server
+
+variable "nfs" {
+  type = object({
+    enabled        = bool
+    size_gibibytes = number
+    mount_path     = optional(string, "/mnt/nfs")
+    resource = object({
+      platform = string
+      preset   = string
+    })
+  })
+  default = {
+    enabled        = false
+    size_gibibytes = 93
+    resource = {
+      platform = "cpu-e2"
+      preset   = "16vcpu-64gb"
+    }
+  }
+
+  # Checked only when NFS is enabled. The "> 0" term is required because 0 % 93 == 0 would otherwise accept an empty disk.
+  validation {
+    condition     = var.nfs.enabled ? var.nfs.size_gibibytes > 0 && var.nfs.size_gibibytes % 93 == 0 && var.nfs.size_gibibytes <= 262074 : true
+    error_message = "NFS size must be a positive multiple of 93 GiB and maximum value is 262074 GiB"
+  }
+}
+
+# endregion nfs-server
+
 # region k8s
 
 variable "k8s_version" {
diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf
index d52d99e6..0dfb4deb 100644
--- a/soperator/modules/slurm/main.tf
+++ b/soperator/modules/slurm/main.tf
@@ -161,6 +161,8 @@ resource "helm_release" "slurm_cluster" {
       mount_path = submount.mount_path
     }]
 
+    nfs = var.nfs
+
     nccl_topology_type = var.nccl_topology_type
     nccl_benchmark = {
       enable = var.nccl_benchmark_enable
diff --git a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
index c420d949..be73ea38 100644
--- a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
+++ b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
@@ -108,6 +108,14 @@ volumeSources:
     emptyDir:
       sizeLimit: ${nodes.worker.resources.ephemeral_storage}Gi
 
+  %{~ if nfs.enabled ~}
+  - name: nfs
+    nfs:
+      path: ${nfs.path}
+      readOnly: false
+      server: ${nfs.host}
+  %{~ endif ~}
+
   %{~ for sub_mount in jail_submounts ~}
   - name: jail-submount-${sub_mount.name}
     persistentVolumeClaim:
@@ -213,8 +221,13 @@ slurmNodes:
       spool:
         volumeClaimTemplateSpec: null
         volumeSourceName: worker-spool
-      %{~ if length(jail_submounts) > 0 ~}
+      %{~ if length(jail_submounts) > 0 || nfs.enabled ~}
       jailSubMounts:
+        %{~ if nfs.enabled ~}
+        - mountPath: ${nfs.mount_path}
+          name: nfs
+          volumeSourceName: nfs
+        %{~ endif ~}
         %{~ for sub_mount in jail_submounts ~}
         - name: ${sub_mount.name}
           mountPath: ${sub_mount.mount_path}
@@ -240,7 +253,6 @@ slurmNodes:
       - ${key}
       %{~ endfor ~}
     %{~ endif ~}
-    %{~ if length(jail_submounts) > 0 ~}
     sshd:
       resources:
         cpu: ${nodes.login.resources.cpu * 1000}m
@@ -251,8 +263,14 @@ slurmNodes:
         cpu: ${nodes.munge.resources.cpu * 1000}m
         memory: ${nodes.munge.resources.memory}Gi
         ephemeralStorage: ${nodes.munge.resources.ephemeral_storage}Gi
+    %{~ if length(jail_submounts) > 0 || nfs.enabled ~}
     volumes:
       jailSubMounts:
+        %{~ if nfs.enabled ~}
+        - mountPath: ${nfs.mount_path}
+          name: nfs
+          volumeSourceName: nfs
+        %{~ endif ~}
         %{~ for sub_mount in jail_submounts ~}
         - name: ${sub_mount.name}
           mountPath: ${sub_mount.mount_path}
diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf
index c9db7d15..ab13f4f6 100644
--- a/soperator/modules/slurm/variables.tf
+++ b/soperator/modules/slurm/variables.tf
@@ -171,6 +171,27 @@ variable "filestores" {
 
 # endregion Filestore
 
+# region nfs-server
+
+variable "nfs" {
+  type = object({
+    enabled    = bool
+    mount_path = optional(string, "/mnt/nfs")
+    path       = optional(string)
+    host       = optional(string)
+  })
+  default = {
+    enabled = false
+  }
+
+  validation {
+    condition     = var.nfs.enabled ? var.nfs.path != null && var.nfs.host != null : true
+    error_message = "NFS path and host must be set."
+  }
+}
+
+# endregion nfs-server
+
 # region Config
 
 variable "shared_memory_size_gibibytes" {