Skip to content

Commit

Permalink
Added support for NFS server in Soperator (#112)
Browse files Browse the repository at this point in the history
* Added support for NFS server in Soperator

* Move nfs cloud-init into module folder

* Added variables for nfs server presets in soperator

* Customize mount path for nfs server
  • Loading branch information
d3vil-st authored Dec 19, 2024
1 parent 772c7e3 commit 95422b4
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 3 deletions.
File renamed without changes.
2 changes: 1 addition & 1 deletion modules/nfs-server/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ resource "nebius_compute_v1_instance" "nfs_server" {
}
]

cloud_init_user_data = templatefile("../modules/cloud-init/nfs-cloud-init.tftpl", {
cloud_init_user_data = templatefile("${path.module}/files/nfs-cloud-init.tftpl", {
ssh_user_name = var.ssh_user_name,
ssh_public_key = var.ssh_public_key,
nfs_ip_range = var.nfs_ip_range,
Expand Down
25 changes: 25 additions & 0 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,24 @@ module "filestore" {
}
}

# Optional standalone NFS server VM.
# Created only when var.nfs.enabled is true (count toggles the whole module
# on/off), so other resources must reference it as module.nfs-server[0].
module "nfs-server" {
count = var.nfs.enabled ? 1 : 0
source = "../../../modules/nfs-server"
# Deploy into the same project and subnet as the rest of the installation.
parent_id = data.nebius_iam_v1_project.this.id
subnet_id = data.nebius_vpc_v1_subnet.this.id
ssh_user_name = "soperator"
# Reuses the first Slurm login SSH key for the NFS server's admin user.
ssh_public_key = var.slurm_login_ssh_root_public_keys[0]
# Restrict NFS access to the subnet's first private CIDR range.
nfs_ip_range = data.nebius_vpc_v1_subnet.this.ipv4_private_pools.pools[0].cidrs[0].cidr
# Converted from GiB to bytes (x 1024^3) for the module input.
nfs_size = var.nfs.size_gibibytes * 1024 * 1024 * 1024
# Server-side export path. NOTE(review): hard-coded here, while the client
# mount path comes from var.nfs.mount_path — presumably intentional; verify.
nfs_path = "/mnt/nfs"
platform = var.nfs.resource.platform
preset = var.nfs.resource.preset

providers = {
nebius = nebius
}
}

module "k8s" {
depends_on = [
module.filestore,
Expand Down Expand Up @@ -249,6 +267,13 @@ module "slurm" {
} : null
}

nfs = {
enabled = var.nfs.enabled
path = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
host = var.nfs.enabled ? module.nfs-server[0].nfs_server_internal_ip : null
mount_path = var.nfs.enabled ? var.nfs.mount_path : null
}

shared_memory_size_gibibytes = var.slurm_shared_memory_size_gibibytes

nccl_topology_type = var.slurm_nodeset_workers[0].resource.platform == "gpu-h100-sxm" ? "H100 GPU cluster" : "auto"
Expand Down
14 changes: 14 additions & 0 deletions soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,20 @@ filestore_accounting = {

# endregion Storage

# region nfs-server

# nfs = {
# enabled = true
# size_gibibytes = 93
# mount_path = "/mnt/nfs"
# resource = {
# platform = "cpu-e2"
# preset = "16vcpu-64gb"
# }
# }

# endregion nfs-server

#----------------------------------------------------------------------------------------------------------------------#
# #
# #
Expand Down
29 changes: 29 additions & 0 deletions soperator/installations/example/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,35 @@ variable "filestore_accounting" {

# endregion Storage

# region nfs-server

# Settings for the optional standalone NFS server.
# Disabled by default; size/platform/preset only take effect when
# enabled = true (the size validation is also skipped while disabled).
variable "nfs" {
type = object({
# Whether to create the NFS server at all.
enabled = bool
# Requested disk size in GiB; constrained by the validation block below.
size_gibibytes = number
# Client-side mount point for the share.
mount_path = optional(string, "/mnt/nfs")
# Compute shape of the NFS server VM.
resource = object({
platform = string
preset = string
})
})
default = {
enabled = false
size_gibibytes = 93
resource = {
platform = "cpu-e2"
preset = "16vcpu-64gb"
}
}

# Size must be a multiple of 93 GiB with an upper bound of 262074 GiB
# (= 2818 * 93). NOTE(review): 93 GiB looks like the provider's disk
# allocation granularity — confirm against the platform documentation.
validation {
condition = var.nfs.enabled ? var.nfs.size_gibibytes % 93 == 0 && var.nfs.size_gibibytes <= 262074 : true
error_message = "NFS size must be a multiple of 93 GiB and maximum value is 262074 GiB"
}
}

# endregion nfs-server

# region k8s

variable "k8s_version" {
Expand Down
2 changes: 2 additions & 0 deletions soperator/modules/slurm/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ resource "helm_release" "slurm_cluster" {
mount_path = submount.mount_path
}]

nfs = var.nfs

nccl_topology_type = var.nccl_topology_type
nccl_benchmark = {
enable = var.nccl_benchmark_enable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,14 @@ volumeSources:
emptyDir:
sizeLimit: ${nodes.worker.resources.ephemeral_storage}Gi

%{~ if nfs.enabled ~}
- name: nfs
nfs:
path: ${nfs.path}
readOnly: false
server: ${nfs.host}
%{~ endif ~}

%{~ for sub_mount in jail_submounts ~}
- name: jail-submount-${sub_mount.name}
persistentVolumeClaim:
Expand Down Expand Up @@ -214,8 +222,13 @@ slurmNodes:
spool:
volumeClaimTemplateSpec: null
volumeSourceName: worker-spool
%{~ if length(jail_submounts) > 0 ~}
%{~ if length(jail_submounts) > 0 || nfs.enabled ~}
jailSubMounts:
%{~ if nfs.enabled ~}
- mountPath: ${nfs.mount_path}
name: nfs
volumeSourceName: nfs
%{~ endif ~}
%{~ for sub_mount in jail_submounts ~}
- name: ${sub_mount.name}
mountPath: ${sub_mount.mount_path}
Expand All @@ -241,7 +254,6 @@ slurmNodes:
- ${key}
%{~ endfor ~}
%{~ endif ~}
%{~ if length(jail_submounts) > 0 ~}
sshd:
resources:
cpu: ${nodes.login.resources.cpu * 1000}m
Expand All @@ -252,8 +264,14 @@ slurmNodes:
cpu: ${nodes.munge.resources.cpu * 1000}m
memory: ${nodes.munge.resources.memory}Gi
ephemeralStorage: ${nodes.munge.resources.ephemeral_storage}Gi
%{~ if length(jail_submounts) > 0 || nfs.enabled ~}
volumes:
jailSubMounts:
%{~ if nfs.enabled ~}
- mountPath: ${nfs.mount_path}
name: nfs
volumeSourceName: nfs
%{~ endif ~}
%{~ for sub_mount in jail_submounts ~}
- name: ${sub_mount.name}
mountPath: ${sub_mount.mount_path}
Expand Down
21 changes: 21 additions & 0 deletions soperator/modules/slurm/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,27 @@ variable "filestores" {

# endregion Filestore

# region nfs-server

# NFS mount settings consumed by the Slurm module (passed on to the Helm
# release values). When enabled, both the export path and the server host
# must be supplied — enforced by the validation block below.
variable "nfs" {
type = object({
# Whether an external NFS share should be mounted into the cluster.
enabled = bool
# Mount point for the share on the cluster side.
mount_path = optional(string, "/mnt/nfs")
# Export path on the NFS server; required when enabled.
path = optional(string)
# Address of the NFS server; required when enabled.
host = optional(string)
})
default = {
enabled = false
}

# path/host may stay null only while the feature is disabled.
validation {
condition = var.nfs.enabled ? var.nfs.path != null && var.nfs.host != null : true
error_message = "NFS path and host must be set."
}
}

# endregion nfs-server

# region Config

variable "shared_memory_size_gibibytes" {
Expand Down

0 comments on commit 95422b4

Please sign in to comment.