From 0995f566ce9564479ad1be33eda95cf2c4513ece Mon Sep 17 00:00:00 2001 From: Pavel Sofronii Date: Wed, 27 Nov 2024 17:48:21 +0100 Subject: [PATCH 1/4] Change defaults --- soperator/installations/example/.envrc | 2 + .../installations/example/terraform.tfvars | 101 +++++++++--------- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/soperator/installations/example/.envrc b/soperator/installations/example/.envrc index bbc24d82..f78439fa 100644 --- a/soperator/installations/example/.envrc +++ b/soperator/installations/example/.envrc @@ -127,12 +127,14 @@ else echo "Using existing bucket: ${NEBIUS_BUCKET_NAME}" fi +export TFE_PARALLELISM=20 # print all exported variables echo "Exported variables:" echo "NEBIUS_TENANT_ID: ${NEBIUS_TENANT_ID}" echo "NEBIUS_PROJECT_ID: ${NEBIUS_PROJECT_ID}" echo "NEBIUS_BUCKET_NAME: ${NEBIUS_BUCKET_NAME}" +echo "TFE_PARALLELISM: ${TFE_PARALLELISM}" cat > terraform_backend_override.tf << EOF terraform { diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index f5d53708..084f9ef8 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -6,35 +6,6 @@ # # #----------------------------------------------------------------------------------------------------------------------# -#----------------------------------------------------------------------------------------------------------------------# -# # -# # -# Cloud # -# # -# # -#----------------------------------------------------------------------------------------------------------------------# -# region Cloud - -# IAM token used for communicating with Nebius services. -# Token is being passed via .envrc file. -# Uncomment to override. -# --- -# iam_token = "" - -# ID of the IAM project. -# Project ID is being passed via .envrc file. -# Uncomment to override. -# --- -# iam_project_id = "project-" - -# ID of VPC subnet. 
-# Subnet ID is being passed via .envrc file. -# Uncomment to override. -# --- -#vpc_subnet_id = "vpcsubnet-" - -# endregion Cloud - #----------------------------------------------------------------------------------------------------------------------# # # # # @@ -132,11 +103,11 @@ filestore_accounting = { # Version of the k8s to be used. # --- -# k8s_version = "1.30" +k8s_version = "1.30" # Name of the k8s cluster. # --- -k8s_cluster_name = "slurm-k8s" +k8s_cluster_name = "soperator" # SSH user credentials for accessing k8s nodes. # By default, empty list. @@ -151,6 +122,35 @@ k8s_cluster_name = "slurm-k8s" # endregion k8s +#----------------------------------------------------------------------------------------------------------------------# +# # +# # +# Cloud # +# # +# # +#----------------------------------------------------------------------------------------------------------------------# +# region Cloud + +# IAM token used for communicating with Nebius services. +# Token is being passed via .envrc file. +# Uncomment to override. +# --- +# iam_token = "" + +# ID of the IAM project. +# Project ID is being passed via .envrc file. +# Uncomment to override. +# --- +# iam_project_id = "project-" + +# ID of VPC subnet. +# Subnet ID is being passed via .envrc file. +# Uncomment to override. +# --- +#vpc_subnet_id = "vpcsubnet-" + +# endregion Cloud + # endregion Infrastructure #----------------------------------------------------------------------------------------------------------------------# @@ -164,7 +164,7 @@ k8s_cluster_name = "slurm-k8s" # Name of the Slurm cluster in k8s cluster. # --- -slurm_cluster_name = "my-amazing-slurm" +slurm_cluster_name = "soperator" # Version of soperator. # --- @@ -173,7 +173,7 @@ slurm_operator_version = "1.15.4" # Type of the Slurm partition config. Could be either `default` or `custom`. # By default, "default". 
# --- -# slurm_partition_config_type = "custom" +slurm_partition_config_type = "default" # Partition config in case of `custom` slurm_partition_config_type. # Each string must be started with `PartitionName`. @@ -209,10 +209,10 @@ slurm_nodeset_system = { # Configuration of Slurm Controller node set. # --- slurm_nodeset_controller = { - size = 2 + size = 1 resource = { platform = "cpu-e2" - preset = "16vcpu-64gb" + preset = "8vcpu-32gb" } boot_disk = { type = "NETWORK_SSD" @@ -225,10 +225,11 @@ slurm_nodeset_controller = { # There can be only one Worker node set for a while. # Split factor allows you to split node set into equally-sized node groups to keep your cluster accessible and working # during maintenance. +# infiniband_fabric is required field # --- slurm_nodeset_workers = [{ - size = 2 - split_factor = 2 + size = 16 + split_factor = 4 max_unavailable_percent = 50 resource = { platform = "gpu-h100-sxm" @@ -236,11 +237,11 @@ slurm_nodeset_workers = [{ } boot_disk = { type = "NETWORK_SSD" - size_gibibytes = 1024 - block_size_kibibytes = 32 + size_gibibytes = 256 + block_size_kibibytes = 4 } gpu_cluster = { - infiniband_fabric = "fabric-3" + infiniband_fabric = "" } }] @@ -250,7 +251,7 @@ slurm_nodeset_login = { size = 1 resource = { platform = "cpu-e2" - preset = "16vcpu-64gb" + preset = "32vcpu-128gb" } boot_disk = { type = "NETWORK_SSD" @@ -306,7 +307,7 @@ slurm_login_ssh_root_public_keys = [ # Whether to enable Slurm metrics exporter. # By default, true. # --- -# slurm_exporter_enabled = false +slurm_exporter_enabled = true # endregion Exporter @@ -318,7 +319,7 @@ slurm_login_ssh_root_public_keys = [ # Whether to enable Slurm REST API. # By default, false. # --- -# slurm_rest_enabled = false +slurm_rest_enabled = false # endregion REST API @@ -334,7 +335,7 @@ slurm_login_ssh_root_public_keys = [ # Shared memory size for Slurm controller and worker nodes in GiB. # By default, 64. 
# --- -slurm_shared_memory_size_gibibytes = 256 +slurm_shared_memory_size_gibibytes = 384 # endregion Config @@ -349,22 +350,22 @@ slurm_shared_memory_size_gibibytes = 256 # It won't take effect in case of 1-GPU hosts. # By default, true. # --- -# nccl_benchmark_enable = false +nccl_benchmark_enable = true # NCCL benchmark's CronJob schedule. # By default, `0 */3 * * *` - every 3 hour. # --- -# nccl_benchmark_enable = "0 */3 * * *" +nccl_benchmark_schedule = "0 */3 * * *" # Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable. # By default, 45. # --- -# nccl_benchmark_min_threshold = 45 +nccl_benchmark_min_threshold = 45 # Use infiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test. # By default, true # --- -# nccl_use_infiniband = true +nccl_use_infiniband = true # endregion NCCL benchmark @@ -378,12 +379,12 @@ slurm_shared_memory_size_gibibytes = 256 # Whether to enable telemetry. # By default, true. # --- -# telemetry_enabled = false +telemetry_enabled = true # Password of `admin` user of Grafana. # Set it to your desired password. # --- -telemetry_grafana_admin_password = "" +telemetry_grafana_admin_password = "password" # endregion Telemetry From 2d4a06707ffb15380018f94a92f2499b97185d09 Mon Sep 17 00:00:00 2001 From: Pavel Sofronii Date: Thu, 28 Nov 2024 12:17:17 +0100 Subject: [PATCH 2/4] change more defaults --- soperator/installations/example/terraform.tfvars | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index 084f9ef8..d5d36685 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -54,11 +54,11 @@ filestore_jail = { } } -# Shared filesystems to be mounted inside jail. +# Additional (Optional) shared filesystems to be mounted inside jail. 
# --- # filestore_jail_submounts = [{ -# name = "mlperf-sd" -# mount_path = "/mlperf-sd" +# name = "datasets" +# mount_path = "/datasets" # spec = { # size_gibibytes = 2048 # block_size_kibibytes = 4 @@ -67,8 +67,8 @@ filestore_jail = { # Or use existing filestores. # --- # filestore_jail_submounts = [{ -# name = "mlperf-sd" -# mount_path = "/mlperf-sd" +# name = "datasets" +# mount_path = "/datasets" # existing = { # id = "computefilesystem-" # } @@ -110,6 +110,7 @@ k8s_version = "1.30" k8s_cluster_name = "soperator" # SSH user credentials for accessing k8s nodes. +# That option add public ip address to every node. # By default, empty list. # --- # k8s_cluster_node_ssh_access_users = [{ @@ -224,7 +225,7 @@ slurm_nodeset_controller = { # Configuration of Slurm Worker node sets. # There can be only one Worker node set for a while. # Split factor allows you to split node set into equally-sized node groups to keep your cluster accessible and working -# during maintenance. +# during maintenance. Example: a split_factor of 3 with 12 nodes creates 3 groups of 4 nodes each. # infiniband_fabric is required field # --- slurm_nodeset_workers = [{ @@ -294,7 +295,7 @@ slurm_login_service_type = "LoadBalancer" # Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user. 
# --- slurm_login_ssh_root_public_keys = [ - "", + "", ] # endregion Login From 6b0db363482b15a29ae75352f08c44af653fbfb7 Mon Sep 17 00:00:00 2001 From: Pavel Sofronii Date: Tue, 3 Dec 2024 10:41:54 +0100 Subject: [PATCH 3/4] mv k8s setting to the bottom --- .../installations/example/terraform.tfvars | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index d5d36685..cae15a21 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -94,35 +94,6 @@ filestore_accounting = { # endregion Storage -#----------------------------------------------------------------------------------------------------------------------# -# # -# Kubernetes # -# # -#----------------------------------------------------------------------------------------------------------------------# -# region k8s - -# Version of the k8s to be used. -# --- -k8s_version = "1.30" - -# Name of the k8s cluster. -# --- -k8s_cluster_name = "soperator" - -# SSH user credentials for accessing k8s nodes. -# That option add public ip address to every node. -# By default, empty list. -# --- -# k8s_cluster_node_ssh_access_users = [{ -# name = "" -# public_keys = [ -# "", -# "", -# ] -# }] - -# endregion k8s - #----------------------------------------------------------------------------------------------------------------------# # # # # @@ -404,3 +375,32 @@ accounting_enabled = true # endregion Accounting # endregion Slurm + +#----------------------------------------------------------------------------------------------------------------------# +# # +# Kubernetes # +# # +#----------------------------------------------------------------------------------------------------------------------# +# region k8s + +# Version of the k8s to be used. +# --- +k8s_version = "1.30" + +# Name of the k8s cluster. 
+# --- +k8s_cluster_name = "soperator" + +# SSH user credentials for accessing k8s nodes. +# This option adds a public IP address to every node. +# By default, empty list. +# --- +# k8s_cluster_node_ssh_access_users = [{ +# name = "" +# public_keys = [ +# "", +# "", +# ] +# }] + +# endregion k8s From 568f9819ba001507ecd99c5325e50446801b21ec Mon Sep 17 00:00:00 2001 From: Uburro Date: Tue, 3 Dec 2024 11:30:39 +0100 Subject: [PATCH 4/4] bump soperator to 1.15.5 --- soperator/VERSION | 2 +- soperator/installations/example/terraform.tfvars | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/soperator/VERSION b/soperator/VERSION index e34208c9..d3243490 100644 --- a/soperator/VERSION +++ b/soperator/VERSION @@ -1 +1 @@ -1.15.4 +1.15.5 diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index cae15a21..2915f083 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -140,7 +140,7 @@ slurm_cluster_name = "soperator" # Version of soperator. # --- -slurm_operator_version = "1.15.4" +slurm_operator_version = "1.15.5" # Type of the Slurm partition config. Could be either `default` or `custom`. # By default, "default".