Skip to content

Commit

Permalink
Merge pull request #105 from nebius/release/soperator
Browse files Browse the repository at this point in the history
Release soperator 1.15.5
  • Loading branch information
Uburro authored Dec 3, 2024
2 parents 98aea57 + 5d9af09 commit 8d20522
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 73 deletions.
2 changes: 1 addition & 1 deletion soperator/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.15.4
1.15.5
2 changes: 2 additions & 0 deletions soperator/installations/example/.envrc
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,14 @@ else
echo "Using existing bucket: ${NEBIUS_BUCKET_NAME}"
fi

export TFE_PARALLELISM=20

# print all exported variables
echo "Exported variables:"
echo "NEBIUS_TENANT_ID: ${NEBIUS_TENANT_ID}"
echo "NEBIUS_PROJECT_ID: ${NEBIUS_PROJECT_ID}"
echo "NEBIUS_BUCKET_NAME: ${NEBIUS_BUCKET_NAME}"
echo "TFE_PARALLELISM: ${TFE_PARALLELISM}"

cat > terraform_backend_override.tf << EOF
terraform {
Expand Down
146 changes: 74 additions & 72 deletions soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -6,35 +6,6 @@
# #
#----------------------------------------------------------------------------------------------------------------------#

#----------------------------------------------------------------------------------------------------------------------#
# #
# #
# Cloud #
# #
# #
#----------------------------------------------------------------------------------------------------------------------#
# region Cloud

# IAM token used for communicating with Nebius services.
# Token is being passed via .envrc file.
# Uncomment to override.
# ---
# iam_token = "<YOUR-IAM-TOKEN>"

# ID of the IAM project.
# Project ID is being passed via .envrc file.
# Uncomment to override.
# ---
# iam_project_id = "project-<YOUR-PROJECT-ID>"

# ID of VPC subnet.
# Subnet ID is being passed via .envrc file.
# Uncomment to override.
# ---
#vpc_subnet_id = "vpcsubnet-<YOUR-SUBNET-ID>"

# endregion Cloud

#----------------------------------------------------------------------------------------------------------------------#
# #
# #
Expand Down Expand Up @@ -83,11 +54,11 @@ filestore_jail = {
}
}

# Shared filesystems to be mounted inside jail.
# Additional (Optional) shared filesystems to be mounted inside jail.
# ---
# filestore_jail_submounts = [{
# name = "mlperf-sd"
# mount_path = "/mlperf-sd"
# name = "datasets"
# mount_path = "/datasets"
# spec = {
# size_gibibytes = 2048
# block_size_kibibytes = 4
Expand All @@ -96,8 +67,8 @@ filestore_jail = {
# Or use existing filestores.
# ---
# filestore_jail_submounts = [{
# name = "mlperf-sd"
# mount_path = "/mlperf-sd"
# name = "datasets"
# mount_path = "/datasets"
# existing = {
# id = "computefilesystem-<YOUR-FILESTORE-ID>"
# }
Expand Down Expand Up @@ -125,31 +96,32 @@ filestore_accounting = {

#----------------------------------------------------------------------------------------------------------------------#
# #
# Kubernetes #
# #
# Cloud #
# #
# #
#----------------------------------------------------------------------------------------------------------------------#
# region k8s
# region Cloud

# Version of the k8s to be used.
# IAM token used for communicating with Nebius services.
# Token is being passed via .envrc file.
# Uncomment to override.
# ---
# k8s_version = "1.30"
# iam_token = "<YOUR-IAM-TOKEN>"

# Name of the k8s cluster.
# ID of the IAM project.
# Project ID is being passed via .envrc file.
# Uncomment to override.
# ---
k8s_cluster_name = "slurm-k8s"
# iam_project_id = "project-<YOUR-PROJECT-ID>"

# SSH user credentials for accessing k8s nodes.
# By default, empty list.
# ID of VPC subnet.
# Subnet ID is being passed via .envrc file.
# Uncomment to override.
# ---
# k8s_cluster_node_ssh_access_users = [{
# name = "<USER1>"
# public_keys = [
# "<ENCRYPTION-METHOD1 HASH1 USER1>",
# "<ENCRYPTION-METHOD2 HASH2 USER1>",
# ]
# }]
#vpc_subnet_id = "vpcsubnet-<YOUR-SUBNET-ID>"

# endregion k8s
# endregion Cloud

# endregion Infrastructure

Expand All @@ -164,16 +136,16 @@ k8s_cluster_name = "slurm-k8s"

# Name of the Slurm cluster in k8s cluster.
# ---
slurm_cluster_name = "my-amazing-slurm"
slurm_cluster_name = "soperator"

# Version of soperator.
# ---
slurm_operator_version = "1.15.4"
slurm_operator_version = "1.15.5"

# Type of the Slurm partition config. Could be either `default` or `custom`.
# By default, "default".
# ---
# slurm_partition_config_type = "custom"
slurm_partition_config_type = "default"

# Partition config in case of `custom` slurm_partition_config_type.
# Each string must be started with `PartitionName`.
Expand Down Expand Up @@ -209,10 +181,10 @@ slurm_nodeset_system = {
# Configuration of Slurm Controller node set.
# ---
slurm_nodeset_controller = {
size = 2
size = 1
resource = {
platform = "cpu-e2"
preset = "16vcpu-64gb"
preset = "8vcpu-32gb"
}
boot_disk = {
type = "NETWORK_SSD"
Expand All @@ -224,23 +196,24 @@ slurm_nodeset_controller = {
# Configuration of Slurm Worker node sets.
# Only one Worker node set is supported for now.
# Split factor allows you to split the node set into equally-sized node groups to keep your cluster accessible and working
# during maintenance.
# during maintenance. Example: a split_factor of 3 for 12 nodes creates 3 groups with 4 nodes in each group.
# infiniband_fabric is a required field.
# ---
slurm_nodeset_workers = [{
size = 2
split_factor = 2
size = 16
split_factor = 4
max_unavailable_percent = 50
resource = {
platform = "gpu-h100-sxm"
preset = "8gpu-128vcpu-1600gb"
}
boot_disk = {
type = "NETWORK_SSD"
size_gibibytes = 1024
block_size_kibibytes = 32
size_gibibytes = 256
block_size_kibibytes = 4
}
gpu_cluster = {
infiniband_fabric = "fabric-3"
infiniband_fabric = ""
}
}]

Expand All @@ -250,7 +223,7 @@ slurm_nodeset_login = {
size = 1
resource = {
platform = "cpu-e2"
preset = "16vcpu-64gb"
preset = "32vcpu-128gb"
}
boot_disk = {
type = "NETWORK_SSD"
Expand Down Expand Up @@ -293,7 +266,7 @@ slurm_login_service_type = "LoadBalancer"
# Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user.
# ---
slurm_login_ssh_root_public_keys = [
"<ENCRYPTION-METHOD HASH USER>",
"",
]

# endregion Login
Expand All @@ -306,7 +279,7 @@ slurm_login_ssh_root_public_keys = [
# Whether to enable Slurm metrics exporter.
# By default, true.
# ---
# slurm_exporter_enabled = false
slurm_exporter_enabled = true

# endregion Exporter

Expand All @@ -318,7 +291,7 @@ slurm_login_ssh_root_public_keys = [
# Whether to enable Slurm REST API.
# By default, false.
# ---
# slurm_rest_enabled = false
slurm_rest_enabled = false

# endregion REST API

Expand All @@ -334,7 +307,7 @@ slurm_login_ssh_root_public_keys = [
# Shared memory size for Slurm controller and worker nodes in GiB.
# By default, 64.
# ---
slurm_shared_memory_size_gibibytes = 256
slurm_shared_memory_size_gibibytes = 384

# endregion Config

Expand All @@ -349,22 +322,22 @@ slurm_shared_memory_size_gibibytes = 256
# It won't take effect in case of 1-GPU hosts.
# By default, true.
# ---
# nccl_benchmark_enable = false
nccl_benchmark_enable = true

# NCCL benchmark's CronJob schedule.
# By default, `0 */3 * * *` - every 3 hours.
# ---
# nccl_benchmark_enable = "0 */3 * * *"
nccl_benchmark_schedule = "0 */3 * * *"

# Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable.
# By default, 45.
# ---
# nccl_benchmark_min_threshold = 45
nccl_benchmark_min_threshold = 45

# Whether to run the test over InfiniBand, i.e. with the NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables set.
# By default, true
# ---
# nccl_use_infiniband = true
nccl_use_infiniband = true

# endregion NCCL benchmark

Expand All @@ -378,12 +351,12 @@ slurm_shared_memory_size_gibibytes = 256
# Whether to enable telemetry.
# By default, true.
# ---
# telemetry_enabled = false
telemetry_enabled = true

# Password of `admin` user of Grafana.
# Set it to your desired password.
# ---
telemetry_grafana_admin_password = "<YOUR-PASSWORD-FOR-GRAFANA>"
telemetry_grafana_admin_password = "password"

# endregion Telemetry

Expand All @@ -402,3 +375,32 @@ accounting_enabled = true
# endregion Accounting

# endregion Slurm

#----------------------------------------------------------------------------------------------------------------------#
# #
# Kubernetes #
# #
#----------------------------------------------------------------------------------------------------------------------#
# region k8s

# Version of k8s to be used.
# ---
k8s_version = "1.30"

# Name of the k8s cluster.
# ---
k8s_cluster_name = "soperator"

# SSH user credentials for accessing k8s nodes.
# This option adds a public IP address to every node.
# By default, empty list.
# ---
# k8s_cluster_node_ssh_access_users = [{
# name = "<USER1>"
# public_keys = [
# "<ENCRYPTION-METHOD1 HASH1 USER1>",
# "<ENCRYPTION-METHOD2 HASH2 USER1>",
# ]
# }]

# endregion k8s

0 comments on commit 8d20522

Please sign in to comment.