Merge pull request #132 from nebius/release/soperator-prerelease
Release Soperator 1.16.1-1
dstaroff authored Dec 23, 2024
2 parents 911004e + 1846ba3 commit 1cfacf1
Showing 115 changed files with 25,195 additions and 22,189 deletions.
File renamed without changes.
2 changes: 1 addition & 1 deletion modules/nfs-server/main.tf
@@ -28,7 +28,7 @@ resource "nebius_compute_v1_instance" "nfs_server" {
}
]

cloud_init_user_data = templatefile("../modules/cloud-init/nfs-cloud-init.tftpl", {
cloud_init_user_data = templatefile("${path.module}/files/nfs-cloud-init.tftpl", {
ssh_user_name = var.ssh_user_name,
ssh_public_key = var.ssh_public_key,
nfs_ip_range = var.nfs_ip_range,
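Note on the change above: replacing the working-directory-relative "../modules/cloud-init/..." path with "${path.module}/files/..." makes the template lookup independent of where terraform is invoked, because path.module always resolves to the directory of the module that contains the expression. A minimal stand-alone sketch of the same pattern (placeholder values, not the module's real inputs):

# Hypothetical illustration of the path fix above. path.module points at the
# directory of the containing module, so the template is found regardless of
# the caller's working directory. All values passed here are placeholders.
locals {
  nfs_cloud_init = templatefile("${path.module}/files/nfs-cloud-init.tftpl", {
    ssh_user_name  = "soperator"                     # placeholder user
    ssh_public_key = "ssh-ed25519 AAAA... user@host" # placeholder key
    nfs_ip_range   = "192.168.0.0/16"                # placeholder CIDR
  })
}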
2 changes: 1 addition & 1 deletion soperator/VERSION
@@ -1 +1 @@
1.16.0
1.16.1
82 changes: 61 additions & 21 deletions soperator/installations/example/.envrc
@@ -11,34 +11,46 @@ if [ -z "${NEBIUS_PROJECT_ID}" ]; then
return 1
fi

# Separate declaration and assignment for IAM token
# region IAM token

unset NEBIUS_IAM_TOKEN
nebius iam whoami > /dev/null
nebius iam get-access-token > /dev/null
NEBIUS_IAM_TOKEN=$(nebius iam get-access-token)
export NEBIUS_IAM_TOKEN

# Separate declaration and assignment for VPC subnet
# endregion IAM token

# region VPC subnet

NEBIUS_VPC_SUBNET_ID=$(nebius vpc subnet list \
--parent-id "${NEBIUS_PROJECT_ID}" \
--format json \
| jq -r '.items[0].metadata.id')
export NEBIUS_VPC_SUBNET_ID

# Export Nebius Cloud metadata to Terraform variables
# endregion VPC subnet

# region TF variables

export TF_VAR_iam_token="${NEBIUS_IAM_TOKEN}"
export TF_VAR_iam_tenant_id="${NEBIUS_TENANT_ID}"
export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}"
export TF_VAR_vpc_subnet_id="${NEBIUS_VPC_SUBNET_ID}"
export TFE_PARALLELISM=20

# Separate declaration and assignment for group editors
NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \
--parent-id "${NEBIUS_TENANT_ID}" \
--name 'editors' \
--format json \
| jq -r '.metadata.id')
echo "Exported variables:"
echo "NEBIUS_TENANT_ID: ${NEBIUS_TENANT_ID}"
echo "NEBIUS_PROJECT_ID: ${NEBIUS_PROJECT_ID}"
echo "NEBIUS_VPC_SUBNET_ID: ${NEBIUS_VPC_SUBNET_ID}"
echo "TFE_PARALLELISM: ${TFE_PARALLELISM}"

# endregion TF variables

# region Remote state

# region Service account

# Separate declaration and assignment for service account
NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account list \
--parent-id "${NEBIUS_PROJECT_ID}" \
--format json \
@@ -55,7 +67,16 @@ else
echo "Found existing service account with ID: $NEBIUS_SA_TERRAFORM_ID"
fi

# Check if service account is already a member of editors group
# endregion Service account

# region `editors` group

NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \
--parent-id "${NEBIUS_TENANT_ID}" \
--name 'editors' \
--format json \
| jq -r '.metadata.id')

IS_MEMBER=$(nebius iam group-membership list-members \
--parent-id "${NEBIUS_GROUP_EDITORS_ID}" \
--page-size 1000 \
@@ -73,19 +94,35 @@ else
echo "Service account is already a member of editors group"
fi

# Separate declaration and assignment for access key
# endregion `editors` group

# region Access key

DATE_FORMAT='+%Y-%m-%dT%H:%M:%SZ'

if [[ "$(uname)" == "Darwin" ]]; then
# macOS
EXPIRATION_DATE=$(date -v +1d "${DATE_FORMAT}")
else
# Linux (assumes GNU date)
EXPIRATION_DATE=$(date -d '+1 day' "${DATE_FORMAT}")
fi

echo 'Creating new access key for Object Storage'
NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key create \
--parent-id "${NEBIUS_PROJECT_ID}" \
--name "slurm-tf-ak-$(date +%s)" \
--account-service-account-id "${NEBIUS_SA_TERRAFORM_ID}" \
--description 'Temporary S3 Access' \
--expires-at "$(date -v+1d '+%Y-%m-%dT%H:%M:%SZ')" \
--expires-at "${EXPIRATION_DATE}" \
--format json \
| jq -r '.resource_id')
echo "Created new access key: ${NEBIUS_SA_ACCESS_KEY_ID}"

# Separate declaration and assignment for AWS access key
# endregion Access key

# region AWS access key

AWS_ACCESS_KEY_ID=$(nebius iam access-key get-by-id \
--id "${NEBIUS_SA_ACCESS_KEY_ID}" \
--format json | jq -r '.status.aws_access_key_id')
@@ -112,6 +149,10 @@ fi

export AWS_SECRET_ACCESS_KEY

# endregion AWS access key

# region Bucket

NEBIUS_BUCKET_NAME="tfstate-slurm-k8s-$(echo -n "${NEBIUS_TENANT_ID}-${NEBIUS_PROJECT_ID}" | md5sum | awk '$0=$1')"
export NEBIUS_BUCKET_NAME
# Check if bucket exists
@@ -130,14 +171,9 @@ else
echo "Using existing bucket: ${NEBIUS_BUCKET_NAME}"
fi

export TFE_PARALLELISM=20
# endregion Bucket

# print all exported variables
echo "Exported variables:"
echo "NEBIUS_TENANT_ID: ${NEBIUS_TENANT_ID}"
echo "NEBIUS_PROJECT_ID: ${NEBIUS_PROJECT_ID}"
echo "NEBIUS_BUCKET_NAME: ${NEBIUS_BUCKET_NAME}"
echo "TFE_PARALLELISM: ${TFE_PARALLELISM}"
# region Backend override

cat > terraform_backend_override.tf << EOF
terraform {
@@ -157,3 +193,7 @@ }
}
}
EOF

# endregion Backend override

# endregion Remote state
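For reference, the TF_VAR_* exports above are how this script hands values to Terraform: an environment variable named TF_VAR_<name> is read as the value of the root-module input variable <name>. A sketch of the matching declarations follows; only the variable names follow from the exports, while the exact types and descriptions in the installation's variables.tf are assumptions:

# Assumed shape of the root-module variables fed by the TF_VAR_* exports
# (TF_VAR_iam_token -> var.iam_token, and so on).
variable "iam_token" {
  description = "Nebius IAM token used by the provider."
  type        = string
  sensitive   = true
}

variable "iam_tenant_id" {
  description = "ID of the Nebius tenant."
  type        = string
}

variable "iam_project_id" {
  description = "ID of the Nebius project to deploy into."
  type        = string
}

variable "vpc_subnet_id" {
  description = "ID of the VPC subnet used by the cluster."
  type        = string
}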
93 changes: 73 additions & 20 deletions soperator/installations/example/main.tf
@@ -68,6 +68,24 @@ module "filestore" {
}
}

module "nfs-server" {
count = var.nfs.enabled ? 1 : 0
source = "../../../modules/nfs-server"
parent_id = data.nebius_iam_v1_project.this.id
subnet_id = data.nebius_vpc_v1_subnet.this.id
ssh_user_name = "soperator"
ssh_public_key = var.slurm_login_ssh_root_public_keys[0]
nfs_ip_range = data.nebius_vpc_v1_subnet.this.ipv4_private_pools.pools[0].cidrs[0].cidr
nfs_size = var.nfs.size_gibibytes * 1024 * 1024 * 1024
nfs_path = "/mnt/nfs"
platform = var.nfs.resource.platform
preset = var.nfs.resource.preset

providers = {
nebius = nebius
}
}

module "k8s" {
depends_on = [
module.filestore,
@@ -83,14 +101,15 @@ module "k8s" {
k8s_version = var.k8s_version
name = var.k8s_cluster_name
slurm_cluster_name = var.slurm_cluster_name
company_name = var.company_name

node_group_system = var.slurm_nodeset_system
node_group_controller = var.slurm_nodeset_controller
node_group_workers = flatten([for i, nodeset in var.slurm_nodeset_workers :
[
for subset in range(ceil(nodeset.size / nodeset.split_factor)) :
for subset in range(ceil(nodeset.size / nodeset.nodes_per_nodegroup)) :
{
size = nodeset.split_factor
size = nodeset.nodes_per_nodegroup
max_unavailable_percent = nodeset.max_unavailable_percent
resource = nodeset.resource
boot_disk = nodeset.boot_disk
@@ -177,8 +196,11 @@ module "slurm" {

source = "../../modules/slurm"

name = var.slurm_cluster_name
operator_version = var.slurm_operator_version
name = var.slurm_cluster_name
operator_version = var.slurm_operator_version
k8s_cluster_context = module.k8s.cluster_context

iam_project_id = var.iam_project_id

node_count = {
controller = var.slurm_nodeset_controller.size
@@ -188,32 +210,54 @@

resources = {
system = {
cpu_cores = local.resources.system.cpu_cores
memory_gibibytes = local.resources.system.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_system.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
cpu_cores = local.resources.system.cpu_cores
memory_gibibytes = local.resources.system.memory_gibibytes
ephemeral_storage_gibibytes = floor(
module.resources.k8s_ephemeral_storage_coefficient * var.slurm_nodeset_system.boot_disk.size_gibibytes
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
}
controller = {
cpu_cores = local.resources.controller.cpu_cores
memory_gibibytes = local.resources.controller.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_controller.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
cpu_cores = local.resources.controller.cpu_cores
memory_gibibytes = local.resources.controller.memory_gibibytes
ephemeral_storage_gibibytes = floor(
module.resources.k8s_ephemeral_storage_coefficient * var.slurm_nodeset_controller.boot_disk.size_gibibytes
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
}
worker = [for i, worker in var.slurm_nodeset_workers :
{
cpu_cores = local.resources.workers[i].cpu_cores
memory_gibibytes = local.resources.workers[i].memory_gibibytes
ephemeral_storage_gibibytes = ceil(worker.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
gpus = local.resources.workers[i].gpus
cpu_cores = local.resources.workers[i].cpu_cores
memory_gibibytes = local.resources.workers[i].memory_gibibytes
ephemeral_storage_gibibytes = (
module.k8s.gpu_involved
? floor(
module.resources.k8s_ephemeral_storage_coefficient * worker.boot_disk.size_gibibytes
-2 * module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
: floor(
module.resources.k8s_ephemeral_storage_coefficient * worker.boot_disk.size_gibibytes
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
)
gpus = local.resources.workers[i].gpus
}
]
login = {
cpu_cores = local.resources.login.cpu_cores
memory_gibibytes = local.resources.login.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_login.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
cpu_cores = local.resources.login.cpu_cores
memory_gibibytes = local.resources.login.memory_gibibytes
ephemeral_storage_gibibytes = floor(
module.resources.k8s_ephemeral_storage_coefficient * var.slurm_nodeset_login.boot_disk.size_gibibytes
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
}
accounting = var.accounting_enabled ? {
cpu_cores = local.resources.accounting.cpu_cores
memory_gibibytes = local.resources.accounting.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_accounting.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
cpu_cores = local.resources.accounting.cpu_cores
memory_gibibytes = local.resources.accounting.memory_gibibytes
ephemeral_storage_gibibytes = floor(
module.resources.k8s_ephemeral_storage_coefficient * var.slurm_nodeset_accounting.boot_disk.size_gibibytes
-module.resources.k8s_ephemeral_storage_reserve.gibibytes
)
} : null
}

@@ -249,6 +293,13 @@
} : null
}

nfs = {
enabled = var.nfs.enabled
path = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
host = var.nfs.enabled ? module.nfs-server[0].nfs_server_internal_ip : null
mount_path = var.nfs.enabled ? var.nfs.mount_path : null
}

shared_memory_size_gibibytes = var.slurm_shared_memory_size_gibibytes

nccl_topology_type = var.slurm_nodeset_workers[0].resource.platform == "gpu-h100-sxm" ? "H100 GPU cluster" : "auto"
@@ -278,6 +329,8 @@ module "login_script" {
}
slurm_cluster_name = var.slurm_cluster_name

k8s_cluster_context = module.k8s.cluster_context

providers = {
kubernetes = kubernetes
}
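The ephemeral-storage sizing in this file changes from ceil(boot_disk - reserve) to floor(coefficient * boot_disk - reserve), and GPU worker groups now subtract the reserve twice. A worked sketch with made-up numbers, since the real k8s_ephemeral_storage_coefficient and k8s_ephemeral_storage_reserve values come from the resources module and are not shown in this diff:

# Hypothetical numbers only - the actual coefficient and reserve are defined
# in module "resources", outside this diff.
locals {
  coefficient       = 0.9 # assumed usable fraction of the boot disk
  reserve_gibibytes = 32  # assumed fixed per-node reserve
  boot_disk_gib     = 512

  # Non-GPU node groups subtract the reserve once:
  ephemeral_storage_gib     = floor(local.coefficient * local.boot_disk_gib - local.reserve_gibibytes)     # 428
  # GPU worker groups subtract it twice:
  ephemeral_storage_gpu_gib = floor(local.coefficient * local.boot_disk_gib - 2 * local.reserve_gibibytes) # 396
}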
25 changes: 21 additions & 4 deletions soperator/installations/example/terraform.tfvars
@@ -6,6 +6,9 @@
# #
#----------------------------------------------------------------------------------------------------------------------#

# Name of the company. It is used as the context name of the cluster in the .kubeconfig file.
company_name = "company"

#----------------------------------------------------------------------------------------------------------------------#
# #
# #
@@ -94,6 +97,20 @@ filestore_accounting = {

# endregion Storage

# region nfs-server

# nfs = {
# enabled = true
# size_gibibytes = 93
# mount_path = "/mnt/nfs"
# resource = {
# platform = "cpu-e2"
# preset = "16vcpu-64gb"
# }
# }

# endregion nfs-server

#----------------------------------------------------------------------------------------------------------------------#
# #
# #
@@ -140,7 +157,7 @@ slurm_cluster_name = "soperator"

# Version of soperator.
# ---
slurm_operator_version = "1.16.0"
slurm_operator_version = "1.16.1"

# Type of the Slurm partition config. Could be either `default` or `custom`.
# By default, "default".
@@ -195,13 +212,13 @@ slurm_nodeset_controller = {

# Configuration of Slurm Worker node sets.
# There can be only one Worker node set for now.
# Split factor allows you to split node set into equally-sized node groups to keep your cluster accessible and working
# during maintenance. Example: split_factor 3 for 12 nodes will create for you 3 groups with 4 nodes in every group.
# nodes_per_nodegroup allows you to split a node set into equally-sized node groups to keep your cluster accessible and working
# during maintenance. Example: nodes_per_nodegroup = 3 for size = 12 nodes will create 4 groups with 3 nodes in each group.
# infiniband_fabric is a required field
# ---
slurm_nodeset_workers = [{
size = 16
split_factor = 4
nodes_per_nodegroup = 4
max_unavailable_percent = 50
resource = {
platform = "gpu-h100-sxm"
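For reference, nodes_per_nodegroup feeds the flatten expression in main.tf shown earlier in this diff: the node set is cut into ceil(size / nodes_per_nodegroup) groups of nodes_per_nodegroup nodes each, so the size = 16, nodes_per_nodegroup = 4 configuration above yields 4 groups of 4 nodes. A stand-alone sketch of that calculation:

# Stand-alone illustration of the node-group split performed in main.tf.
locals {
  nodeset = {
    size                = 16
    nodes_per_nodegroup = 4
  }

  worker_groups = [
    for subset in range(ceil(local.nodeset.size / local.nodeset.nodes_per_nodegroup)) : {
      group_index = subset
      size        = local.nodeset.nodes_per_nodegroup
    }
  ]
  # -> 4 objects, each describing a node group of 4 nodes
}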