Skip to content

Commit

Permalink
Merge pull request #118 from nebius/release/soperator-prerelease
Browse files Browse the repository at this point in the history
Release soperator 1.16.0
  • Loading branch information
asteny authored Dec 16, 2024
2 parents 0ed4a5e + 772c7e3 commit 4289945
Show file tree
Hide file tree
Showing 13 changed files with 101 additions and 71 deletions.
1 change: 1 addition & 0 deletions soperator/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
.terraform*
installations/alexkim
16 changes: 7 additions & 9 deletions soperator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ Before starting, ensure you have these tools installed:
- [Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli)
- [Nebius CLI](https://nebius.com/docs/cli/quickstart)
- [kubectl](https://kubernetes.io/docs/tasks/tools/)
- AWS CLI: `python -m pip install awscli`
- [jq](https://jqlang.github.io/jq/download/)
- coreutils:
- macOS: `brew install coreutils`
Expand All @@ -31,18 +30,17 @@ Before starting, ensure you have these tools installed:

1. **Create Your Installation Directory**
```bash
mkdir -p installations/<your-name>
cd installations/<your-name>
export INSTALLATION_NAME=<your-name> # e.g. customer name
mkdir -p installations/$INSTALLATION_NAME
cd installations/$INSTALLATION_NAME
cp -r ../example/ ./
```

2. **Set Up Your Environment**
```bash
# Set your Nebius project details
export NEBIUS_TENANT_ID='<your-tenant-id>'
export NEBIUS_PROJECT_ID='<your-project-id>'

# Load environment variables
Set your NEBIUS_TENANT_ID and NEBIUS_PROJECT_ID in the `.envrc` file, then run:

```bash
source .envrc
```

Expand Down Expand Up @@ -166,4 +164,4 @@ tail -f outputs/nccl.out
# Container test
sbatch enroot.sh
tail -f outputs/enroot.out
```
```
2 changes: 1 addition & 1 deletion soperator/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.15.5
1.16.0
101 changes: 52 additions & 49 deletions soperator/installations/example/.envrc
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
NEBIUS_TENANT_ID='tenant-...'
NEBIUS_PROJECT_ID='project-...'

if [ -z "${NEBIUS_TENANT_ID}" ]; then
echo "Error: NEBIUS_TENANT_ID is not set"
return 1
echo "Error: NEBIUS_TENANT_ID is not set"
return 1
fi

if [ -z "${NEBIUS_PROJECT_ID}" ]; then
echo "Error: NEBIUS_PROJECT_ID is not set"
return 1
echo "Error: NEBIUS_PROJECT_ID is not set"
return 1
fi

# Separate declaration and assignment for IAM token
Expand All @@ -14,24 +17,26 @@ nebius iam whoami > /dev/null
nebius iam get-access-token > /dev/null
NEBIUS_IAM_TOKEN=$(nebius iam get-access-token)
export NEBIUS_IAM_TOKEN
export TF_VAR_iam_token="${NEBIUS_IAM_TOKEN}"
export TF_VAR_iam_tenant_id="${NEBIUS_TENANT_ID}"
export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}"

# Separate declaration and assignment for VPC subnet
NEBIUS_VPC_SUBNET_ID=$(nebius vpc subnet list \
--parent-id "${NEBIUS_PROJECT_ID}" \
--format json \
| jq -r '.items[0].metadata.id')
export NEBIUS_VPC_SUBNET_ID

# Export Nebius Cloud metadata to Terraform variables
export TF_VAR_iam_token="${NEBIUS_IAM_TOKEN}"
export TF_VAR_iam_tenant_id="${NEBIUS_TENANT_ID}"
export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}"
export TF_VAR_vpc_subnet_id="${NEBIUS_VPC_SUBNET_ID}"

# Separate declaration and assignment for group editors
NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \
--parent-id "${NEBIUS_TENANT_ID}" \
--name 'editors' \
--format json \
| jq -r '.metadata.id')
--parent-id "${NEBIUS_TENANT_ID}" \
--name 'editors' \
--format json \
| jq -r '.metadata.id')

# Separate declaration and assignment for service account
NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account list \
Expand All @@ -52,45 +57,45 @@ fi

# Check if service account is already a member of editors group
IS_MEMBER=$(nebius iam group-membership list-members \
--parent-id "${NEBIUS_GROUP_EDITORS_ID}" \
--format json \
| jq -r --arg SAID "${NEBIUS_SA_TERRAFORM_ID}" '.memberships[] | select(.spec.member_id == $SAID) | .spec.member_id')
--parent-id "${NEBIUS_GROUP_EDITORS_ID}" \
--page-size 1000 \
--format json \
| jq -r --arg SAID "${NEBIUS_SA_TERRAFORM_ID}" '.memberships[] | select(.spec.member_id == $SAID) | .spec.member_id')


# Add service account to group editors only if not already a member
if [ -z "${IS_MEMBER}" ]; then
nebius iam group-membership create \
--parent-id "${NEBIUS_GROUP_EDITORS_ID}" \
--member-id "${NEBIUS_SA_TERRAFORM_ID}"
echo "Added service account to editors group"
nebius iam group-membership create \
--parent-id "${NEBIUS_GROUP_EDITORS_ID}" \
--member-id "${NEBIUS_SA_TERRAFORM_ID}"
echo "Added service account to editors group"
else
echo "Service account is already a member of editors group"
echo "Service account is already a member of editors group"
fi

# Separate declaration and assignment for access key
NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key list \
echo 'Creating new access key for Object Storage'
NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key create \
--parent-id "${NEBIUS_PROJECT_ID}" \
--name "slurm-tf-ak-$(date +%s)" \
--account-service-account-id "${NEBIUS_SA_TERRAFORM_ID}" \
--description 'Temporary S3 Access' \
--expires-at "$(date -v+1d '+%Y-%m-%dT%H:%M:%SZ')" \
--format json \
| jq -r '.items // [] | map(select(.metadata.name == "slurm-terraform-sa-access-key")) | .[0].metadata.id // empty')

if [ -z "${NEBIUS_SA_ACCESS_KEY_ID}" ]; then
NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key create \
--parent-id "${NEBIUS_PROJECT_ID}" \
--name 'slurm-terraform-sa-access-key' \
--account-service-account-id "${NEBIUS_SA_TERRAFORM_ID}" \
--description 'AWS CLI key' \
--format json \
| jq -r '.resource_id')
fi
| jq -r '.resource_id')
echo "Created new access key: ${NEBIUS_SA_ACCESS_KEY_ID}"

# Separate declaration and assignment for AWS access key
AWS_ACCESS_KEY_ID=$(nebius iam access-key get-by-id \
--id "${NEBIUS_SA_ACCESS_KEY_ID}" \
--format json | jq -r '.status.aws_access_key_id')
export AWS_ACCESS_KEY_ID

if [ -f '.aws_secret_access_key' ] && [ -s '.aws_secret_access_key' ]; then
if [ "$(tr -d '[:space:]' < .aws_secret_access_key | wc -c)" -gt 0 ]; then
echo "Using existing AWS_SECRET_ACCESS_KEY from .aws_secret_access_key"
AWS_SECRET_ACCESS_KEY="$(cat '.aws_secret_access_key')"
else
echo "Generating new AWS_SECRET_ACCESS_KEY"
AWS_SECRET_ACCESS_KEY="$(nebius iam access-key get-secret-once \
--id "${NEBIUS_SA_ACCESS_KEY_ID}" \
--format json \
Expand All @@ -99,32 +104,30 @@ else
fi

if [ -z "${AWS_SECRET_ACCESS_KEY}" ]; then
echo "Error: AWS_SECRET_ACCESS_KEY is empty" >&2
exit 1
echo "!!! ERROR: AWS_SECRET_ACCESS_KEY is empty !!!"
echo " To generate a new key, first delete the existing key:"
echo "    nebius iam access-key delete --id ${NEBIUS_SA_ACCESS_KEY_ID}"
echo "  Then run 'source .envrc' again."
fi

export AWS_SECRET_ACCESS_KEY

aws configure set aws_access_key_id "${AWS_ACCESS_KEY_ID}"
aws configure set aws_secret_access_key "${AWS_SECRET_ACCESS_KEY}"
aws configure set region 'eu-north1'
aws configure set endpoint_url 'https://storage.eu-north1.nebius.cloud:443'

export NEBIUS_BUCKET_NAME="tfstate-slurm-k8s-$(echo -n "${NEBIUS_TENANT_ID}-${NEBIUS_PROJECT_ID}" | md5sum | awk '$0=$1')"
NEBIUS_BUCKET_NAME="tfstate-slurm-k8s-$(echo -n "${NEBIUS_TENANT_ID}-${NEBIUS_PROJECT_ID}" | md5sum | awk '$0=$1')"
export NEBIUS_BUCKET_NAME
# Check if bucket exists
EXISTING_BUCKET=$(nebius storage bucket list \
--parent-id "${NEBIUS_PROJECT_ID}" \
--format json \
| jq -r --arg BUCKET "${NEBIUS_BUCKET_NAME}" '.items[] | select(.metadata.name == $BUCKET) | .metadata.name')
--parent-id "${NEBIUS_PROJECT_ID}" \
--format json \
| jq -r --arg BUCKET "${NEBIUS_BUCKET_NAME}" '.items[] | select(.metadata.name == $BUCKET) | .metadata.name')

if [ -z "${EXISTING_BUCKET}" ]; then
nebius storage bucket create \
--name "${NEBIUS_BUCKET_NAME}" \
--parent-id "${NEBIUS_PROJECT_ID}" \
--versioning-policy 'enabled'
echo "Created bucket: ${NEBIUS_BUCKET_NAME}"
nebius storage bucket create \
--name "${NEBIUS_BUCKET_NAME}" \
--parent-id "${NEBIUS_PROJECT_ID}" \
--versioning-policy 'enabled'
echo "Created bucket: ${NEBIUS_BUCKET_NAME}"
else
echo "Using existing bucket: ${NEBIUS_BUCKET_NAME}"
echo "Using existing bucket: ${NEBIUS_BUCKET_NAME}"
fi

export TFE_PARALLELISM=20
Expand Down
10 changes: 5 additions & 5 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -190,30 +190,30 @@ module "slurm" {
system = {
cpu_cores = local.resources.system.cpu_cores
memory_gibibytes = local.resources.system.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_system.boot_disk.size_gibibytes / 2)
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_system.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
}
controller = {
cpu_cores = local.resources.controller.cpu_cores
memory_gibibytes = local.resources.controller.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_controller.boot_disk.size_gibibytes / 2)
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_controller.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
}
worker = [for i, worker in var.slurm_nodeset_workers :
{
cpu_cores = local.resources.workers[i].cpu_cores
memory_gibibytes = local.resources.workers[i].memory_gibibytes
ephemeral_storage_gibibytes = ceil(worker.boot_disk.size_gibibytes / 2)
ephemeral_storage_gibibytes = ceil(worker.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
gpus = local.resources.workers[i].gpus
}
]
login = {
cpu_cores = local.resources.login.cpu_cores
memory_gibibytes = local.resources.login.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_login.boot_disk.size_gibibytes / 2)
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_login.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
}
accounting = var.accounting_enabled ? {
cpu_cores = local.resources.accounting.cpu_cores
memory_gibibytes = local.resources.accounting.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_accounting.boot_disk.size_gibibytes / 2)
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_accounting.boot_disk.size_gibibytes - module.resources.k8s_ephemeral_storage_reserve.gibibytes)
} : null
}

Expand Down
2 changes: 1 addition & 1 deletion soperator/installations/example/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ terraform {
required_providers {
nebius = {
source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius"
version = "0.4.4"
version = ">=0.4"
}

units = {
Expand Down
2 changes: 1 addition & 1 deletion soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ slurm_cluster_name = "soperator"

# Version of soperator.
# ---
slurm_operator_version = "1.15.5"
slurm_operator_version = "1.16.0"

# Type of the Slurm partition config. Could be either `default` or `custom`.
# By default, "default".
Expand Down
4 changes: 4 additions & 0 deletions soperator/modules/available_resources/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,7 @@ locals {
})
})
}

data "units_data_size" "k8s_ephemeral_storage_reserve" {
gibibytes = 64
}
4 changes: 4 additions & 0 deletions soperator/modules/available_resources/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@ output "this" {
description = "Map of available node resources grouped by platform -> preset."
value = local.resources
}

output "k8s_ephemeral_storage_reserve" {
value = data.units_data_size.k8s_ephemeral_storage_reserve
}
8 changes: 8 additions & 0 deletions soperator/modules/available_resources/terraform.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
terraform {
required_providers {
units = {
source = "dstaroff/units"
}
}
}

module "labels" {
source = "../labels"
}
15 changes: 10 additions & 5 deletions soperator/modules/slurm/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ resource "helm_release" "slurm_operator" {
name = "controllerManager.manager.env.isMariadbCrdInstalled"
value = var.accounting_enabled
}
set {
name = "certManager.enabled"
value = var.telemetry_enabled
}

wait = true
wait_for_jobs = true
Expand Down Expand Up @@ -163,15 +167,16 @@ resource "helm_release" "slurm_cluster" {

nccl_topology_type = var.nccl_topology_type
nccl_benchmark = {
enable = var.nccl_benchmark_enable
schedule = var.nccl_benchmark_schedule
min_threshold = var.nccl_benchmark_min_threshold
use_infiniband = var.nccl_use_infiniband
enable = var.nccl_benchmark_enable
schedule = var.nccl_benchmark_schedule
min_threshold = var.nccl_benchmark_min_threshold
use_infiniband = var.nccl_use_infiniband
}

nodes = {
accounting = {
enabled = var.accounting_enabled
enabled = var.accounting_enabled
use_protected_secret = var.use_protected_secret
mariadb_operator = var.accounting_enabled ? {
enabled = var.accounting_enabled
storage_size = var.accounting_enabled ? var.filestores.accounting.size_gibibytes : 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ slurmNodes:
mariadbOperator:
enabled: ${nodes.accounting.mariadb_operator.enabled}
%{~ if nodes.accounting.mariadb_operator.enabled ~}
protectedSecret: ${nodes.accounting.use_protected_secret}
resources:
cpu: ${nodes.accounting.mariadb_operator.resources.cpu * 1000}m
memory: ${nodes.accounting.mariadb_operator.resources.memory}Gi
Expand Down
6 changes: 6 additions & 0 deletions soperator/modules/slurm/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,12 @@ variable "accounting_enabled" {
default = false
}

variable "use_protected_secret" {
description = "If true, the protected MariaDB user secret will not be deleted after the MariaDB CR is deleted."
type = bool
default = false
}

variable "slurmdbd_config" {
description = "Slurmdbd.conf configuration. See https://slurm.schedmd.com/slurmdbd.conf.html. Not all options are supported."
type = map(any)
Expand Down

0 comments on commit 4289945

Please sign in to comment.