Skip to content

Commit

Permalink
Merge pull request #87 from nebius/main
Browse files (browse the repository at this point in the history)
Backporting from main
  • Loading branch information
shoguevara authored Nov 15, 2024
2 parents f2d1209 + 624f56f commit 990a7a8
Show file tree
Hide file tree
Showing 17 changed files with 262 additions and 192 deletions.
5 changes: 0 additions & 5 deletions soperator/Makefile
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,7 +4,6 @@ SHELL = /usr/bin/env bash -o pipefail
SOPERATOR_VERSION = $(shell cat VERSION)
SUBVERSION = $(shell cat SUBVERSION)
VERSION = $(SOPERATOR_VERSION)-$(SUBVERSION)
DOCKER_REGISTRY_NAME = soperator

ifeq ($(shell uname), Darwin)
SED_COMMAND = sed -i ''
Expand All @@ -16,10 +15,6 @@ endif
sync-version: ## Sync Soperator version from file
@echo 'Soperator version is - $(SOPERATOR_VERSION)'

@# region modules/slurm/locals.tf
@$(SED_COMMAND) "s|\(oci://cr.eu-north1.nebius.cloud/\)[^\"]*|\1$(DOCKER_REGISTRY_NAME)|" modules/slurm/locals.tf
@# endregion modules/slurm/locals.tf

@# region installations/example/terraform.tfvars
@echo 'Syncing installations/example/terraform.tfvars'
@$(SED_COMMAND) -E 's/slurm_operator_version *= *"[0-9]+.[0-9]+.[0-9]+[^ ]*"/slurm_operator_version = "$(SOPERATOR_VERSION)"/' installations/example/terraform.tfvars
Expand Down
2 changes: 1 addition & 1 deletion soperator/SUBVERSION
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
0
1
3 changes: 1 addition & 2 deletions soperator/installations/example/.envrc
Original file line number | Diff line number | Diff line change
Expand Up @@ -36,9 +36,8 @@ NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \
# Separate declaration and assignment for service account
NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account list \
--parent-id "${NEBIUS_PROJECT_ID}" \
--filter "name=slurm-terraform-sa" \
--format json \
| jq -r '.items[0].metadata.id')
| jq -r '.items[] | select(.metadata.name == "slurm-terraform-sa").metadata.id')

if [ -z "$NEBIUS_SA_TERRAFORM_ID" ]; then
NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account create \
Expand Down
6 changes: 3 additions & 3 deletions soperator/installations/example/main.tf
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,7 +4,7 @@ locals {
controller = module.resources.this[var.slurm_nodeset_controller.resource.platform][var.slurm_nodeset_controller.resource.preset]
workers = [for worker in var.slurm_nodeset_workers : module.resources.this[worker.resource.platform][worker.resource.preset]]
login = module.resources.this[var.slurm_nodeset_login.resource.platform][var.slurm_nodeset_login.resource.preset]
accounting = module.resources.this[var.slurm_nodeset_accounting.resource.platform][var.slurm_nodeset_accounting.resource.preset]
accounting = var.slurm_nodeset_accounting != null ? module.resources.this[var.slurm_nodeset_accounting.resource.platform][var.slurm_nodeset_accounting.resource.preset] : null
}

use_node_port = var.slurm_login_service_type == "NodePort"
Expand Down Expand Up @@ -210,11 +210,11 @@ module "slurm" {
memory_gibibytes = local.resources.login.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_login.boot_disk.size_gibibytes / 2)
}
accounting = {
accounting = var.accounting_enabled ? {
cpu_cores = local.resources.accounting.cpu_cores
memory_gibibytes = local.resources.accounting.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.slurm_nodeset_accounting.boot_disk.size_gibibytes / 2)
}
} : null
}

login_service_type = var.slurm_login_service_type
Expand Down
2 changes: 1 addition & 1 deletion soperator/installations/example/variables.tf
Original file line number | Diff line number | Diff line change
Expand Up @@ -331,7 +331,7 @@ variable "slurm_nodeset_workers" {
validation {
condition = length([for worker in var.slurm_nodeset_workers :
1 if worker.size % worker.split_factor != 0
]) == 1
]) == 0
error_message = "Worker count must be divisible by split_factor."
}
}
Expand Down
1 change: 1 addition & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub
Original file line number | Diff line number | Diff line change
Expand Up @@ -234,6 +234,7 @@ cleanup_preload_shared() {
#########################################################################
if [ -n "${CONTAINER_PRELOAD_SHARED_PATH}" ]; then
CONT_FILE="${CONTAINER_PRELOAD_SHARED_PATH}/containers/${SLURM_JOBID}_$(basename ${CONT}).squashfs"
mkdir -p "${CONTAINER_PRELOAD_SHARED_PATH}/containers"
# Prepull container image to the shared filesystem
srun --ntasks=1 enroot import --output ${CONT_FILE} docker://${CONT}
else
Expand Down
1 change: 0 additions & 1 deletion soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -137,4 +137,3 @@ sbatch \
run.sub

squeue

Loading

0 comments on commit 990a7a8

Please sign in to comment.