Skip to content

Commit

Permalink
Platform and preset moved to variables across the library;
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-k-nebius committed Nov 21, 2024
1 parent 8584e20 commit 8791f52
Show file tree
Hide file tree
Showing 9 changed files with 54 additions and 10 deletions.
2 changes: 2 additions & 0 deletions k8s-inference/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
# K8s nodes
cpu_nodes_count = 1 # Number of CPU nodes
cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset
cpu_nodes_platform = "cpu-e2" # The CPU node platform
gpu_nodes_count = 1 # Number of GPU nodes
gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform
gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb" to deploy nodes with 8 GPUs.

# Observability
Expand Down
3 changes: 3 additions & 0 deletions k8s-training/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@

# K8s nodes
cpu_nodes_count = 1 # Number of CPU nodes
cpu_nodes_platform = "cpu-e2" # The CPU node platform
cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset
gpu_nodes_count = 1 # Number of GPU nodes
gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform
gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPUs can be added to a GPU cluster with an InfiniBand connection.
infiniband_fabric = "fabric-3" # InfiniBand fabric name.


# Observability
Expand Down
4 changes: 2 additions & 2 deletions slurm/slurm-master.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "master" {
name = "slurm-master"
parent_id = var.parent_id
resources = {
platform = "cpu-e2"
preset = "4vcpu-16gb"
platform = var.master_platform
preset = var.master_preset
}
boot_disk = {
attach_mode = "READ_WRITE"
Expand Down
4 changes: 2 additions & 2 deletions slurm/slurm-worker.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ resource "nebius_compute_v1_instance" "worker" {
name = each.key
parent_id = var.parent_id
resources = {
platform = "gpu-h100-sxm"
preset = "8gpu-128vcpu-1600gb"
platform = var.worker_platform
preset = var.worker_preset
}
gpu_cluster = nebius_compute_v1_gpu_cluster.gpu-cluster-slurm

Expand Down
5 changes: 5 additions & 0 deletions slurm/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@ shared_fs_type = "filesystem" # "nfs" or "filesystem"
# key = "put your public ssh key here"
# path = "put path to ssh key here"
# }

master_platform = "cpu-e2" # Platform for Slurm Master
master_preset = "4vcpu-16gb" # Preset for Slurm Master
worker_platform = "gpu-h100-sxm" # Platform for Slurm Worker
worker_preset = "8gpu-128vcpu-1600gb" # Preset for Slurm Worker
23 changes: 20 additions & 3 deletions slurm/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,29 @@ variable "ssh_public_key" {
}
}

variable "platform_id" {
variable "master_platform" {
description = "Platform for Slurm Master."
type = string
description = "Platform for workers: gpu-h100-b for Inspur or gpu-h100 for Gigabyte"
default = "gpu-h100-b"
default = "cpu-e2"
}

# Resource preset of the Slurm master instance (read as var.master_preset in slurm-master.tf).
variable "master_preset" {
  type        = string
  default     = "4vcpu-16gb"
  description = "Preset for Slurm Master."
}

# Compute platform of the Slurm worker instances (read as var.worker_platform in slurm-worker.tf).
variable "worker_platform" {
  type        = string
  default     = "gpu-h100-sxm"
  description = "Platform for Slurm Worker."
}

# Resource preset of the Slurm worker instances (read as var.worker_preset in slurm-worker.tf).
variable "worker_preset" {
  type        = string
  default     = "8gpu-128vcpu-1600gb"
  description = "Preset for Slurm Worker."
}

variable "mysql_jobs_backend" {
type = bool
Expand Down
4 changes: 2 additions & 2 deletions wireguard/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "wireguard_instance" {
]

resources = {
platform = "cpu-e2"
preset = "4vcpu-16gb"
platform = var.platform
preset = var.preset
}


Expand Down
3 changes: 3 additions & 0 deletions wireguard/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@
# path = "put path to ssh key here"
# }
# public_ip_allocation_id = ""

# Compute platform and preset for the WireGuard host.
# Both variables are declared as strings, so the values must be quoted literals.
platform = "cpu-e2"
preset   = "4vcpu-16gb"
16 changes: 15 additions & 1 deletion wireguard/variables.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# K8s cluster
# Global parameters
variable "parent_id" {
description = "Project ID."
type = string
Expand All @@ -9,6 +9,20 @@ variable "subnet_id" {
type = string
}


# Platform
# Compute platform of the WireGuard instance (read as var.platform in main.tf).
variable "platform" {
  type        = string
  default     = "cpu-e2"
  description = "Platform for WireGuard host."
}

# Resource preset of the WireGuard instance (read as var.preset in main.tf).
variable "preset" {
  type        = string
  default     = "4vcpu-16gb"
  description = "Preset for WireGuard host."
}

# SSH access
variable "ssh_user_name" {
description = "SSH username."
Expand Down

0 comments on commit 8791f52

Please sign in to comment.