Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Platform and preset moved to variables across the library; #94

Merged
merged 22 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
8791f52
Platform and preset moved to variables across the library;
elijah-k-nebius Nov 21, 2024
04ab264
Added "region" variable to control platform defaults (k8s-inference);
elijah-k-nebius Nov 21, 2024
5eed389
Added "region" variable to control platform defaults (k8s-inference (…
elijah-k-nebius Nov 21, 2024
1984342
Added "region" variable to control platform defaults (k8s-inference (…
elijah-k-nebius Nov 21, 2024
3c8423a
Added "region" variable to control platform defaults (k8s-training);
elijah-k-nebius Nov 21, 2024
ab7b2de
Added "region" variable to control platform defaults (GlusterFS module);
elijah-k-nebius Nov 21, 2024
5216c7a
Added "region" variable to control platform defaults (NFS Server);
elijah-k-nebius Nov 21, 2024
e704aad
Tf fmt
d3vil-st Nov 21, 2024
3f3ce80
Tests fixed;
elijah-k-nebius Nov 21, 2024
d5512e2
Tf fmt;
elijah-k-nebius Nov 21, 2024
2991b9e
Tests fixed (2);
elijah-k-nebius Nov 21, 2024
1948cca
Added "region" variable to control platform defaults (WireGuard);
elijah-k-nebius Nov 21, 2024
157adb9
Added "region" variable to control platform defaults (Slurm);
elijah-k-nebius Nov 21, 2024
4a55aaf
terraform.tfvars files refactored;
elijah-k-nebius Nov 21, 2024
f4e08df
TF fmt
d3vil-st Nov 21, 2024
5a18cda
Added region variables for tests
d3vil-st Nov 21, 2024
26de355
Clean region variable for tests
d3vil-st Nov 21, 2024
1f26685
Clean region variable for tests
d3vil-st Nov 21, 2024
8d281b8
Added "region" variable to control platform defaults (GlusterFS (2));
elijah-k-nebius Nov 21, 2024
b7f9b41
Presets fixed;
elijah-k-nebius Nov 21, 2024
398051a
Added "region" variable to control platform defaults (WireGuard (2));
elijah-k-nebius Nov 21, 2024
3a0ab6b
Tests fixed (3);
elijah-k-nebius Nov 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions k8s-inference/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
# K8s nodes
cpu_nodes_count = 1 # Number of CPU nodes
cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset
cpu_nodes_platform = "cpu-e2" # The CPU node platform
gpu_nodes_count = 1 # Number of GPU nodes
gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform
gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb" to deploy nodes with 8 GPUs.

# Observability
Expand Down
3 changes: 3 additions & 0 deletions k8s-training/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@

# K8s nodes
cpu_nodes_count = 1 # Number of CPU nodes
cpu_nodes_platform = "cpu-e2" # The CPU node platform
cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset
gpu_nodes_count = 1 # Number of GPU nodes
gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform
gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPUs can be added to a GPU cluster with an InfiniBand connection.
infiniband_fabric = "fabric-3" # InfiniBand fabric name.


# Observability
Expand Down
4 changes: 2 additions & 2 deletions slurm/slurm-master.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "master" {
name = "slurm-master"
parent_id = var.parent_id
resources = {
platform = "cpu-e2"
preset = "4vcpu-16gb"
platform = var.master_platform
preset = var.master_preset
}
boot_disk = {
attach_mode = "READ_WRITE"
Expand Down
4 changes: 2 additions & 2 deletions slurm/slurm-worker.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ resource "nebius_compute_v1_instance" "worker" {
name = each.key
parent_id = var.parent_id
resources = {
platform = "gpu-h100-sxm"
preset = "8gpu-128vcpu-1600gb"
platform = var.worker_platform
preset = var.worker_preset
}
gpu_cluster = nebius_compute_v1_gpu_cluster.gpu-cluster-slurm

Expand Down
5 changes: 5 additions & 0 deletions slurm/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@ shared_fs_type = "filesystem" # "nfs" or "filesystem"
# key = "put your public ssh key here"
# path = "put path to ssh key here"
# }

# Platform and preset for the Slurm master and worker instances.
# Read as var.master_* in slurm-master.tf and var.worker_* in slurm-worker.tf.
master_platform = "cpu-e2"
master_preset = "4vcpu-16gb"
worker_platform = "gpu-h100-sxm"
worker_preset = "8gpu-128vcpu-1600gb"
23 changes: 20 additions & 3 deletions slurm/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,29 @@ variable "ssh_public_key" {
}
}

variable "platform_id" {
variable "master_platform" {
description = "Platform for Slurm Master."
type = string
description = "Platform for workers: gpu-h100-b for Inspur or gpu-h100 for Gigabyte"
default = "gpu-h100-b"
default = "cpu-e2"
}

# Value passed to resources.preset of nebius_compute_v1_instance.master
# (slurm-master.tf).
variable "master_preset" {
description = "Preset for Slurm Master."
type = string
default = "4vcpu-16gb"
}

# Value passed to resources.platform of nebius_compute_v1_instance.worker
# (slurm-worker.tf).
variable "worker_platform" {
description = "Platform for Slurm Worker."
type = string
default = "gpu-h100-sxm"
}

# Value passed to resources.preset of nebius_compute_v1_instance.worker
# (slurm-worker.tf).
# NOTE(review): the platform and preset are presumably validated as a pair by
# the provider — confirm that overriding one without the other is supported.
variable "worker_preset" {
description = "Preset for Slurm Worker."
type = string
default = "8gpu-128vcpu-1600gb"
}

variable "mysql_jobs_backend" {
type = bool
Expand Down
4 changes: 2 additions & 2 deletions wireguard/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "wireguard_instance" {
]

resources = {
platform = "cpu-e2"
preset = "4vcpu-16gb"
platform = var.platform
preset = var.preset
}


Expand Down
3 changes: 3 additions & 0 deletions wireguard/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@
# path = "put path to ssh key here"
# }
# public_ip_allocation_id = ""

# Platform and preset for the WireGuard host (read as var.platform / var.preset
# in main.tf). Both are string variables, so the values must be quoted:
# an unquoted cpu-e2 is parsed as the expression `cpu - e2`, which is not
# allowed in a .tfvars file.
platform = "cpu-e2"
preset = "4vcpu-16gb"
16 changes: 15 additions & 1 deletion wireguard/variables.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# K8s cluster
# Global parameters
variable "parent_id" {
description = "Project ID."
type = string
Expand All @@ -9,6 +9,20 @@ variable "subnet_id" {
type = string
}


# Platform
# Value passed to resources.platform of
# nebius_compute_v1_instance.wireguard_instance (main.tf).
variable "platform" {
description = "Platform for WireGuard host."
type = string
default = "cpu-e2"
}

# Value passed to resources.preset of
# nebius_compute_v1_instance.wireguard_instance (main.tf).
variable "preset" {
description = "Preset for WireGuard host."
type = string
default = "4vcpu-16gb"
}

# SSH access
variable "ssh_user_name" {
description = "SSH username."
Expand Down
Loading