From 8791f52ef466ce1b439490fb5abfe11304f0266b Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 12:51:21 +0200 Subject: [PATCH 01/22] Platform and preset moved to variables across the library; --- k8s-inference/terraform.tfvars | 2 ++ k8s-training/terraform.tfvars | 3 +++ slurm/slurm-master.tf | 4 ++-- slurm/slurm-worker.tf | 4 ++-- slurm/terraform.tfvars | 5 +++++ slurm/variables.tf | 23 ++++++++++++++++++++--- wireguard/main.tf | 4 ++-- wireguard/terraform.tfvars | 3 +++ wireguard/variables.tf | 16 +++++++++++++++- 9 files changed, 54 insertions(+), 10 deletions(-) diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 7ff9869d..00ac6317 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -10,7 +10,9 @@ # K8s modes cpu_nodes_count = 1 # Number of CPU nodes cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset +cpu_nodes_platform = "cpu-e2" # The CPU node platform gpu_nodes_count = 1 # Number of GPU nodes +gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs. # Observability diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index 5392c93f..63eb2d40 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -9,9 +9,12 @@ # K8s modes cpu_nodes_count = 1 # Number of CPU nodes +cpu_nodes_platform = "cpu-e2" # The CPU node platform cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset gpu_nodes_count = 1 # Number of GPU nodes +gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection +infiniband_fabric = "fabric-3" # Infiniband fabric name. # Observability diff --git a/slurm/slurm-master.tf b/slurm/slurm-master.tf index 26f3075b..cc34894f 100644 --- a/slurm/slurm-master.tf +++ b/slurm/slurm-master.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "master" { name = "slurm-master" parent_id = var.parent_id resources = { - platform = "cpu-e2" - preset = "4vcpu-16gb" + platform = var.master_platform + preset = var.master_preset } boot_disk = { attach_mode = "READ_WRITE" diff --git a/slurm/slurm-worker.tf b/slurm/slurm-worker.tf index c1825c4f..21317552 100644 --- a/slurm/slurm-worker.tf +++ b/slurm/slurm-worker.tf @@ -29,8 +29,8 @@ resource "nebius_compute_v1_instance" "worker" { name = each.key parent_id = var.parent_id resources = { - platform = "gpu-h100-sxm" - preset = "8gpu-128vcpu-1600gb" + platform = var.worker_platform + preset = var.worker_preset } gpu_cluster = nebius_compute_v1_gpu_cluster.gpu-cluster-slurm diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars index 1858faef..5db8a9f0 100644 --- a/slurm/terraform.tfvars +++ b/slurm/terraform.tfvars @@ -7,3 +7,8 @@ shared_fs_type = "filesystem" # "nfs" or "filesystem" # key = "put your public ssh key here" # path = "put path to ssh key here" # } + +master_platform = "cpu-e2" +master_preset = "4vcpu-16gb" +worker_platform = "gpu-h100-sxm" +worker_preset = "8gpu-128vcpu-1600gb" \ No newline at end of file diff --git a/slurm/variables.tf b/slurm/variables.tf index 565a04fc..8a957eb7 100644 --- a/slurm/variables.tf +++ b/slurm/variables.tf @@ -29,12 +29,29 @@ variable "ssh_public_key" { } } -variable "platform_id" { +variable "master_platform" { + description = "Platform for Slurm Master." type = string - description = "Platform for workers: gpu-h100-b for Inspur or gpu-h100 for Gigabyte" - default = "gpu-h100-b" + default = "cpu-e2" } +variable "master_preset" { + description = "Preset for Slurm Master." + type = string + default = "4vcpu-16gb" +} + +variable "worker_platform" { + description = "Platform for Slurm Worker." + type = string + default = "gpu-h100-sxm" +} + +variable "worker_preset" { + description = "Preset for Slurm Worker." + type = string + default = "8gpu-128vcpu-1600gb" +} variable "mysql_jobs_backend" { type = bool diff --git a/wireguard/main.tf b/wireguard/main.tf index 4bc7ba4d..ebc8e482 100644 --- a/wireguard/main.tf +++ b/wireguard/main.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "wireguard_instance" { ] resources = { - platform = "cpu-e2" - preset = "4vcpu-16gb" + platform = var.platform + preset = var.preset } diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 38986d48..76deadb8 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -6,3 +6,6 @@ # path = "put path to ssh key here" # } # public_ip_allocation_id = "" + +platform = cpu-e2 +preset = "4vcpu-16gb" \ No newline at end of file diff --git a/wireguard/variables.tf b/wireguard/variables.tf index 8d14ea3b..e4d74af9 100644 --- a/wireguard/variables.tf +++ b/wireguard/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global parameters variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,20 @@ variable "subnet_id" { type = string } + +# Platform +variable "platform" { + description = "Platform for WireGuard host." + type = string + default = "cpu-e2" +} + +variable "preset" { + description = "Preset for WireGuard host." + type = string + default = "4vcpu-16gb" +} + # SSH access variable "ssh_user_name" { description = "SSH username." From 04ab264986e2c20b8dd4c6eab8679bd1bc4f58bc Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 14:31:43 +0200 Subject: [PATCH 02/22] Added "region" variable to control platform defaults (k8s-inference); --- k8s-inference/README.md | 1 + k8s-inference/helm.tf | 2 +- k8s-inference/locals.tf | 22 ++++++++++++++++++++++ k8s-inference/main.tf | 8 ++++---- k8s-inference/terraform.tfvars | 7 ++----- k8s-inference/variables.tf | 16 +++++++++++----- 6 files changed, 41 insertions(+), 15 deletions(-) diff --git a/k8s-inference/README.md b/k8s-inference/README.md index a57220b4..42bfb756 100644 --- a/k8s-inference/README.md +++ b/k8s-inference/README.md @@ -75,6 +75,7 @@ There are additional configurable variables in `variables.tf`. # Cloud environment and network parent_id = "" # The project-id in this context subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +region = "" # The project region. ssh_user_name = "" # Username you want to use to connect to the nodes ssh_public_key = { key = "put your public ssh key here" OR diff --git a/k8s-inference/helm.tf b/k8s-inference/helm.tf index 41ca13e2..1e09c23b 100644 --- a/k8s-inference/helm.tf +++ b/k8s-inference/helm.tf @@ -30,7 +30,7 @@ module "o11y" { enabled = var.enable_dcgm, node_groups = { node_group_name = { - gpus = tonumber(split("gpu-", var.gpu_nodes_preset)[0]) + gpus = tonumber(split("gpu-", local.gpu_nodes_preset)[0]) instance_group_id = nebius_mk8s_v1_node_group.gpu.id } } diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf index a4fb0a61..846e8972 100644 --- a/k8s-inference/locals.tf +++ b/k8s-inference/locals.tf @@ -2,6 +2,28 @@ locals { release-suffix = random_string.random.result ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + } + eu-north1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) + gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform) + gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset) } resource "random_string" "random" { diff --git a/k8s-inference/main.tf b/k8s-inference/main.tf index 52041518..c2595ef8 100644 --- a/k8s-inference/main.tf +++ b/k8s-inference/main.tf @@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" { } ] resources = { - platform = var.cpu_nodes_platform - preset = var.cpu_nodes_preset + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } filesystems = var.enable_filestore ? [ { @@ -73,8 +73,8 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } ] resources = { - platform = var.gpu_nodes_platform - preset = var.gpu_nodes_preset + platform = local.gpu_nodes_platform + preset = local.gpu_nodes_preset } filesystems = var.enable_filestore ? [ { diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 00ac6317..4250a903 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -1,19 +1,16 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR # path = "put path to ssh key here" # } -# K8s modes +# K8s nodes cpu_nodes_count = 1 # Number of CPU nodes -cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset -cpu_nodes_platform = "cpu-e2" # The CPU node platform gpu_nodes_count = 1 # Number of GPU nodes -gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform -gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs. # Observability enable_grafana = true # Enable or disable Grafana deployment with true or false diff --git a/k8s-inference/variables.tf b/k8s-inference/variables.tf index ec1a994d..1087a015 100644 --- a/k8s-inference/variables.tf +++ b/k8s-inference/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,12 @@ variable "subnet_id" { type = string } +variable "region" { + description = "The current region." + type = string +} + +# K8s cluster variable "k8s_version" { description = "Kubernetes version to be used in the cluster." type = string @@ -114,13 +120,13 @@ variable "cpu_nodes_count" { variable "cpu_nodes_platform" { description = "Platform for nodes in the CPU-only node group." type = string - default = "cpu-e2" + default = null } variable "cpu_nodes_preset" { description = "CPU and RAM configuration for nodes in the CPU-only node group." type = string - default = "16vcpu-64gb" + default = null } variable "cpu_disk_type" { @@ -145,13 +151,13 @@ variable "gpu_nodes_count" { variable "gpu_nodes_platform" { description = "Platform for nodes in the GPU node group." type = string - default = "gpu-h100-sxm" + default = null } variable "gpu_nodes_preset" { description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group." type = string - default = "1gpu-16vcpu-200gb" + default = null } variable "gpu_disk_type" { From 5eed389c407bc23f876fff18c530abc5e4f57d60 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:12:23 +0200 Subject: [PATCH 03/22] Added "region" variable to control platform defaults (k8s-inference (2)); --- k8s-inference/locals.tf | 6 +++--- k8s-inference/terraform.tfvars | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf index 846e8972..b1c34f2c 100644 --- a/k8s-inference/locals.tf +++ b/k8s-inference/locals.tf @@ -5,13 +5,13 @@ locals { regions_default = { eu-west1 = { - cpu_nodes_platform = "cpu-e2" + cpu_nodes_platform = "cpu-d3" cpu_nodes_preset = "16vcpu-64gb" - gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_platform = "gpu-h200-sxm" gpu_nodes_preset = "1gpu-16vcpu-200gb" } eu-north1 = { - cpu_nodes_platform = "cpu-d3" + cpu_nodes_platform = "cpu-e2" cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h100-sxm" gpu_nodes_preset = "1gpu-16vcpu-200gb" diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 4250a903..7df94830 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -11,6 +11,10 @@ # K8s nodes cpu_nodes_count = 1 # Number of CPU nodes gpu_nodes_count = 1 # Number of GPU nodes +# cpu_nodes_platform = # CPU nodes platofm +# cpu_nodes_preset = # CPU nodes preset +# gpu_nodes_platform = # GPU nodes platform +# gpu_nodes_preset = # GPU nodes preset # Observability enable_grafana = true # Enable or disable Grafana deployment with true or false From 1984342fa7346043f78fae0446555af50abc7377 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:25:03 +0200 Subject: [PATCH 04/22] Added "region" variable to control platform defaults (k8s-inference (3)); --- k8s-inference/terraform.tfvars | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 7df94830..c38651d3 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -1,8 +1,8 @@ # Cloud environment and network -# parent_id = "" # The project-id in this context -# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id # region = "" -# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR # path = "put path to ssh key here" From 3c8423a4be97b475abbed8e766cc5a8488c0dd49 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:25:16 +0200 Subject: [PATCH 05/22] Added "region" variable to control platform defaults (k8s-training); --- k8s-training/README.md | 1 + k8s-training/gpu_cluster.tf | 4 ++-- k8s-training/helm.tf | 2 +- k8s-training/locals.tf | 25 +++++++++++++++++++++++++ k8s-training/main.tf | 8 ++++---- k8s-training/terraform.tfvars | 17 +++++++++-------- k8s-training/variables.tf | 18 ++++++++++++------ 7 files changed, 54 insertions(+), 21 deletions(-) diff --git a/k8s-training/README.md b/k8s-training/README.md index 7de0fd2a..1c62e18a 100644 --- a/k8s-training/README.md +++ b/k8s-training/README.md @@ -84,6 +84,7 @@ Additional configurable variables can be found in the `variables.tf` file. # Cloud environment and network parent_id = "" # The project-id in this context subnet_id = "" # Run the `nebius vpc v1alpha1 network list` command to see the subnet id +region = "" # The project region ssh_user_name = "" # Username you want to use to connect to the nodes ssh_public_key = { key = "Enter your public SSH key here" OR diff --git a/k8s-training/gpu_cluster.tf b/k8s-training/gpu_cluster.tf index 89cce8c5..472d950f 100644 --- a/k8s-training/gpu_cluster.tf +++ b/k8s-training/gpu_cluster.tf @@ -1,5 +1,5 @@ resource "nebius_compute_v1_gpu_cluster" "fabric_2" { - infiniband_fabric = var.infiniband_fabric + infiniband_fabric = local.infiniband_fabric parent_id = var.parent_id - name = join("-", [var.infiniband_fabric, local.release-suffix]) + name = join("-", [local.infiniband_fabric, local.release-suffix]) } diff --git a/k8s-training/helm.tf b/k8s-training/helm.tf index 1bf3755f..6bceeef5 100644 --- a/k8s-training/helm.tf +++ b/k8s-training/helm.tf @@ -39,7 +39,7 @@ module "o11y" { enabled = var.enable_dcgm, node_groups = { node_group_name = { - gpus = tonumber(split("gpu-", var.gpu_nodes_preset)[0]) + gpus = tonumber(split("gpu-", local.gpu_nodes_preset)[0]) instance_group_id = nebius_mk8s_v1_node_group.gpu.id } } diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf index a4fb0a61..32dd99c5 100644 --- a/k8s-training/locals.tf +++ b/k8s-training/locals.tf @@ -2,6 +2,31 @@ locals { release-suffix = random_string.random.result ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + infiniband_fabric = "fabric-5" + } + eu-north1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + infiniband_fabric = "fabric-3" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) + gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform) + gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset) + infiniband_fabric = coalesce(var.infiniband_fabric, local.current_region_defaults.infiniband_fabric) } resource "random_string" "random" { diff --git a/k8s-training/main.tf b/k8s-training/main.tf index e877161f..a5b27e05 100644 --- a/k8s-training/main.tf +++ b/k8s-training/main.tf @@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" { } ] resources = { - platform = var.cpu_nodes_platform - preset = var.cpu_nodes_preset + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } filesystems = var.enable_filestore ? [ { @@ -73,8 +73,8 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } ] resources = { - platform = var.gpu_nodes_platform - preset = var.gpu_nodes_preset + platform = local.gpu_nodes_platform + preset = local.gpu_nodes_preset } filesystems = var.enable_filestore ? [ { diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index 63eb2d40..8d650ae6 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -1,20 +1,21 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR # path = "put path to ssh key here" # } -# K8s modes -cpu_nodes_count = 1 # Number of CPU nodes -cpu_nodes_platform = "cpu-e2" # The CPU node platform -cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset -gpu_nodes_count = 1 # Number of GPU nodes -gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform -gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection -infiniband_fabric = "fabric-3" # Infiniband fabric name. +# K8s nodes +cpu_nodes_count = 1 # Number of CPU nodes +gpu_nodes_count = 1 # Number of GPU nodes +# cpu_nodes_platform = # CPU nodes platofm +# cpu_nodes_preset = # CPU nodes preset +# gpu_nodes_platform = # GPU nodes platform +# gpu_nodes_preset = # GPU nodes preset +# infiniband_fabric = # Infiniband fabric name. # Observability diff --git a/k8s-training/variables.tf b/k8s-training/variables.tf index b17ba14b..65f6b71c 100644 --- a/k8s-training/variables.tf +++ b/k8s-training/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,12 @@ variable "subnet_id" { type = string } +variable "region" { + description = "The current region." + type = string +} + +# K8s cluster variable "k8s_version" { description = "Kubernetes version to be used in the cluster." type = string @@ -114,13 +120,13 @@ variable "cpu_nodes_count" { variable "cpu_nodes_platform" { description = "Platform for nodes in the CPU-only node group." type = string - default = "cpu-e2" + default = null } variable "cpu_nodes_preset" { description = "CPU and RAM configuration for nodes in the CPU-only node group." type = string - default = "16vcpu-64gb" + default = null } variable "cpu_disk_type" { @@ -145,13 +151,13 @@ variable "gpu_nodes_count" { variable "gpu_nodes_platform" { description = "Platform for nodes in the GPU node group." type = string - default = "gpu-h100-sxm" + default = null } variable "gpu_nodes_preset" { description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group." type = string - default = "8gpu-128vcpu-1600gb" + default = null } variable "gpu_disk_type" { @@ -169,7 +175,7 @@ variable "gpu_disk_size" { variable "infiniband_fabric" { description = "Infiniband's fabric name." type = string - default = "fabric-3" + default = null } variable "gpu_nodes_assign_public_ip" { From ab7b2deef6784bc801f1210d2e3b9fc6f396316d Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:32:49 +0200 Subject: [PATCH 06/22] Added "region" variable to control platform defaults (GlusterFS module); --- k8s-inference/gluster-fs.tf | 2 ++ k8s-training/gluster-fs.tf | 2 ++ 2 files changed, 4 insertions(+) diff --git a/k8s-inference/gluster-fs.tf b/k8s-inference/gluster-fs.tf index 6a87b0e8..8d800f53 100644 --- a/k8s-inference/gluster-fs.tf +++ b/k8s-inference/gluster-fs.tf @@ -7,4 +7,6 @@ module "glusterfs" { disk_count_per_vm = var.glusterfs_disk_count_per_vm disk_size = var.glusterfs_disk_size ssh_public_key = local.ssh_public_key + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } diff --git a/k8s-training/gluster-fs.tf b/k8s-training/gluster-fs.tf index 6a87b0e8..8d800f53 100644 --- a/k8s-training/gluster-fs.tf +++ b/k8s-training/gluster-fs.tf @@ -7,4 +7,6 @@ module "glusterfs" { disk_count_per_vm = var.glusterfs_disk_count_per_vm disk_size = var.glusterfs_disk_size ssh_public_key = local.ssh_public_key + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } From 5216c7a2ab4ac5356095e0f35c0b8ef8177e9f6f Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:38:19 +0200 Subject: [PATCH 07/22] Added "region" variable to control platform defaults (NFS Server); --- nfs-server/locals.tf | 16 ++++++++++++++++ nfs-server/main.tf | 2 ++ nfs-server/{nfs.tfvars => terraform.tfvars} | 1 + nfs-server/variables.tf | 17 +++++++++++++++++ 4 files changed, 36 insertions(+) rename nfs-server/{nfs.tfvars => terraform.tfvars} (88%) diff --git a/nfs-server/locals.tf b/nfs-server/locals.tf index ed79b470..2036a642 100644 --- a/nfs-server/locals.tf +++ b/nfs-server/locals.tf @@ -1,4 +1,20 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + } + eu-north1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) } diff --git a/nfs-server/main.tf b/nfs-server/main.tf index 10d495ac..39085d14 100644 --- a/nfs-server/main.tf +++ b/nfs-server/main.tf @@ -9,4 +9,6 @@ module "nfs-module" { ssh_public_key = var.ssh_public_key.key nfs_ip_range = var.nfs_ip_range nfs_size = var.nfs_size + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } diff --git a/nfs-server/nfs.tfvars b/nfs-server/terraform.tfvars similarity index 88% rename from nfs-server/nfs.tfvars rename to nfs-server/terraform.tfvars index 275cf081..d1f874e1 100644 --- a/nfs-server/nfs.tfvars +++ b/nfs-server/terraform.tfvars @@ -1,5 +1,6 @@ parent_id = "project-..." subnet_id = "vpcsubnet-..." +region = "eu-north1" ssh_user_name = "nfs" ssh_public_key = { key = "put your ssh key here" diff --git a/nfs-server/variables.tf b/nfs-server/variables.tf index 369cefba..d04c7ed3 100644 --- a/nfs-server/variables.tf +++ b/nfs-server/variables.tf @@ -8,6 +8,23 @@ variable "subnet_id" { description = "ID of the subnet." } +variable "region" { + type = string + description = "Project region." +} + +variable "cpu_nodes_platform" { + description = "Platform for instances." + type = string + default = null +} + +variable "cpu_nodes_preset" { + description = "CPU and RAM configuration for instances." + type = string + default = null +} + variable "nfs_size" { type = number default = 93 * 1024 * 1024 * 1024 # size should be a multiple of 99857989632 From e704aad4e9fb946b4e02d38c6163dbbe10a23705 Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 14:44:31 +0100 Subject: [PATCH 08/22] Tf fmt --- k8s-inference/locals.tf | 2 +- k8s-inference/main.tf | 2 +- k8s-inference/terraform.tfvars | 4 ++-- k8s-training/locals.tf | 4 ++-- k8s-training/main.tf | 2 +- k8s-training/terraform.tfvars | 10 +++++----- wireguard/terraform.tfvars | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf index b1c34f2c..4edf97ef 100644 --- a/k8s-inference/locals.tf +++ b/k8s-inference/locals.tf @@ -2,7 +2,7 @@ locals { release-suffix = random_string.random.result ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) - + regions_default = { eu-west1 = { cpu_nodes_platform = "cpu-d3" diff --git a/k8s-inference/main.tf b/k8s-inference/main.tf index c2595ef8..76f605c8 100644 --- a/k8s-inference/main.tf +++ b/k8s-inference/main.tf @@ -68,7 +68,7 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } network_interfaces = [ { - subnet_id = var.subnet_id + subnet_id = var.subnet_id public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null } ] diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index c38651d3..b9509541 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -9,8 +9,8 @@ # } # K8s nodes -cpu_nodes_count = 1 # Number of CPU nodes -gpu_nodes_count = 1 # Number of GPU nodes +cpu_nodes_count = 1 # Number of CPU nodes +gpu_nodes_count = 1 # Number of GPU nodes # cpu_nodes_platform = # CPU nodes platofm # cpu_nodes_preset = # CPU nodes preset # gpu_nodes_platform = # GPU nodes platform diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf index 32dd99c5..940172f7 100644 --- a/k8s-training/locals.tf +++ b/k8s-training/locals.tf @@ -9,14 +9,14 @@ locals { cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h200-sxm" gpu_nodes_preset = "1gpu-16vcpu-200gb" - infiniband_fabric = "fabric-5" + infiniband_fabric = "fabric-5" } eu-north1 = { cpu_nodes_platform = "cpu-e2" cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h100-sxm" gpu_nodes_preset = "1gpu-16vcpu-200gb" - infiniband_fabric = "fabric-3" + infiniband_fabric = "fabric-3" } } diff --git a/k8s-training/main.tf b/k8s-training/main.tf index a5b27e05..869a1b72 100644 --- a/k8s-training/main.tf +++ b/k8s-training/main.tf @@ -68,7 +68,7 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } network_interfaces = [ { - subnet_id = var.subnet_id + subnet_id = var.subnet_id public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null } ] diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index 8d650ae6..bf2a7f7f 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -9,8 +9,8 @@ # } # K8s nodes -cpu_nodes_count = 1 # Number of CPU nodes -gpu_nodes_count = 1 # Number of GPU nodes +cpu_nodes_count = 1 # Number of CPU nodes +gpu_nodes_count = 1 # Number of GPU nodes # cpu_nodes_platform = # CPU nodes platofm # cpu_nodes_preset = # CPU nodes preset # gpu_nodes_platform = # GPU nodes platform @@ -19,10 +19,10 @@ gpu_nodes_count = 1 # Number of GPU nodes # Observability -enable_grafana = true # Enable or disable Grafana deployment with true or false -enable_prometheus = true # Enable or disable Prometheus deployment with true or false +enable_grafana = true # Enable or disable Grafana deployment with true or false +enable_prometheus = true # Enable or disable Prometheus deployment with true or false enable_loki = false # Enable or disable Loki deployment with true or false -enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false +enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false ## Loki # loki_access_key_id = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki. diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 76deadb8..24c95c32 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -8,4 +8,4 @@ # public_ip_allocation_id = "" platform = cpu-e2 -preset = "4vcpu-16gb" \ No newline at end of file +preset = "4vcpu-16gb" \ No newline at end of file From 3f3ce80bfe85c327e80f6de9389e3535eb472ea7 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:50:58 +0200 Subject: [PATCH 09/22] Tests fixed; --- k8s-inference/tests/main.tftest.hcl | 3 +++ k8s-training/tests/k8s-training-kuberay.tftest.hcl | 3 +++ k8s-training/tests/main.tftest.hcl | 2 ++ 3 files changed, 8 insertions(+) diff --git a/k8s-inference/tests/main.tftest.hcl b/k8s-inference/tests/main.tftest.hcl index 040a2316..b9af681f 100644 --- a/k8s-inference/tests/main.tftest.hcl +++ b/k8s-inference/tests/main.tftest.hcl @@ -6,6 +6,7 @@ run "k8s_inference_apply" { ] } variables { + region = "eu-north1" etcd_cluster_size = 1 } } @@ -26,6 +27,7 @@ run "k8s_node_groups_inference_apply" { run "full_inference_apply" { command = apply variables { + region = "eu-north1" etcd_cluster_size = 1 } } @@ -34,6 +36,7 @@ run "test_mode_k8s_inference_apply" { command = apply variables { + region = "eu-north1" etcd_cluster_size = 1 test_mode = true } diff --git a/k8s-training/tests/k8s-training-kuberay.tftest.hcl b/k8s-training/tests/k8s-training-kuberay.tftest.hcl index af0d21d1..288be471 100644 --- a/k8s-training/tests/k8s-training-kuberay.tftest.hcl +++ b/k8s-training/tests/k8s-training-kuberay.tftest.hcl @@ -6,6 +6,7 @@ run "k8s_training_kuberay_apply" { ] } variables { + region = "eu-north1" etcd_cluster_size = 1 } } @@ -19,6 +20,7 @@ run "k8s_node_groups_training_kuberay_apply" { ] } variables { + region = "eu-north1" etcd_cluster_size = 1 } } @@ -27,6 +29,7 @@ run "full_training_kuberay_apply" { command = apply variables { + region = "eu-north1" etcd_cluster_size = 1 enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket enable_kuberay = true diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index 1f204bd3..c77d108c 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -21,6 +21,7 @@ run "full_training_apply" { command = apply variables { + region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket } } @@ -29,6 +30,7 @@ run "test_mode_k8s_training_apply" { command = apply variables { + region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket test_mode = true } From d5512e22c2c0aa434d935e484509d83890a4cbc8 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:53:12 +0200 Subject: [PATCH 10/22] Tf fmt; --- nfs-server/locals.tf | 2 +- nfs-server/variables.tf | 2 +- slurm/tests/main.tftest.hcl | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nfs-server/locals.tf b/nfs-server/locals.tf index 2036a642..cba34bbe 100644 --- a/nfs-server/locals.tf +++ b/nfs-server/locals.tf @@ -2,7 +2,7 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) - regions_default = { + regions_default = { eu-west1 = { cpu_nodes_platform = "cpu-d3" cpu_nodes_preset = "16vcpu-64gb" diff --git a/nfs-server/variables.tf b/nfs-server/variables.tf index d04c7ed3..a83f100f 100644 --- a/nfs-server/variables.tf +++ b/nfs-server/variables.tf @@ -9,7 +9,7 @@ variable "subnet_id" { } variable "region" { - type = string + type = string description = "Project region." } diff --git a/slurm/tests/main.tftest.hcl b/slurm/tests/main.tftest.hcl index 6847e79c..fac982af 100644 --- a/slurm/tests/main.tftest.hcl +++ b/slurm/tests/main.tftest.hcl @@ -2,6 +2,7 @@ run "slurm_master_apply" { command = apply variables { + region = "eu-north1" cluster_workers_count = 2 } @@ -16,6 +17,7 @@ run "slurm_full_apply" { command = apply variables { + region = "eu-north1" cluster_workers_count = 2 } } @@ -24,6 +26,7 @@ run "test_mode_slurm_apply" { command = apply variables { + region = "eu-north1" cluster_workers_count = 2 test_mode = true } From 2991b9ec950c4d41c0f5210a0d7977418690e87e Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:08:30 +0200 Subject: [PATCH 11/22] Tests fixed (2); --- k8s-inference/tests/main.tftest.hcl | 1 + 1 file changed, 1 insertion(+) diff --git a/k8s-inference/tests/main.tftest.hcl b/k8s-inference/tests/main.tftest.hcl index b9af681f..df217ad4 100644 --- a/k8s-inference/tests/main.tftest.hcl +++ b/k8s-inference/tests/main.tftest.hcl @@ -20,6 +20,7 @@ run "k8s_node_groups_inference_apply" { ] } variables { + region = "eu-north1" etcd_cluster_size = 1 } } From 1948cca5343c6d01cc0c70837df4c7cf5f7c9370 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:16:37 +0200 Subject: [PATCH 12/22] Added "region" variable to control platform defaults (WireGuard); --- wireguard/locals.tf | 17 +++++++++++++++++ wireguard/terraform.tfvars | 12 +++++------- wireguard/tests/main.tftest.hcl | 1 + wireguard/variables.tf | 5 +++++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/wireguard/locals.tf b/wireguard/locals.tf index ed79b470..8c7a63f2 100644 --- a/wireguard/locals.tf +++ b/wireguard/locals.tf @@ -1,4 +1,21 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + platform = "cpu-d3" + preset = "16vcpu-64gb" + } + eu-north1 = { + platform = "cpu-e2" + preset = "16vcpu-64gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + platform = coalesce(var.platform, local.current_region_defaults.platform) + preset = coalesce(var.preset, local.current_region_defaults.preset) + } diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 24c95c32..2ca36082 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -1,11 +1,9 @@ -# parent_id = "" -# subnet_id = "" -# ssh_user_name = "ubuntu" +# parent_id = "" +# subnet_id = "" +# region = "eu-west1" +ssh_user_name = "ubuntu" # ssh_public_key = { # key = "put your public ssh key here" # path = "put path to ssh key here" # } -# public_ip_allocation_id = "" - -platform = cpu-e2 -preset = "4vcpu-16gb" \ No newline at end of file +# public_ip_allocation_id = "" \ No newline at end of file diff --git a/wireguard/tests/main.tftest.hcl b/wireguard/tests/main.tftest.hcl index f8ebc7af..f9d99353 100644 --- a/wireguard/tests/main.tftest.hcl +++ b/wireguard/tests/main.tftest.hcl @@ -6,6 +6,7 @@ run "test_mode_wireguard_apply" { command = apply variables { + region = "eu-north1" test_mode = true } } diff --git a/wireguard/variables.tf b/wireguard/variables.tf index e4d74af9..f05a5a33 100644 --- a/wireguard/variables.tf +++ b/wireguard/variables.tf @@ -9,6 +9,11 @@ variable "subnet_id" { type = string } +variable "region" { + description = "Project region." + type = string +} + # Platform variable "platform" { From 157adb92649c76f57ce7b0566669335a329f2747 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:16:42 +0200 Subject: [PATCH 13/22] Added "region" variable to control platform defaults (Slurm); --- slurm/locals.tf | 23 +++++++++++++++++++++++ slurm/nfs.tf | 2 ++ slurm/slurm-master.tf | 4 ++-- slurm/slurm-worker.tf | 4 ++-- slurm/terraform.tfvars | 8 ++------ slurm/variables.tf | 13 +++++++++---- 6 files changed, 40 insertions(+), 14 deletions(-) diff --git a/slurm/locals.tf b/slurm/locals.tf index ed79b470..1981d5b5 100644 --- a/slurm/locals.tf +++ b/slurm/locals.tf @@ -1,4 +1,27 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + + regions_default = { + eu-west1 = { + master_platform = "cpu-d3" + master_preset = "16vcpu-64gb" + worker_platform = "gpu-h200-sxm" + worker_preset = "1gpu-16vcpu-200gb" + } + eu-north1 = { + master_platform = "cpu-e2" + master_preset = "16vcpu-64gb" + worker_platform = "gpu-h100-sxm" + worker_preset = "1gpu-16vcpu-200gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + master_platform = coalesce(var.master_platform, local.current_region_defaults.master_platform) + master_preset = coalesce(var.master_preset, local.current_region_defaults.master_preset) + worker_platform = coalesce(var.worker_platform, local.current_region_defaults.worker_platform) + worker_preset = coalesce(var.worker_preset, local.current_region_defaults.worker_preset) } diff --git a/slurm/nfs.tf b/slurm/nfs.tf index 552ab197..d0fe16c8 100644 --- a/slurm/nfs.tf +++ b/slurm/nfs.tf @@ -10,4 +10,6 @@ module "nfs-module" { ssh_public_key = local.ssh_public_key nfs_ip_range = "192.168.0.0/16" nfs_size = var.fs_size + platform = local.master_platform + preset = local.master_preset } diff --git a/slurm/slurm-master.tf b/slurm/slurm-master.tf index cc34894f..dda58492 100644 --- a/slurm/slurm-master.tf +++ b/slurm/slurm-master.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "master" { name = "slurm-master" parent_id = var.parent_id resources = { - platform = var.master_platform - preset = var.master_preset + platform = local.master_platform + preset = local.master_preset } boot_disk = { attach_mode = "READ_WRITE" diff --git a/slurm/slurm-worker.tf b/slurm/slurm-worker.tf index 21317552..2bbfc0a6 100644 --- a/slurm/slurm-worker.tf +++ b/slurm/slurm-worker.tf @@ -29,8 +29,8 @@ resource "nebius_compute_v1_instance" "worker" { name = each.key parent_id = var.parent_id resources = { - platform = var.worker_platform - preset = var.worker_preset + platform = local.worker_platform + preset = local.worker_preset } gpu_cluster = nebius_compute_v1_gpu_cluster.gpu-cluster-slurm diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars index 5db8a9f0..3d54369b 100644 --- a/slurm/terraform.tfvars +++ b/slurm/terraform.tfvars @@ -1,14 +1,10 @@ parent_id = "project-e00..." subnet_id = "vpcsubnet-e00..." +region = "" cluster_workers_count = 2 # amount of workers mysql_jobs_backend = false # Do you want to use mysql shared_fs_type = "filesystem" # "nfs" or "filesystem" # ssh_public_key = { # key = "put your public ssh key here" # path = "put path to ssh key here" -# } - -master_platform = "cpu-e2" -master_preset = "4vcpu-16gb" -worker_platform = "gpu-h100-sxm" -worker_preset = "8gpu-128vcpu-1600gb" \ No newline at end of file +# } \ No newline at end of file diff --git a/slurm/variables.tf b/slurm/variables.tf index 8a957eb7..f128c0d8 100644 --- a/slurm/variables.tf +++ b/slurm/variables.tf @@ -5,6 +5,11 @@ variable "subnet_id" { type = string } +variable "region" { + description = "Project region." + type = string +} + variable "ib_image_id" { type = string description = "ID of Infiniband image" @@ -32,25 +37,25 @@ variable "ssh_public_key" { variable "master_platform" { description = "Platform for Slurm Master." type = string - default = "cpu-e2" + default = null } variable "master_preset" { description = "Preset for Slurm Master." type = string - default = "4vcpu-16gb" + default = null } variable "worker_platform" { description = "Platform for Slurm Worker." type = string - default = "gpu-h100-sxm" + default = null } variable "worker_preset" { description = "Preset for Slurm Worker." type = string - default = "8gpu-128vcpu-1600gb" + default = null } variable "mysql_jobs_backend" { From 4a55aafd0916f2d95f2006433fdf7e70fbd85dd1 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:19:42 +0200 Subject: [PATCH 14/22] terraform.tfvars files refactored; --- k8s-inference/terraform.tfvars | 2 +- k8s-training/terraform.tfvars | 2 +- nfs-server/terraform.tfvars | 16 ++++++++-------- slurm/terraform.tfvars | 20 +++++++++++++------- wireguard/terraform.tfvars | 12 ++++++------ 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index b9509541..35b76296 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -1,7 +1,7 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id -# region = "" +# region = "" # Project region # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index bf2a7f7f..f62dfe05 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -1,7 +1,7 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id -# region = "" +# region = "" # Project region # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR diff --git a/nfs-server/terraform.tfvars b/nfs-server/terraform.tfvars index d1f874e1..b9b7adf1 100644 --- a/nfs-server/terraform.tfvars +++ b/nfs-server/terraform.tfvars @@ -1,9 +1,9 @@ -parent_id = "project-..." -subnet_id = "vpcsubnet-..." -region = "eu-north1" -ssh_user_name = "nfs" -ssh_public_key = { - key = "put your ssh key here" - # path = "or put path to ssh key here" -} +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" +# } nfs_ip_range = "192.168.0.0/16" diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars index 3d54369b..3c1ff8a4 100644 --- a/slurm/terraform.tfvars +++ b/slurm/terraform.tfvars @@ -1,10 +1,16 @@ -parent_id = "project-e00..." -subnet_id = "vpcsubnet-e00..." -region = "" +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" +# } cluster_workers_count = 2 # amount of workers mysql_jobs_backend = false # Do you want to use mysql shared_fs_type = "filesystem" # "nfs" or "filesystem" -# ssh_public_key = { -# key = "put your public ssh key here" -# path = "put path to ssh key here" -# } \ No newline at end of file + +# master_platform = +# master_preset = +# worker_platform = +# worker_preset = \ No newline at end of file diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 2ca36082..79c25e96 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -1,9 +1,9 @@ -# parent_id = "" -# subnet_id = "" -# region = "eu-west1" -ssh_user_name = "ubuntu" +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { -# key = "put your public ssh key here" -# path = "put path to ssh key here" +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" # } # public_ip_allocation_id = "" \ No newline at end of file From f4e08df7943912d3b9e19812e73ea252a5e0d5c1 Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 15:30:54 +0100 Subject: [PATCH 15/22] TF fmt --- k8s-training/tests/main.tftest.hcl | 4 ++-- slurm/tests/main.tftest.hcl | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index c77d108c..86026283 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -21,7 +21,7 @@ run "full_training_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket } } @@ -30,7 +30,7 @@ run "test_mode_k8s_training_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket test_mode = true } diff --git a/slurm/tests/main.tftest.hcl b/slurm/tests/main.tftest.hcl index fac982af..1f9367c9 100644 --- a/slurm/tests/main.tftest.hcl +++ b/slurm/tests/main.tftest.hcl @@ -2,7 +2,7 @@ run "slurm_master_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" cluster_workers_count = 2 } @@ -17,7 +17,7 @@ run "slurm_full_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" cluster_workers_count = 2 } } @@ -26,7 +26,7 @@ run "test_mode_slurm_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" cluster_workers_count = 2 test_mode = true } From 5a18cda3f7842a0c4528a2ba4435a417c1bc14cd Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 15:35:57 +0100 Subject: [PATCH 16/22] Added region variables for tests --- k8s-training/tests/main.tftest.hcl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index 86026283..fb00360b 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -1,5 +1,11 @@ run "k8s_training_apply" { command = apply + + variables { + region = "eu-north1" + enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket + } + plan_options { target = [ nebius_mk8s_v1_cluster.k8s-cluster @@ -9,6 +15,12 @@ run "k8s_training_apply" { run "k8s_node_groups_training_apply" { command = apply + + variables { + region = "eu-north1" + enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket + } + plan_options { target = [ nebius_mk8s_v1_node_group.cpu-only, From 26de35561e4a770d1b40cf5ac20831d66326d72a Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 15:46:13 +0100 Subject: [PATCH 17/22] Clean region variable for tests --- .github/workflows/terraform.yml | 1 + k8s-inference/tests/main.tftest.hcl | 4 ---- k8s-training/tests/k8s-training-kuberay.tftest.hcl | 3 --- k8s-training/tests/main.tftest.hcl | 12 ------------ slurm/tests/main.tftest.hcl | 3 --- 5 files changed, 1 insertion(+), 22 deletions(-) diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index 25cf059d..24e8a75a 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -39,6 +39,7 @@ jobs: env: TF_VAR_subnet_id: vpcsubnet-e00dgdntmhgkeej1z3 + TF_VAR_region: eu-north1 TF_VAR_loki_access_key_id: ${{ secrets.SA_ACCESS_KEY_ID }} TF_VAR_loki_secret_key: ${{ secrets.SA_SECRET_KEY }} diff --git a/k8s-inference/tests/main.tftest.hcl b/k8s-inference/tests/main.tftest.hcl index df217ad4..040a2316 100644 --- a/k8s-inference/tests/main.tftest.hcl +++ b/k8s-inference/tests/main.tftest.hcl @@ -6,7 +6,6 @@ run "k8s_inference_apply" { ] } variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -20,7 +19,6 @@ run "k8s_node_groups_inference_apply" { ] } variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -28,7 +26,6 @@ run "k8s_node_groups_inference_apply" { run "full_inference_apply" { command = apply variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -37,7 +34,6 @@ run "test_mode_k8s_inference_apply" { command = apply variables { - region = "eu-north1" etcd_cluster_size = 1 test_mode = true } diff --git a/k8s-training/tests/k8s-training-kuberay.tftest.hcl b/k8s-training/tests/k8s-training-kuberay.tftest.hcl index 288be471..af0d21d1 100644 --- a/k8s-training/tests/k8s-training-kuberay.tftest.hcl +++ b/k8s-training/tests/k8s-training-kuberay.tftest.hcl @@ -6,7 +6,6 @@ run "k8s_training_kuberay_apply" { ] } variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -20,7 +19,6 @@ run "k8s_node_groups_training_kuberay_apply" { ] } variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -29,7 +27,6 @@ run "full_training_kuberay_apply" { command = apply variables { - region = "eu-north1" etcd_cluster_size = 1 enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket enable_kuberay = true diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index fb00360b..14f99ed0 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -1,11 +1,6 @@ run "k8s_training_apply" { command = apply - variables { - region = "eu-north1" - enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket - } - plan_options { target = [ nebius_mk8s_v1_cluster.k8s-cluster @@ -16,11 +11,6 @@ run "k8s_training_apply" { run "k8s_node_groups_training_apply" { command = apply - variables { - region = "eu-north1" - enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket - } - plan_options { target = [ nebius_mk8s_v1_node_group.cpu-only, @@ -33,7 +23,6 @@ run "full_training_apply" { command = apply variables { - region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket } } @@ -42,7 +31,6 @@ run "test_mode_k8s_training_apply" { command = apply variables { - region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket test_mode = true } diff --git a/slurm/tests/main.tftest.hcl b/slurm/tests/main.tftest.hcl index 1f9367c9..6847e79c 100644 --- a/slurm/tests/main.tftest.hcl +++ b/slurm/tests/main.tftest.hcl @@ -2,7 +2,6 @@ run "slurm_master_apply" { command = apply variables { - region = "eu-north1" cluster_workers_count = 2 } @@ -17,7 +16,6 @@ run "slurm_full_apply" { command = apply variables { - region = "eu-north1" cluster_workers_count = 2 } } @@ -26,7 +24,6 @@ run "test_mode_slurm_apply" { command = apply variables { - region = "eu-north1" cluster_workers_count = 2 test_mode = true } From 1f266852fc4e1f3dfedbdf237d70f6d7e5702bf4 Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 15:48:05 +0100 Subject: [PATCH 18/22] Clean region variable for tests --- k8s-training/tests/main.tftest.hcl | 2 -- wireguard/tests/main.tftest.hcl | 1 - 2 files changed, 3 deletions(-) diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index 14f99ed0..1f204bd3 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -1,6 +1,5 @@ run "k8s_training_apply" { command = apply - plan_options { target = [ nebius_mk8s_v1_cluster.k8s-cluster @@ -10,7 +9,6 @@ run "k8s_training_apply" { run "k8s_node_groups_training_apply" { command = apply - plan_options { target = [ nebius_mk8s_v1_node_group.cpu-only, diff --git a/wireguard/tests/main.tftest.hcl b/wireguard/tests/main.tftest.hcl index f9d99353..f8ebc7af 100644 --- a/wireguard/tests/main.tftest.hcl +++ b/wireguard/tests/main.tftest.hcl @@ -6,7 +6,6 @@ run "test_mode_wireguard_apply" { command = apply variables { - region = "eu-north1" test_mode = true } } From 8d281b855ecba0c15d4fad5bdcc4e7985b690743 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:57:09 +0200 Subject: [PATCH 19/22] Added "region" variable to control platform defaults (GlusterFS (2)); --- modules/gluster-module/instances.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/gluster-module/instances.tf b/modules/gluster-module/instances.tf index 3c3450ea..ea057e77 100644 --- a/modules/gluster-module/instances.tf +++ b/modules/gluster-module/instances.tf @@ -14,8 +14,8 @@ resource "nebius_compute_v1_instance" "gluster-fs-instance" { } ] resources = { - platform = "cpu-e2" - preset = "16vcpu-64gb" + platform = var.platform + preset = var.preset } boot_disk = { From b7f9b4117bd8e383b2f732dbefe0440b9c986077 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 17:21:37 +0200 Subject: [PATCH 20/22] Presets fixed; --- k8s-training/locals.tf | 4 ++-- slurm/locals.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf index 940172f7..165efd27 100644 --- a/k8s-training/locals.tf +++ b/k8s-training/locals.tf @@ -8,14 +8,14 @@ locals { cpu_nodes_platform = "cpu-d3" cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h200-sxm" - gpu_nodes_preset = "1gpu-16vcpu-200gb" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" infiniband_fabric = "fabric-5" } eu-north1 = { cpu_nodes_platform = "cpu-e2" cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h100-sxm" - gpu_nodes_preset = "1gpu-16vcpu-200gb" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" infiniband_fabric = "fabric-3" } } diff --git a/slurm/locals.tf b/slurm/locals.tf index 1981d5b5..90fe2b56 100644 --- a/slurm/locals.tf +++ b/slurm/locals.tf @@ -8,13 +8,13 @@ locals { master_platform = "cpu-d3" master_preset = "16vcpu-64gb" worker_platform = "gpu-h200-sxm" - worker_preset = "1gpu-16vcpu-200gb" + worker_preset = "8gpu-128vcpu-1600gb" } eu-north1 = { master_platform = "cpu-e2" master_preset = "16vcpu-64gb" worker_platform = "gpu-h100-sxm" - worker_preset = "1gpu-16vcpu-200gb" + worker_preset = "8gpu-128vcpu-1600gb" } } From 398051a88cb78ce5314eb5e8e876701f6bac5960 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 18:25:27 +0200 Subject: [PATCH 21/22] Added "region" variable to control platform defaults (WireGuard (2)); --- wireguard/main.tf | 4 ++-- wireguard/variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/wireguard/main.tf b/wireguard/main.tf index ebc8e482..9b3abc99 100644 --- a/wireguard/main.tf +++ b/wireguard/main.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "wireguard_instance" { ] resources = { - platform = var.platform - preset = var.preset + platform = local.platform + preset = local.preset } diff --git a/wireguard/variables.tf b/wireguard/variables.tf index f05a5a33..1b0d7c9b 100644 --- a/wireguard/variables.tf +++ b/wireguard/variables.tf @@ -19,13 +19,13 @@ variable "region" { variable "platform" { description = "Platform for WireGuard host." type = string - default = "cpu-e2" + default = null } variable "preset" { description = "Preset for WireGuard host." type = string - default = "4vcpu-16gb" + default = null } # SSH access From 3a0ab6b18b924f0e204b033aa78b3a89f50eebd1 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 18:27:26 +0200 Subject: [PATCH 22/22] Tests fixed (3); --- k8s-training/applications.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s-training/applications.tf b/k8s-training/applications.tf index 3e84067a..d48326c5 100644 --- a/k8s-training/applications.tf +++ b/k8s-training/applications.tf @@ -12,8 +12,8 @@ module "kuberay" { parent_id = var.parent_id cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id - gpu_platform = var.gpu_nodes_platform - cpu_platform = var.cpu_nodes_platform + gpu_platform = local.gpu_nodes_platform + cpu_platform = local.cpu_nodes_platform min_gpu_replicas = var.kuberay_min_gpu_replicas max_gpu_replicas = var.kuberay_max_gpu_replicas }