Merge pull request #94 from nebius/feature/platform-and-preset-moved-to-parameters

Platform and preset moved to variables across the library;
nebius · Nov 25, 2024 · 88aab79 · 88aab79
2 parents 8584e20 + 3a0ab6b
commit 88aab79
Show file tree

Hide file tree

Showing 33 changed files with 275 additions and 75 deletions.
diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml
@@ -39,6 +39,7 @@ jobs:
 
     env:
       TF_VAR_subnet_id: vpcsubnet-e00dgdntmhgkeej1z3
+      TF_VAR_region: eu-north1
       TF_VAR_loki_access_key_id: ${{ secrets.SA_ACCESS_KEY_ID }}
       TF_VAR_loki_secret_key: ${{ secrets.SA_SECRET_KEY }}
 

diff --git a/k8s-inference/README.md b/k8s-inference/README.md
@@ -75,6 +75,7 @@ There are additional configurable variables in `variables.tf`.
 # Cloud environment and network
 parent_id      = "" # The project-id in this context
 subnet_id      = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+region         = "" # The project region: "eu-north1" or "eu-west1" (selects the default platforms and presets)
 ssh_user_name  = "" # Username you want to use to connect to the nodes
 ssh_public_key = {
   key  = "put your public ssh key here" OR

diff --git a/k8s-inference/gluster-fs.tf b/k8s-inference/gluster-fs.tf
@@ -7,4 +7,6 @@ module "glusterfs" {
   disk_count_per_vm = var.glusterfs_disk_count_per_vm
   disk_size         = var.glusterfs_disk_size
   ssh_public_key    = local.ssh_public_key
+  platform          = local.cpu_nodes_platform
+  preset            = local.cpu_nodes_preset
 }
diff --git a/k8s-inference/helm.tf b/k8s-inference/helm.tf
@@ -30,7 +30,7 @@ module "o11y" {
       enabled = var.enable_dcgm,
       node_groups = {
         node_group_name = {
-          gpus              = tonumber(split("gpu-", var.gpu_nodes_preset)[0])
+          gpus              = tonumber(split("gpu-", local.gpu_nodes_preset)[0])
           instance_group_id = nebius_mk8s_v1_node_group.gpu.id
         }
       }

diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf
@@ -2,6 +2,28 @@ locals {
   release-suffix = random_string.random.result
   ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : (
   fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null)
+
+  regions_default = {
+    eu-west1 = {
+      cpu_nodes_platform = "cpu-d3"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h200-sxm"
+      gpu_nodes_preset   = "1gpu-16vcpu-200gb"
+    }
+    eu-north1 = {
+      cpu_nodes_platform = "cpu-e2"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h100-sxm"
+      gpu_nodes_preset   = "1gpu-16vcpu-200gb"
+    }
+  }
+
+  current_region_defaults = local.regions_default[var.region]
+
+  cpu_nodes_preset   = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset)
+  cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform)
+  gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform)
+  gpu_nodes_preset   = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset)
 }
 
 resource "random_string" "random" {

diff --git a/k8s-inference/main.tf b/k8s-inference/main.tf
@@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" {
       }
     ]
     resources = {
-      platform = var.cpu_nodes_platform
-      preset   = var.cpu_nodes_preset
+      platform = local.cpu_nodes_platform
+      preset   = local.cpu_nodes_preset
     }
     filesystems = var.enable_filestore ? [
       {
@@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
     }
     network_interfaces = [
       {
-        subnet_id = var.subnet_id
+        subnet_id         = var.subnet_id
         public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null
       }
     ]
     resources = {
-      platform = var.gpu_nodes_platform
-      preset   = var.gpu_nodes_preset
+      platform = local.gpu_nodes_platform
+      preset   = local.gpu_nodes_preset
     }
     filesystems = var.enable_filestore ? [
       {

diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars
@@ -1,17 +1,20 @@
 # Cloud environment and network
-# parent_id      = ""                                                                         # The project-id in this context
-# subnet_id      = ""                                                                       # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
-# ssh_user_name  = ""                                                                                               # Username you want to use to connect to the nodes
+# parent_id      = "" # The project-id in this context
+# subnet_id      = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+# region         = "" # Project region
+# ssh_user_name  = "" # Username you want to use to connect to the nodes
 # ssh_public_key = {
 # key  = "put your public ssh key here" OR
 # path = "put path to ssh key here"
 # }
 
-# K8s modes
-cpu_nodes_count  = 1                   # Number of CPU nodes
-cpu_nodes_preset = "16vcpu-64gb"       # The CPU node preset
-gpu_nodes_count  = 1                   # Number of GPU nodes
-gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs.
+# K8s nodes
+cpu_nodes_count = 1 # Number of CPU nodes
+gpu_nodes_count = 1 # Number of GPU nodes
+# cpu_nodes_platform =                 # CPU nodes platform
+# cpu_nodes_preset   =                 # CPU nodes preset
+# gpu_nodes_platform =                 # GPU nodes platform
+# gpu_nodes_preset   =                 # GPU nodes preset
 
 # Observability
 enable_grafana    = true  # Enable or disable Grafana deployment with true or false

diff --git a/k8s-inference/variables.tf b/k8s-inference/variables.tf
@@ -1,4 +1,4 @@
-# K8s cluster
+# Global
 variable "parent_id" {
   description = "Project ID."
   type        = string
@@ -9,6 +9,12 @@ variable "subnet_id" {
   type        = string
 }
 
+variable "region" {
+  description = "The current region."
+  type        = string
+}
+
+# K8s cluster
 variable "k8s_version" {
   description = "Kubernetes version to be used in the cluster."
   type        = string
@@ -114,13 +120,13 @@ variable "cpu_nodes_count" {
 variable "cpu_nodes_platform" {
   description = "Platform for nodes in the CPU-only node group."
   type        = string
-  default     = "cpu-e2"
+  default     = null
 }
 
 variable "cpu_nodes_preset" {
   description = "CPU and RAM configuration for nodes in the CPU-only node group."
   type        = string
-  default     = "16vcpu-64gb"
+  default     = null
 }
 
 variable "cpu_disk_type" {
@@ -145,13 +151,13 @@ variable "gpu_nodes_count" {
 variable "gpu_nodes_platform" {
   description = "Platform for nodes in the GPU node group."
   type        = string
-  default     = "gpu-h100-sxm"
+  default     = null
 }
 
 variable "gpu_nodes_preset" {
   description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group."
   type        = string
-  default     = "1gpu-16vcpu-200gb"
+  default     = null
 }
 
 variable "gpu_disk_type" {

diff --git a/k8s-training/README.md b/k8s-training/README.md
@@ -84,6 +84,7 @@ Additional configurable variables can be found in the `variables.tf` file.
 # Cloud environment and network
 parent_id      = "" # The project-id in this context
 subnet_id      = "" # Run the `nebius vpc v1alpha1 network list` command to see the subnet id
+region         = "" # The project region: "eu-north1" or "eu-west1" (selects the default platforms, presets and InfiniBand fabric)
 ssh_user_name  = "" # Username you want to use to connect to the nodes
 ssh_public_key = {
   key  = "Enter your public SSH key here" OR

diff --git a/k8s-training/applications.tf b/k8s-training/applications.tf
@@ -12,8 +12,8 @@ module "kuberay" {
 
   parent_id        = var.parent_id
   cluster_id       = nebius_mk8s_v1_cluster.k8s-cluster.id
-  gpu_platform     = var.gpu_nodes_platform
-  cpu_platform     = var.cpu_nodes_platform
+  gpu_platform     = local.gpu_nodes_platform
+  cpu_platform     = local.cpu_nodes_platform
   min_gpu_replicas = var.kuberay_min_gpu_replicas
   max_gpu_replicas = var.kuberay_max_gpu_replicas
 }
diff --git a/k8s-training/gluster-fs.tf b/k8s-training/gluster-fs.tf
@@ -7,4 +7,6 @@ module "glusterfs" {
   disk_count_per_vm = var.glusterfs_disk_count_per_vm
   disk_size         = var.glusterfs_disk_size
   ssh_public_key    = local.ssh_public_key
+  platform          = local.cpu_nodes_platform
+  preset            = local.cpu_nodes_preset
 }
diff --git a/k8s-training/gpu_cluster.tf b/k8s-training/gpu_cluster.tf
@@ -1,5 +1,5 @@
 resource "nebius_compute_v1_gpu_cluster" "fabric_2" {
-  infiniband_fabric = var.infiniband_fabric
+  infiniband_fabric = local.infiniband_fabric
   parent_id         = var.parent_id
-  name              = join("-", [var.infiniband_fabric, local.release-suffix])
+  name              = join("-", [local.infiniband_fabric, local.release-suffix])
 }
diff --git a/k8s-training/helm.tf b/k8s-training/helm.tf
@@ -39,7 +39,7 @@ module "o11y" {
       enabled = var.enable_dcgm,
       node_groups = {
         node_group_name = {
-          gpus              = tonumber(split("gpu-", var.gpu_nodes_preset)[0])
+          gpus              = tonumber(split("gpu-", local.gpu_nodes_preset)[0])
           instance_group_id = nebius_mk8s_v1_node_group.gpu.id
         }
       }

diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf
@@ -2,6 +2,31 @@ locals {
   release-suffix = random_string.random.result
   ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : (
   fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null)
+
+  regions_default = {
+    eu-west1 = {
+      cpu_nodes_platform = "cpu-d3"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h200-sxm"
+      gpu_nodes_preset   = "8gpu-128vcpu-1600gb"
+      infiniband_fabric  = "fabric-5"
+    }
+    eu-north1 = {
+      cpu_nodes_platform = "cpu-e2"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h100-sxm"
+      gpu_nodes_preset   = "8gpu-128vcpu-1600gb"
+      infiniband_fabric  = "fabric-3"
+    }
+  }
+
+  current_region_defaults = local.regions_default[var.region]
+
+  cpu_nodes_preset   = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset)
+  cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform)
+  gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform)
+  gpu_nodes_preset   = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset)
+  infiniband_fabric  = coalesce(var.infiniband_fabric, local.current_region_defaults.infiniband_fabric)
 }
 
 resource "random_string" "random" {

diff --git a/k8s-training/main.tf b/k8s-training/main.tf
@@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" {
       }
     ]
     resources = {
-      platform = var.cpu_nodes_platform
-      preset   = var.cpu_nodes_preset
+      platform = local.cpu_nodes_platform
+      preset   = local.cpu_nodes_preset
     }
     filesystems = var.enable_filestore ? [
       {
@@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
     }
     network_interfaces = [
       {
-        subnet_id = var.subnet_id
+        subnet_id         = var.subnet_id
         public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null
       }
     ]
     resources = {
-      platform = var.gpu_nodes_platform
-      preset   = var.gpu_nodes_preset
+      platform = local.gpu_nodes_platform
+      preset   = local.gpu_nodes_preset
     }
     filesystems = var.enable_filestore ? [
       {

diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars
@@ -1,24 +1,28 @@
 # Cloud environment and network
 # parent_id      = "" # The project-id in this context
 # subnet_id      = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+# region         = "" # Project region
 # ssh_user_name  = "" # Username you want to use to connect to the nodes
 # ssh_public_key = {
 # key  = "put your public ssh key here" OR
 # path = "put path to ssh key here"
 # }
 
-# K8s modes
-cpu_nodes_count  = 1                     # Number of CPU nodes
-cpu_nodes_preset = "16vcpu-64gb"         # The CPU node preset
-gpu_nodes_count  = 1                     # Number of GPU nodes
-gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection
+# K8s nodes
+cpu_nodes_count = 1 # Number of CPU nodes
+gpu_nodes_count = 1 # Number of GPU nodes
+# cpu_nodes_platform =                 # CPU nodes platform
+# cpu_nodes_preset   =                 # CPU nodes preset
+# gpu_nodes_platform =                 # GPU nodes platform
+# gpu_nodes_preset   =                 # GPU nodes preset
+# infiniband_fabric  =                 # Infiniband fabric name.
 
 
 # Observability
-enable_grafana    = true # Enable or disable Grafana deployment with true or false
-enable_prometheus = true # Enable or disable Prometheus deployment with true or false
+enable_grafana    = true  # Enable or disable Grafana deployment with true or false
+enable_prometheus = true  # Enable or disable Prometheus deployment with true or false
 enable_loki       = false # Enable or disable Loki deployment with true or false
-enable_dcgm       = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false
+enable_dcgm       = true  # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false
 
 ## Loki
 # loki_access_key_id = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki.

diff --git a/k8s-training/variables.tf b/k8s-training/variables.tf
@@ -1,4 +1,4 @@
-# K8s cluster
+# Global
 variable "parent_id" {
   description = "Project ID."
   type        = string
@@ -9,6 +9,12 @@ variable "subnet_id" {
   type        = string
 }
 
+variable "region" {
+  description = "The current region."
+  type        = string
+}
+
+# K8s cluster
 variable "k8s_version" {
   description = "Kubernetes version to be used in the cluster."
   type        = string
@@ -114,13 +120,13 @@ variable "cpu_nodes_count" {
 variable "cpu_nodes_platform" {
   description = "Platform for nodes in the CPU-only node group."
   type        = string
-  default     = "cpu-e2"
+  default     = null
 }
 
 variable "cpu_nodes_preset" {
   description = "CPU and RAM configuration for nodes in the CPU-only node group."
   type        = string
-  default     = "16vcpu-64gb"
+  default     = null
 }
 
 variable "cpu_disk_type" {
@@ -145,13 +151,13 @@ variable "gpu_nodes_count" {
 variable "gpu_nodes_platform" {
   description = "Platform for nodes in the GPU node group."
   type        = string
-  default     = "gpu-h100-sxm"
+  default     = null
 }
 
 variable "gpu_nodes_preset" {
   description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group."
   type        = string
-  default     = "8gpu-128vcpu-1600gb"
+  default     = null
 }
 
 variable "gpu_disk_type" {
@@ -169,7 +175,7 @@ variable "gpu_disk_size" {
 variable "infiniband_fabric" {
   description = "Infiniband's fabric name."
   type        = string
-  default     = "fabric-3"
+  default     = null
 }
 
 variable "gpu_nodes_assign_public_ip" {

diff --git a/modules/gluster-module/instances.tf b/modules/gluster-module/instances.tf
@@ -14,8 +14,8 @@ resource "nebius_compute_v1_instance" "gluster-fs-instance" {
     }
   ]
   resources = {
-    platform = "cpu-e2"
-    preset   = "16vcpu-64gb"
+    platform = var.platform
+    preset   = var.preset
   }
 
   boot_disk = {