nebius · shoguevara · Nov 25, 2024 · Oct 28, 2024 · Oct 28, 2024 · Oct 28, 2024
@@ -39,6 +39,7 @@ jobs:
 
     env:
       TF_VAR_subnet_id: vpcsubnet-e00dgdntmhgkeej1z3
+      TF_VAR_region: eu-north1
       TF_VAR_loki_access_key_id: ${{ secrets.SA_ACCESS_KEY_ID }}
       TF_VAR_loki_secret_key: ${{ secrets.SA_SECRET_KEY }}
 

@@ -75,6 +75,7 @@ There are additional configurable variables in `variables.tf`.
 # Cloud environment and network
 parent_id      = "" # The project-id in this context
 subnet_id      = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+region         = "" # The project region.
 ssh_user_name  = "" # Username you want to use to connect to the nodes
 ssh_public_key = {
   key  = "put your public ssh key here" OR
@@ -266,13 +267,13 @@ apiVersion: v1
 metadata:
   name: external-storage-persistent-volume
 spec:
-  storageClassName: hostpath
+  storageClassName: csi-mounted-fs-path-sc
   capacity:
     storage: "<SIZE>"
   accessModes:
     - ReadWriteMany
   hostPath:
-    path: "<HOST-PATH>" # "/mnt/filestore/<sub-directory>" or "/mnt/glusterfs/<sub-directory>"
+    path: "<HOST-PATH>" # "/mnt/data/<sub-directory>" or "/mnt/glusterfs/<sub-directory>"
 
 ---
 
@@ -281,10 +282,21 @@ apiVersion: v1
 metadata:
   name: external-storage-persistent-volumeclaim
 spec:
-  storageClassName: hostpath
+  storageClassName: csi-mounted-fs-path-sc
   accessModes:
     - ReadWriteMany
   resources:
     requests:
       storage: "<SIZE>"
 ```
+
+## CSI limitations:
+- The FS should be mounted on all NodeGroups, because attaching a PV to a pod running on a Node without the FS will fail
+- A single PV may fill up the entire shared FS
+- FS size will not be auto-updated if the PV size exceeds its spec size
+- FS size for now can't be updated through API, only through NEBOPS. (thread)
+- `volumeMode: Block` is not possible
+
+## Good to know:
+- PVs in ReadWriteMany mode will work
+- MSP started testing that solution to enable early integration with mk8s.
@@ -7,4 +7,6 @@ module "glusterfs" {
   disk_count_per_vm = var.glusterfs_disk_count_per_vm
   disk_size         = var.glusterfs_disk_size
   ssh_public_key    = local.ssh_public_key
+  platform          = local.cpu_nodes_platform
+  preset            = local.cpu_nodes_preset
 }
@@ -30,7 +30,7 @@ module "o11y" {
       enabled = var.enable_dcgm,
       node_groups = {
         node_group_name = {
-          gpus              = tonumber(split("gpu-", var.gpu_nodes_preset)[0])
+          gpus              = tonumber(split("gpu-", local.gpu_nodes_preset)[0])
           instance_group_id = nebius_mk8s_v1_node_group.gpu.id
         }
       }
@@ -39,3 +39,8 @@ module "o11y" {
   }
   test_mode = var.test_mode
 }
+
+module "csi-mounted-fs-path" {
+  source = "../modules/csi-mounted-fs-path"
+  count  = var.enable_filestore ? 1 : 0
+}
@@ -2,6 +2,28 @@ locals {
   release-suffix = random_string.random.result
   ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : (
   fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null)
+
+  regions_default = {
+    eu-west1 = {
+      cpu_nodes_platform = "cpu-d3"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h200-sxm"
+      gpu_nodes_preset   = "1gpu-16vcpu-200gb"
+    }
+    eu-north1 = {
+      cpu_nodes_platform = "cpu-e2"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h100-sxm"
+      gpu_nodes_preset   = "1gpu-16vcpu-200gb"
+    }
+  }
+
+  current_region_defaults = local.regions_default[var.region]
+
+  cpu_nodes_preset   = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset)
+  cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform)
+  gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform)
+  gpu_nodes_preset   = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset)
 }
 
 resource "random_string" "random" {

@@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" {
       }
     ]
     resources = {
-      platform = var.cpu_nodes_platform
-      preset   = var.cpu_nodes_preset
+      platform = local.cpu_nodes_platform
+      preset   = local.cpu_nodes_preset
     }
     filesystems = var.enable_filestore ? [
       {
@@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
     }
     network_interfaces = [
       {
-        subnet_id = var.subnet_id
-        public_ip = var.gpu_nodes_assign_public_ip ? {} : null
+        subnet_id         = var.subnet_id
+        public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null
       }
     ]
     resources = {
-      platform = var.gpu_nodes_platform
-      preset   = var.gpu_nodes_preset
+      platform = local.gpu_nodes_platform
+      preset   = local.gpu_nodes_preset
     }
     filesystems = var.enable_filestore ? [
       {

@@ -1,17 +1,20 @@
 # Cloud environment and network
-# parent_id      = ""                                                                         # The project-id in this context
-# subnet_id      = ""                                                                       # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
-# ssh_user_name  = ""                                                                                               # Username you want to use to connect to the nodes
+# parent_id      = "" # The project-id in this context
+# subnet_id      = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+# region         = "" # Project region
+# ssh_user_name  = "" # Username you want to use to connect to the nodes
 # ssh_public_key = {
 # key  = "put your public ssh key here" OR
 # path = "put path to ssh key here"
 # }
 
-# K8s modes
-cpu_nodes_count  = 1                   # Number of CPU nodes
-cpu_nodes_preset = "16vcpu-64gb"       # The CPU node preset
-gpu_nodes_count  = 1                   # Number of GPU nodes
-gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs.
+# K8s nodes
+cpu_nodes_count = 1 # Number of CPU nodes
+gpu_nodes_count = 1 # Number of GPU nodes
+# cpu_nodes_platform =                 # CPU nodes platform
+# cpu_nodes_preset   =                 # CPU nodes preset
+# gpu_nodes_platform =                 # GPU nodes platform
+# gpu_nodes_preset   =                 # GPU nodes preset
 
 # Observability
 enable_grafana    = true  # Enable or disable Grafana deployment with true or false

@@ -1,4 +1,4 @@
-# K8s cluster
+# Global
 variable "parent_id" {
   description = "Project ID."
   type        = string
@@ -9,6 +9,12 @@ variable "subnet_id" {
   type        = string
 }
 
+variable "region" {
+  description = "The current region."
+  type        = string
+}
+
+# K8s cluster
 variable "k8s_version" {
   description = "Kubernetes version to be used in the cluster."
   type        = string
@@ -114,13 +120,13 @@ variable "cpu_nodes_count" {
 variable "cpu_nodes_platform" {
   description = "Platform for nodes in the CPU-only node group."
   type        = string
-  default     = "cpu-e2"
+  default     = null
 }
 
 variable "cpu_nodes_preset" {
   description = "CPU and RAM configuration for nodes in the CPU-only node group."
   type        = string
-  default     = "16vcpu-64gb"
+  default     = null
 }
 
 variable "cpu_disk_type" {
@@ -145,13 +151,13 @@ variable "gpu_nodes_count" {
 variable "gpu_nodes_platform" {
   description = "Platform for nodes in the GPU node group."
   type        = string
-  default     = "gpu-h100-sxm"
+  default     = null
 }
 
 variable "gpu_nodes_preset" {
   description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group."
   type        = string
-  default     = "1gpu-16vcpu-200gb"
+  default     = null
 }
 
 variable "gpu_disk_type" {