diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index 25cf059d..24e8a75a 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -39,6 +39,7 @@ jobs: env: TF_VAR_subnet_id: vpcsubnet-e00dgdntmhgkeej1z3 + TF_VAR_region: eu-north1 TF_VAR_loki_access_key_id: ${{ secrets.SA_ACCESS_KEY_ID }} TF_VAR_loki_secret_key: ${{ secrets.SA_SECRET_KEY }} diff --git a/k8s-inference/README.md b/k8s-inference/README.md index a57220b4..42bfb756 100644 --- a/k8s-inference/README.md +++ b/k8s-inference/README.md @@ -75,6 +75,7 @@ There are additional configurable variables in `variables.tf`. # Cloud environment and network parent_id = "" # The project-id in this context subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +region = "" # The project region. ssh_user_name = "" # Username you want to use to connect to the nodes ssh_public_key = { key = "put your public ssh key here" OR diff --git a/k8s-inference/gluster-fs.tf b/k8s-inference/gluster-fs.tf index 6a87b0e8..8d800f53 100644 --- a/k8s-inference/gluster-fs.tf +++ b/k8s-inference/gluster-fs.tf @@ -7,4 +7,6 @@ module "glusterfs" { disk_count_per_vm = var.glusterfs_disk_count_per_vm disk_size = var.glusterfs_disk_size ssh_public_key = local.ssh_public_key + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } diff --git a/k8s-inference/helm.tf b/k8s-inference/helm.tf index 41ca13e2..1e09c23b 100644 --- a/k8s-inference/helm.tf +++ b/k8s-inference/helm.tf @@ -30,7 +30,7 @@ module "o11y" { enabled = var.enable_dcgm, node_groups = { node_group_name = { - gpus = tonumber(split("gpu-", var.gpu_nodes_preset)[0]) + gpus = tonumber(split("gpu-", local.gpu_nodes_preset)[0]) instance_group_id = nebius_mk8s_v1_node_group.gpu.id } } diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf index a4fb0a61..4edf97ef 100644 --- a/k8s-inference/locals.tf +++ b/k8s-inference/locals.tf @@ -2,6 +2,28 @@ locals { release-suffix = 
random_string.random.result ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + } + eu-north1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) + gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform) + gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset) } resource "random_string" "random" { diff --git a/k8s-inference/main.tf b/k8s-inference/main.tf index 52041518..76f605c8 100644 --- a/k8s-inference/main.tf +++ b/k8s-inference/main.tf @@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" { } ] resources = { - platform = var.cpu_nodes_platform - preset = var.cpu_nodes_preset + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } filesystems = var.enable_filestore ? [ { @@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } network_interfaces = [ { - subnet_id = var.subnet_id + subnet_id = var.subnet_id public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null } ] resources = { - platform = var.gpu_nodes_platform - preset = var.gpu_nodes_preset + platform = local.gpu_nodes_platform + preset = local.gpu_nodes_preset } filesystems = var.enable_filestore ? 
[ { diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 7ff9869d..35b76296 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -1,17 +1,20 @@ # Cloud environment and network -# parent_id = "" # The project-id in this context -# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id -# ssh_user_name = "" # Username you want to use to connect to the nodes +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR # path = "put path to ssh key here" # } -# K8s modes -cpu_nodes_count = 1 # Number of CPU nodes -cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset -gpu_nodes_count = 1 # Number of GPU nodes -gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs. +# K8s nodes +cpu_nodes_count = 1 # Number of CPU nodes +gpu_nodes_count = 1 # Number of GPU nodes +# cpu_nodes_platform = # CPU nodes platform +# cpu_nodes_preset = # CPU nodes preset +# gpu_nodes_platform = # GPU nodes platform +# gpu_nodes_preset = # GPU nodes preset # Observability enable_grafana = true # Enable or disable Grafana deployment with true or false diff --git a/k8s-inference/variables.tf b/k8s-inference/variables.tf index ec1a994d..1087a015 100644 --- a/k8s-inference/variables.tf +++ b/k8s-inference/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,12 @@ variable "subnet_id" { type = string } +variable "region" { + description = "The current region." + type = string +} + +# K8s cluster variable "k8s_version" { description = "Kubernetes version to be used in the cluster." 
type = string @@ -114,13 +120,13 @@ variable "cpu_nodes_count" { variable "cpu_nodes_platform" { description = "Platform for nodes in the CPU-only node group." type = string - default = "cpu-e2" + default = null } variable "cpu_nodes_preset" { description = "CPU and RAM configuration for nodes in the CPU-only node group." type = string - default = "16vcpu-64gb" + default = null } variable "cpu_disk_type" { @@ -145,13 +151,13 @@ variable "gpu_nodes_count" { variable "gpu_nodes_platform" { description = "Platform for nodes in the GPU node group." type = string - default = "gpu-h100-sxm" + default = null } variable "gpu_nodes_preset" { description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group." type = string - default = "1gpu-16vcpu-200gb" + default = null } variable "gpu_disk_type" { diff --git a/k8s-training/README.md b/k8s-training/README.md index 7de0fd2a..1c62e18a 100644 --- a/k8s-training/README.md +++ b/k8s-training/README.md @@ -84,6 +84,7 @@ Additional configurable variables can be found in the `variables.tf` file. 
# Cloud environment and network parent_id = "" # The project-id in this context subnet_id = "" # Run the `nebius vpc v1alpha1 network list` command to see the subnet id +region = "" # The project region ssh_user_name = "" # Username you want to use to connect to the nodes ssh_public_key = { key = "Enter your public SSH key here" OR diff --git a/k8s-training/applications.tf b/k8s-training/applications.tf index 3e84067a..d48326c5 100644 --- a/k8s-training/applications.tf +++ b/k8s-training/applications.tf @@ -12,8 +12,8 @@ module "kuberay" { parent_id = var.parent_id cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id - gpu_platform = var.gpu_nodes_platform - cpu_platform = var.cpu_nodes_platform + gpu_platform = local.gpu_nodes_platform + cpu_platform = local.cpu_nodes_platform min_gpu_replicas = var.kuberay_min_gpu_replicas max_gpu_replicas = var.kuberay_max_gpu_replicas } diff --git a/k8s-training/gluster-fs.tf b/k8s-training/gluster-fs.tf index 6a87b0e8..8d800f53 100644 --- a/k8s-training/gluster-fs.tf +++ b/k8s-training/gluster-fs.tf @@ -7,4 +7,6 @@ module "glusterfs" { disk_count_per_vm = var.glusterfs_disk_count_per_vm disk_size = var.glusterfs_disk_size ssh_public_key = local.ssh_public_key + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } diff --git a/k8s-training/gpu_cluster.tf b/k8s-training/gpu_cluster.tf index 89cce8c5..472d950f 100644 --- a/k8s-training/gpu_cluster.tf +++ b/k8s-training/gpu_cluster.tf @@ -1,5 +1,5 @@ resource "nebius_compute_v1_gpu_cluster" "fabric_2" { - infiniband_fabric = var.infiniband_fabric + infiniband_fabric = local.infiniband_fabric parent_id = var.parent_id - name = join("-", [var.infiniband_fabric, local.release-suffix]) + name = join("-", [local.infiniband_fabric, local.release-suffix]) } diff --git a/k8s-training/helm.tf b/k8s-training/helm.tf index 1bf3755f..6bceeef5 100644 --- a/k8s-training/helm.tf +++ b/k8s-training/helm.tf @@ -39,7 +39,7 @@ module "o11y" { enabled = var.enable_dcgm, node_groups = 
{ node_group_name = { - gpus = tonumber(split("gpu-", var.gpu_nodes_preset)[0]) + gpus = tonumber(split("gpu-", local.gpu_nodes_preset)[0]) instance_group_id = nebius_mk8s_v1_node_group.gpu.id } } diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf index a4fb0a61..165efd27 100644 --- a/k8s-training/locals.tf +++ b/k8s-training/locals.tf @@ -2,6 +2,31 @@ locals { release-suffix = random_string.random.result ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "fabric-5" + } + eu-north1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "fabric-3" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) + gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform) + gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset) + infiniband_fabric = coalesce(var.infiniband_fabric, local.current_region_defaults.infiniband_fabric) } resource "random_string" "random" { diff --git a/k8s-training/main.tf b/k8s-training/main.tf index e877161f..869a1b72 100644 --- a/k8s-training/main.tf +++ b/k8s-training/main.tf @@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" { } ] resources = { - platform = var.cpu_nodes_platform - preset = var.cpu_nodes_preset + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } 
filesystems = var.enable_filestore ? [ { @@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } network_interfaces = [ { - subnet_id = var.subnet_id + subnet_id = var.subnet_id public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null } ] resources = { - platform = var.gpu_nodes_platform - preset = var.gpu_nodes_preset + platform = local.gpu_nodes_platform + preset = local.gpu_nodes_preset } filesystems = var.enable_filestore ? [ { diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index 5392c93f..f62dfe05 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -1,24 +1,28 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR # path = "put path to ssh key here" # } -# K8s modes -cpu_nodes_count = 1 # Number of CPU nodes -cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset -gpu_nodes_count = 1 # Number of GPU nodes -gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection +# K8s nodes +cpu_nodes_count = 1 # Number of CPU nodes +gpu_nodes_count = 1 # Number of GPU nodes +# cpu_nodes_platform = # CPU nodes platform +# cpu_nodes_preset = # CPU nodes preset +# gpu_nodes_platform = # GPU nodes platform +# gpu_nodes_preset = # GPU nodes preset +# infiniband_fabric = # Infiniband fabric name. 
# Observability -enable_grafana = true # Enable or disable Grafana deployment with true or false -enable_prometheus = true # Enable or disable Prometheus deployment with true or false +enable_grafana = true # Enable or disable Grafana deployment with true or false +enable_prometheus = true # Enable or disable Prometheus deployment with true or false enable_loki = false # Enable or disable Loki deployment with true or false -enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false +enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false ## Loki # loki_access_key_id = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki. diff --git a/k8s-training/variables.tf b/k8s-training/variables.tf index b17ba14b..65f6b71c 100644 --- a/k8s-training/variables.tf +++ b/k8s-training/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,12 @@ variable "subnet_id" { type = string } +variable "region" { + description = "The current region." + type = string +} + +# K8s cluster variable "k8s_version" { description = "Kubernetes version to be used in the cluster." type = string @@ -114,13 +120,13 @@ variable "cpu_nodes_count" { variable "cpu_nodes_platform" { description = "Platform for nodes in the CPU-only node group." type = string - default = "cpu-e2" + default = null } variable "cpu_nodes_preset" { description = "CPU and RAM configuration for nodes in the CPU-only node group." type = string - default = "16vcpu-64gb" + default = null } variable "cpu_disk_type" { @@ -145,13 +151,13 @@ variable "gpu_nodes_count" { variable "gpu_nodes_platform" { description = "Platform for nodes in the GPU node group." 
type = string - default = "gpu-h100-sxm" + default = null } variable "gpu_nodes_preset" { description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group." type = string - default = "8gpu-128vcpu-1600gb" + default = null } variable "gpu_disk_type" { @@ -169,7 +175,7 @@ variable "gpu_disk_size" { variable "infiniband_fabric" { description = "Infiniband's fabric name." type = string - default = "fabric-3" + default = null } variable "gpu_nodes_assign_public_ip" { diff --git a/modules/gluster-module/instances.tf b/modules/gluster-module/instances.tf index 3c3450ea..ea057e77 100644 --- a/modules/gluster-module/instances.tf +++ b/modules/gluster-module/instances.tf @@ -14,8 +14,8 @@ resource "nebius_compute_v1_instance" "gluster-fs-instance" { } ] resources = { - platform = "cpu-e2" - preset = "16vcpu-64gb" + platform = var.platform + preset = var.preset } boot_disk = { diff --git a/nfs-server/locals.tf b/nfs-server/locals.tf index ed79b470..cba34bbe 100644 --- a/nfs-server/locals.tf +++ b/nfs-server/locals.tf @@ -1,4 +1,20 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + } + eu-north1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) } diff --git a/nfs-server/main.tf b/nfs-server/main.tf index 10d495ac..39085d14 100644 --- a/nfs-server/main.tf +++ b/nfs-server/main.tf @@ -9,4 +9,6 @@ module "nfs-module" { ssh_public_key = var.ssh_public_key.key nfs_ip_range = var.nfs_ip_range nfs_size = var.nfs_size + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } diff --git a/nfs-server/nfs.tfvars b/nfs-server/nfs.tfvars deleted file mode 100644 index 275cf081..00000000 --- a/nfs-server/nfs.tfvars +++ /dev/null @@ -1,8 +0,0 @@ -parent_id = "project-..." -subnet_id = "vpcsubnet-..." 
-ssh_user_name = "nfs" -ssh_public_key = { - key = "put your ssh key here" - # path = "or put path to ssh key here" -} -nfs_ip_range = "192.168.0.0/16" diff --git a/nfs-server/terraform.tfvars b/nfs-server/terraform.tfvars new file mode 100644 index 00000000..b9b7adf1 --- /dev/null +++ b/nfs-server/terraform.tfvars @@ -0,0 +1,9 @@ +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" +# } +nfs_ip_range = "192.168.0.0/16" diff --git a/nfs-server/variables.tf b/nfs-server/variables.tf index 369cefba..a83f100f 100644 --- a/nfs-server/variables.tf +++ b/nfs-server/variables.tf @@ -8,6 +8,23 @@ variable "subnet_id" { description = "ID of the subnet." } +variable "region" { + type = string + description = "Project region." +} + +variable "cpu_nodes_platform" { + description = "Platform for instances." + type = string + default = null +} + +variable "cpu_nodes_preset" { + description = "CPU and RAM configuration for instances." + type = string + default = null +} + variable "nfs_size" { type = number default = 93 * 1024 * 1024 * 1024 # size should be a multiple of 99857989632 diff --git a/slurm/locals.tf b/slurm/locals.tf index ed79b470..90fe2b56 100644 --- a/slurm/locals.tf +++ b/slurm/locals.tf @@ -1,4 +1,27 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) + + + regions_default = { + eu-west1 = { + master_platform = "cpu-d3" + master_preset = "16vcpu-64gb" + worker_platform = "gpu-h200-sxm" + worker_preset = "8gpu-128vcpu-1600gb" + } + eu-north1 = { + master_platform = "cpu-e2" + master_preset = "16vcpu-64gb" + worker_platform = "gpu-h100-sxm" + worker_preset = "8gpu-128vcpu-1600gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + master_platform = coalesce(var.master_platform, local.current_region_defaults.master_platform) + master_preset = coalesce(var.master_preset, local.current_region_defaults.master_preset) + worker_platform = coalesce(var.worker_platform, local.current_region_defaults.worker_platform) + worker_preset = coalesce(var.worker_preset, local.current_region_defaults.worker_preset) } diff --git a/slurm/nfs.tf b/slurm/nfs.tf index 552ab197..d0fe16c8 100644 --- a/slurm/nfs.tf +++ b/slurm/nfs.tf @@ -10,4 +10,6 @@ module "nfs-module" { ssh_public_key = local.ssh_public_key nfs_ip_range = "192.168.0.0/16" nfs_size = var.fs_size + platform = local.master_platform + preset = local.master_preset } diff --git a/slurm/slurm-master.tf b/slurm/slurm-master.tf index 26f3075b..dda58492 100644 --- a/slurm/slurm-master.tf +++ b/slurm/slurm-master.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "master" { name = "slurm-master" parent_id = var.parent_id resources = { - platform = "cpu-e2" - preset = "4vcpu-16gb" + platform = local.master_platform + preset = local.master_preset } boot_disk = { attach_mode = "READ_WRITE" diff --git a/slurm/slurm-worker.tf b/slurm/slurm-worker.tf index c1825c4f..2bbfc0a6 100644 --- a/slurm/slurm-worker.tf +++ b/slurm/slurm-worker.tf @@ -29,8 +29,8 @@ resource "nebius_compute_v1_instance" "worker" { name = each.key parent_id = var.parent_id resources = { - platform = "gpu-h100-sxm" - preset = "8gpu-128vcpu-1600gb" + platform = local.worker_platform + preset = local.worker_preset } gpu_cluster = 
nebius_compute_v1_gpu_cluster.gpu-cluster-slurm diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars index 1858faef..3c1ff8a4 100644 --- a/slurm/terraform.tfvars +++ b/slurm/terraform.tfvars @@ -1,9 +1,16 @@ -parent_id = "project-e00..." -subnet_id = "vpcsubnet-e00..." +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" +# } cluster_workers_count = 2 # amount of workers mysql_jobs_backend = false # Do you want to use mysql shared_fs_type = "filesystem" # "nfs" or "filesystem" -# ssh_public_key = { -# key = "put your public ssh key here" -# path = "put path to ssh key here" -# } + +# master_platform = +# master_preset = +# worker_platform = +# worker_preset = \ No newline at end of file diff --git a/slurm/variables.tf b/slurm/variables.tf index 565a04fc..f128c0d8 100644 --- a/slurm/variables.tf +++ b/slurm/variables.tf @@ -5,6 +5,11 @@ variable "subnet_id" { type = string } +variable "region" { + description = "Project region." + type = string +} + variable "ib_image_id" { type = string description = "ID of Infiniband image" @@ -29,12 +34,29 @@ variable "ssh_public_key" { } } -variable "platform_id" { +variable "master_platform" { + description = "Platform for Slurm Master." + type = string + default = null +} + +variable "master_preset" { + description = "Preset for Slurm Master." type = string - description = "Platform for workers: gpu-h100-b for Inspur or gpu-h100 for Gigabyte" - default = "gpu-h100-b" + default = null } +variable "worker_platform" { + description = "Platform for Slurm Worker." + type = string + default = null +} + +variable "worker_preset" { + description = "Preset for Slurm Worker." 
+ type = string + default = null +} variable "mysql_jobs_backend" { type = bool diff --git a/wireguard/locals.tf b/wireguard/locals.tf index ed79b470..8c7a63f2 100644 --- a/wireguard/locals.tf +++ b/wireguard/locals.tf @@ -1,4 +1,21 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + platform = "cpu-d3" + preset = "16vcpu-64gb" + } + eu-north1 = { + platform = "cpu-e2" + preset = "16vcpu-64gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + platform = coalesce(var.platform, local.current_region_defaults.platform) + preset = coalesce(var.preset, local.current_region_defaults.preset) + } diff --git a/wireguard/main.tf b/wireguard/main.tf index 4bc7ba4d..9b3abc99 100644 --- a/wireguard/main.tf +++ b/wireguard/main.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "wireguard_instance" { ] resources = { - platform = "cpu-e2" - preset = "4vcpu-16gb" + platform = local.platform + preset = local.preset } diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 38986d48..79c25e96 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -1,8 +1,9 @@ -# parent_id = "" -# subnet_id = "" -# ssh_user_name = "ubuntu" +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { -# key = "put your public ssh key here" -# path = "put path to ssh key here" +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" # } -# public_ip_allocation_id = "" +# public_ip_allocation_id = "" \ No newline at end of file diff --git a/wireguard/variables.tf b/wireguard/variables.tf index 8d14ea3b..1b0d7c9b 100644 --- a/wireguard/variables.tf +++ 
b/wireguard/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global parameters variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,25 @@ variable "subnet_id" { type = string } +variable "region" { + description = "Project region." + type = string +} + + +# Platform +variable "platform" { + description = "Platform for WireGuard host." + type = string + default = null +} + +variable "preset" { + description = "Preset for WireGuard host." + type = string + default = null +} + # SSH access variable "ssh_user_name" { description = "SSH username."