Skip to content

Commit

Permalink
Merge pull request #94 from nebius/feature/platform-and-preset-moved-…
Browse files Browse the repository at this point in the history
…to-parameters

Platform and preset moved to variables across the library;
  • Loading branch information
elijah-k-nebius authored Nov 25, 2024
2 parents 8584e20 + 3a0ab6b commit 88aab79
Show file tree
Hide file tree
Showing 33 changed files with 275 additions and 75 deletions.
1 change: 1 addition & 0 deletions .github/workflows/terraform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ jobs:

env:
TF_VAR_subnet_id: vpcsubnet-e00dgdntmhgkeej1z3
TF_VAR_region: eu-north1
TF_VAR_loki_access_key_id: ${{ secrets.SA_ACCESS_KEY_ID }}
TF_VAR_loki_secret_key: ${{ secrets.SA_SECRET_KEY }}

Expand Down
1 change: 1 addition & 0 deletions k8s-inference/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ There are additional configurable variables in `variables.tf`.
# Cloud environment and network
parent_id = "" # The project-id in this context
subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
region = "" # The project region.
ssh_user_name = "" # Username you want to use to connect to the nodes
ssh_public_key = {
key = "put your public ssh key here" OR
Expand Down
2 changes: 2 additions & 0 deletions k8s-inference/gluster-fs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ module "glusterfs" {
disk_count_per_vm = var.glusterfs_disk_count_per_vm
disk_size = var.glusterfs_disk_size
ssh_public_key = local.ssh_public_key
platform = local.cpu_nodes_platform
preset = local.cpu_nodes_preset
}
2 changes: 1 addition & 1 deletion k8s-inference/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ module "o11y" {
enabled = var.enable_dcgm,
node_groups = {
node_group_name = {
gpus = tonumber(split("gpu-", var.gpu_nodes_preset)[0])
gpus = tonumber(split("gpu-", local.gpu_nodes_preset)[0])
instance_group_id = nebius_mk8s_v1_node_group.gpu.id
}
}
Expand Down
22 changes: 22 additions & 0 deletions k8s-inference/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,28 @@ locals {
release-suffix = random_string.random.result
ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : (
fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null)

regions_default = {
eu-west1 = {
cpu_nodes_platform = "cpu-d3"
cpu_nodes_preset = "16vcpu-64gb"
gpu_nodes_platform = "gpu-h200-sxm"
gpu_nodes_preset = "1gpu-16vcpu-200gb"
}
eu-north1 = {
cpu_nodes_platform = "cpu-e2"
cpu_nodes_preset = "16vcpu-64gb"
gpu_nodes_platform = "gpu-h100-sxm"
gpu_nodes_preset = "1gpu-16vcpu-200gb"
}
}

current_region_defaults = local.regions_default[var.region]

cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset)
cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform)
gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform)
gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset)
}

resource "random_string" "random" {
Expand Down
10 changes: 5 additions & 5 deletions k8s-inference/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" {
}
]
resources = {
platform = var.cpu_nodes_platform
preset = var.cpu_nodes_preset
platform = local.cpu_nodes_platform
preset = local.cpu_nodes_preset
}
filesystems = var.enable_filestore ? [
{
Expand Down Expand Up @@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
}
network_interfaces = [
{
subnet_id = var.subnet_id
subnet_id = var.subnet_id
public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null
}
]
resources = {
platform = var.gpu_nodes_platform
preset = var.gpu_nodes_preset
platform = local.gpu_nodes_platform
preset = local.gpu_nodes_preset
}
filesystems = var.enable_filestore ? [
{
Expand Down
19 changes: 11 additions & 8 deletions k8s-inference/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
# Cloud environment and network
# parent_id = "" # The project-id in this context
# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
# ssh_user_name = "" # Username you want to use to connect to the nodes
# parent_id = "" # The project-id in this context
# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
# region = "" # Project region
# ssh_user_name = "" # Username you want to use to connect to the nodes
# ssh_public_key = {
# key = "put your public ssh key here" OR
# path = "put path to ssh key here"
# }

# K8s modes
cpu_nodes_count = 1 # Number of CPU nodes
cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset
gpu_nodes_count = 1 # Number of GPU nodes
gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs.
# K8s nodes
cpu_nodes_count = 1 # Number of CPU nodes
gpu_nodes_count = 1 # Number of GPU nodes
# cpu_nodes_platform = # CPU nodes platform
# cpu_nodes_preset = # CPU nodes preset
# gpu_nodes_platform = # GPU nodes platform
# gpu_nodes_preset = # GPU nodes preset

# Observability
enable_grafana = true # Enable or disable Grafana deployment with true or false
Expand Down
16 changes: 11 additions & 5 deletions k8s-inference/variables.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# K8s cluster
# Global
variable "parent_id" {
description = "Project ID."
type = string
Expand All @@ -9,6 +9,12 @@ variable "subnet_id" {
type = string
}

# Region of the project. The value is used as a key into
# local.regions_default (locals.tf) to pick the default CPU/GPU platform and
# preset, so it must be one of the regions defined there.
variable "region" {
  description = "Project region. Selects the default node platforms and presets; must be one of: eu-north1, eu-west1."
  type        = string

  # Fail early with a readable message instead of an "Invalid index" error
  # raised by the local.regions_default[var.region] lookup.
  # NOTE: keep this list in sync with the keys of local.regions_default; the
  # condition only refers to var.region so it also works on Terraform
  # versions that do not allow validations to reference other objects.
  validation {
    condition     = contains(["eu-north1", "eu-west1"], var.region)
    error_message = "Unsupported region. Supported values are: eu-north1, eu-west1."
  }
}

# K8s cluster
variable "k8s_version" {
description = "Kubernetes version to be used in the cluster."
type = string
Expand Down Expand Up @@ -114,13 +120,13 @@ variable "cpu_nodes_count" {
variable "cpu_nodes_platform" {
description = "Platform for nodes in the CPU-only node group."
type = string
default = "cpu-e2"
default = null
}

variable "cpu_nodes_preset" {
description = "CPU and RAM configuration for nodes in the CPU-only node group."
type = string
default = "16vcpu-64gb"
default = null
}

variable "cpu_disk_type" {
Expand All @@ -145,13 +151,13 @@ variable "gpu_nodes_count" {
variable "gpu_nodes_platform" {
description = "Platform for nodes in the GPU node group."
type = string
default = "gpu-h100-sxm"
default = null
}

variable "gpu_nodes_preset" {
description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group."
type = string
default = "1gpu-16vcpu-200gb"
default = null
}

variable "gpu_disk_type" {
Expand Down
1 change: 1 addition & 0 deletions k8s-training/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ Additional configurable variables can be found in the `variables.tf` file.
# Cloud environment and network
parent_id = "" # The project-id in this context
subnet_id = "" # Run the `nebius vpc v1alpha1 network list` command to see the subnet id
region = "" # The project region
ssh_user_name = "" # Username you want to use to connect to the nodes
ssh_public_key = {
key = "Enter your public SSH key here" OR
Expand Down
4 changes: 2 additions & 2 deletions k8s-training/applications.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ module "kuberay" {

parent_id = var.parent_id
cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id
gpu_platform = var.gpu_nodes_platform
cpu_platform = var.cpu_nodes_platform
gpu_platform = local.gpu_nodes_platform
cpu_platform = local.cpu_nodes_platform
min_gpu_replicas = var.kuberay_min_gpu_replicas
max_gpu_replicas = var.kuberay_max_gpu_replicas
}
2 changes: 2 additions & 0 deletions k8s-training/gluster-fs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ module "glusterfs" {
disk_count_per_vm = var.glusterfs_disk_count_per_vm
disk_size = var.glusterfs_disk_size
ssh_public_key = local.ssh_public_key
platform = local.cpu_nodes_platform
preset = local.cpu_nodes_preset
}
4 changes: 2 additions & 2 deletions k8s-training/gpu_cluster.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
resource "nebius_compute_v1_gpu_cluster" "fabric_2" {
infiniband_fabric = var.infiniband_fabric
infiniband_fabric = local.infiniband_fabric
parent_id = var.parent_id
name = join("-", [var.infiniband_fabric, local.release-suffix])
name = join("-", [local.infiniband_fabric, local.release-suffix])
}
2 changes: 1 addition & 1 deletion k8s-training/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ module "o11y" {
enabled = var.enable_dcgm,
node_groups = {
node_group_name = {
gpus = tonumber(split("gpu-", var.gpu_nodes_preset)[0])
gpus = tonumber(split("gpu-", local.gpu_nodes_preset)[0])
instance_group_id = nebius_mk8s_v1_node_group.gpu.id
}
}
Expand Down
25 changes: 25 additions & 0 deletions k8s-training/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,31 @@ locals {
release-suffix = random_string.random.result
ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : (
fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null)

regions_default = {
eu-west1 = {
cpu_nodes_platform = "cpu-d3"
cpu_nodes_preset = "16vcpu-64gb"
gpu_nodes_platform = "gpu-h200-sxm"
gpu_nodes_preset = "8gpu-128vcpu-1600gb"
infiniband_fabric = "fabric-5"
}
eu-north1 = {
cpu_nodes_platform = "cpu-e2"
cpu_nodes_preset = "16vcpu-64gb"
gpu_nodes_platform = "gpu-h100-sxm"
gpu_nodes_preset = "8gpu-128vcpu-1600gb"
infiniband_fabric = "fabric-3"
}
}

current_region_defaults = local.regions_default[var.region]

cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset)
cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform)
gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform)
gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset)
infiniband_fabric = coalesce(var.infiniband_fabric, local.current_region_defaults.infiniband_fabric)
}

resource "random_string" "random" {
Expand Down
10 changes: 5 additions & 5 deletions k8s-training/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" {
}
]
resources = {
platform = var.cpu_nodes_platform
preset = var.cpu_nodes_preset
platform = local.cpu_nodes_platform
preset = local.cpu_nodes_preset
}
filesystems = var.enable_filestore ? [
{
Expand Down Expand Up @@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
}
network_interfaces = [
{
subnet_id = var.subnet_id
subnet_id = var.subnet_id
public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null
}
]
resources = {
platform = var.gpu_nodes_platform
preset = var.gpu_nodes_preset
platform = local.gpu_nodes_platform
preset = local.gpu_nodes_preset
}
filesystems = var.enable_filestore ? [
{
Expand Down
20 changes: 12 additions & 8 deletions k8s-training/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# Cloud environment and network
# parent_id = "" # The project-id in this context
# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
# region = "" # Project region
# ssh_user_name = "" # Username you want to use to connect to the nodes
# ssh_public_key = {
# key = "put your public ssh key here" OR
# path = "put path to ssh key here"
# }

# K8s modes
cpu_nodes_count = 1 # Number of CPU nodes
cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset
gpu_nodes_count = 1 # Number of GPU nodes
gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection
# K8s nodes
cpu_nodes_count = 1 # Number of CPU nodes
gpu_nodes_count = 1 # Number of GPU nodes
# cpu_nodes_platform = # CPU nodes platform
# cpu_nodes_preset = # CPU nodes preset
# gpu_nodes_platform = # GPU nodes platform
# gpu_nodes_preset = # GPU nodes preset
# infiniband_fabric = # Infiniband fabric name.


# Observability
enable_grafana = true # Enable or disable Grafana deployment with true or false
enable_prometheus = true # Enable or disable Prometheus deployment with true or false
enable_grafana = true # Enable or disable Grafana deployment with true or false
enable_prometheus = true # Enable or disable Prometheus deployment with true or false
enable_loki = false # Enable or disable Loki deployment with true or false
enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false
enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false

## Loki
# loki_access_key_id = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki.
Expand Down
18 changes: 12 additions & 6 deletions k8s-training/variables.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# K8s cluster
# Global
variable "parent_id" {
description = "Project ID."
type = string
Expand All @@ -9,6 +9,12 @@ variable "subnet_id" {
type = string
}

# Region of the project. The value is used as a key into
# local.regions_default (locals.tf) to pick the default CPU/GPU platform,
# preset and InfiniBand fabric, so it must be one of the regions defined there.
variable "region" {
  description = "Project region. Selects the default node platforms, presets and InfiniBand fabric; must be one of: eu-north1, eu-west1."
  type        = string

  # Fail early with a readable message instead of an "Invalid index" error
  # raised by the local.regions_default[var.region] lookup.
  # NOTE: keep this list in sync with the keys of local.regions_default; the
  # condition only refers to var.region so it also works on Terraform
  # versions that do not allow validations to reference other objects.
  validation {
    condition     = contains(["eu-north1", "eu-west1"], var.region)
    error_message = "Unsupported region. Supported values are: eu-north1, eu-west1."
  }
}

# K8s cluster
variable "k8s_version" {
description = "Kubernetes version to be used in the cluster."
type = string
Expand Down Expand Up @@ -114,13 +120,13 @@ variable "cpu_nodes_count" {
variable "cpu_nodes_platform" {
description = "Platform for nodes in the CPU-only node group."
type = string
default = "cpu-e2"
default = null
}

variable "cpu_nodes_preset" {
description = "CPU and RAM configuration for nodes in the CPU-only node group."
type = string
default = "16vcpu-64gb"
default = null
}

variable "cpu_disk_type" {
Expand All @@ -145,13 +151,13 @@ variable "gpu_nodes_count" {
variable "gpu_nodes_platform" {
description = "Platform for nodes in the GPU node group."
type = string
default = "gpu-h100-sxm"
default = null
}

variable "gpu_nodes_preset" {
description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group."
type = string
default = "8gpu-128vcpu-1600gb"
default = null
}

variable "gpu_disk_type" {
Expand All @@ -169,7 +175,7 @@ variable "gpu_disk_size" {
variable "infiniband_fabric" {
description = "Infiniband's fabric name."
type = string
default = "fabric-3"
default = null
}

variable "gpu_nodes_assign_public_ip" {
Expand Down
4 changes: 2 additions & 2 deletions modules/gluster-module/instances.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ resource "nebius_compute_v1_instance" "gluster-fs-instance" {
}
]
resources = {
platform = "cpu-e2"
preset = "16vcpu-64gb"
platform = var.platform
preset = var.preset
}

boot_disk = {
Expand Down
Loading

0 comments on commit 88aab79

Please sign in to comment.