From 569d071adb76e1980e5f7a7db9d3755921428f67 Mon Sep 17 00:00:00 2001 From: Irakliy Glunchadze <28791638+iglunchadze@users.noreply.github.com> Date: Mon, 28 Oct 2024 10:17:43 +0100 Subject: [PATCH 01/32] Update formatting --- k8s-training/README.md | 75 +++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/k8s-training/README.md b/k8s-training/README.md index b115bf68..ced07515 100644 --- a/k8s-training/README.md +++ b/k8s-training/README.md @@ -11,54 +11,57 @@ - Installing [Promtail](https://github.com/grafana/helm-charts/tree/main/charts/promtail). ## Prerequisites + 1. Install [Nebius CLI](https://docs.nebius.ai/cli/install/): - ```bash - curl -sSL https://storage.ai.nebius.cloud/nebius/install.sh | bash - ``` + ```bash + curl -sSL https://storage.ai.nebius.cloud/nebius/install.sh | bash + ``` 2. Reload your shell session: - ```bash - exec -l $SHELL - ``` + + ```bash + exec -l $SHELL + ``` + or - ```bash - source ~/.bashrc - ``` + ```bash + source ~/.bashrc + ``` 3. [Configure](https://docs.nebius.ai/cli/configure/) Nebius CLI (it's recommended to use [service account](https://docs.nebius.ai/iam/service-accounts/manage/) for configuration): - ```bash - nebius init - ``` + ```bash + nebius init + ``` 3. Install JQuery (example for Debian based distros): - ```bash - sudo apt install jq -y - ``` + ```bash + sudo apt install jq -y + ``` ## Usage Follow these steps to deploy the Kubernetes cluster: 1. Load environment variables: - ```bash - source ./environment.sh - ``` + ```bash + source ./environment.sh + ``` 2. Initialize Terraform: - ```bash - terraform init - ``` + ```bash + terraform init + ``` 3. Replace the placeholder content in `terraform.tfvars` with actual configuration values to fit your specific requirements. See the details [bellow](#configuration-variables). 4. Preview the deployment plan: - ```bash - terraform plan - ``` + ```bash + terraform plan + ``` 5. 
Apply the configuration: - ```bash - terraform apply - ``` + ```bash + terraform apply + ``` Wait for the operation to complete. ## Configuration Variables @@ -68,6 +71,7 @@ These are the basic configurations needed to deploy Kubernetes for Training in N There are additional configurable variables in `variables.tf`. ### Environment and network variables + ```hcl # Cloud environment and network parent_id = "" # The project-id in this context @@ -80,6 +84,7 @@ ssh_public_key = { ``` ### Kubernetes nodes + ```hcl # K8s modes cpu_nodes_count = 1 # Number of CPU nodes @@ -89,6 +94,7 @@ gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Set to "1gpu-16v ``` ### Observability options + ```hcl # Observability enable_grafana = true # Enable or disable Grafana deployment with true or false @@ -106,6 +112,7 @@ Check the details below for more information on [Grafana](#grafana), [Prometheus > Deploying Loki will require you to create a service account! Please check the instructions [here](#temporary-block-to-make-loki-work-now)! ### Storage configuration + ```hcl # Storage ## Filestore - recommended @@ -158,14 +165,16 @@ Check [here](#accessing-storage) how to access storage in K8S. 
### Connect to the cluster Show cluster information: - ```bash - kubectl cluster-info - ``` + +```bash +kubectl cluster-info +``` Get pods: - ```bash - kubectl get pods -A - ``` + +```bash +kubectl get pods -A +``` ## Observability From 17f82f789cff28fd6eb45a194d6e54027c98f849 Mon Sep 17 00:00:00 2001 From: Irakliy Glunchadze <28791638+iglunchadze@users.noreply.github.com> Date: Mon, 28 Oct 2024 10:18:54 +0100 Subject: [PATCH 02/32] Update wording --- k8s-training/README.md | 205 +++++++++++++++++++++-------------------- 1 file changed, 104 insertions(+), 101 deletions(-) diff --git a/k8s-training/README.md b/k8s-training/README.md index ced07515..d4e994da 100644 --- a/k8s-training/README.md +++ b/k8s-training/README.md @@ -1,9 +1,9 @@ -# Kubernetes for Training in Nebius AI +# Kubernetes for training in Nebius AI ## Features - Creating a Kubernetes cluster with CPU and GPU nodes. -- Installing the necessary [Nvidia Gpu Operator](https://github.com/NVIDIA/gpu-operator) +- Installing the required [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) and [Network Operator](https://docs.nvidia.com/networking/display/cokan10/network+operator) for running GPU workloads.- Installing [Grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana). - Installing [Prometheus](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus). @@ -29,12 +29,12 @@ source ~/.bashrc ``` -3. [Configure](https://docs.nebius.ai/cli/configure/) Nebius CLI (it's recommended to use [service account](https://docs.nebius.ai/iam/service-accounts/manage/) for configuration): +3. [Configure](https://docs.nebius.ai/cli/configure/) Nebius CLI (we recommend using a [service account](https://docs.nebius.ai/iam/service-accounts/manage/) for configuration): ```bash nebius init ``` -3. Install JQuery (example for Debian based distros): +4. 
Install jq (example for Debian-based distros): ```bash sudo apt install jq -y ``` @@ -52,8 +52,8 @@ Follow these steps to deploy the Kubernetes cluster: terraform init ``` 3. Replace the placeholder content - in `terraform.tfvars` with actual configuration values to fit your specific - requirements. See the details [bellow](#configuration-variables). + in `terraform.tfvars` with configuration values that meet your specific + requirements. See the details [below](#configuration-variables). 4. Preview the deployment plan: ```bash terraform plan @@ -64,22 +64,22 @@ Follow these steps to deploy the Kubernetes cluster: ``` Wait for the operation to complete. -## Configuration Variables +## Configuration variables -These are the basic configurations needed to deploy Kubernetes for Training in Nebius AI. Edit in the configurations that you need in the file `terraform.tfvars`. +These are the basic configurations required to deploy Kubernetes for training in Nebius AI. Edit the configurations as necessary in the `terraform.tfvars` file. -There are additional configurable variables in `variables.tf`. +Additional configurable variables can be found in the `variables.tf` file. 
### Environment and network variables ```hcl # Cloud environment and network parent_id = "" # The project-id in this context -subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +subnet_id = "" # Run the `nebius vpc v1alpha1 network list` command to see the subnet id ssh_user_name = "" # Username you want to use to connect to the nodes ssh_public_key = { - key = "put your public ssh key here" OR - path = "put path to ssh key here" + key = "Enter your public SSH key here" OR + path = "Enter the path to your SSH key here" } ``` @@ -88,9 +88,9 @@ ssh_public_key = { ```hcl # K8s modes cpu_nodes_count = 1 # Number of CPU nodes -cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset +cpu_nodes_preset = "16vcpu-64gb" # CPU node preset gpu_nodes_count = 1 # Number of GPU nodes -gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Set to "1gpu-16vcpu-200gb", to deploy nodes with 8 GPUs. +gpu_nodes_preset = "8gpu-128vcpu-1600gb" # GPU node preset. Set the value to "1gpu-16vcpu-200gb" to deploy nodes with 1 GPU. ``` ### Observability options @@ -100,16 +100,16 @@ gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Set to "1gpu-16v enable_grafana = true # Enable or disable Grafana deployment with true or false enable_prometheus = true # Enable or disable Prometheus deployment with true or false enable_loki = true # Enable or disable Loki deployment with true or false -enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false +enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment using true or false ## Loki -loki_access_key_id = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki. -loki_secret_key = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki. +loki_access_key_id = "" # See README.md for instructions. 
Leave empty if you are not deploying Loki. +loki_secret_key = "" # See the instruction in README.md on how to create this. If you are not deploying Loki, leave it empty. ``` -Check the details below for more information on [Grafana](#grafana), [Prometheus](#prometheus), [Loki](#temporary-block-to-make-loki-work-now) and [NVIDIA DCGM](#nvidia-dcgm-exporter-dashboard-and-alerting). +See the details below for more information on [Grafana](#grafana), [Prometheus](#prometheus), [Loki](#temporary-block-to-make-loki-work-now) and [NVIDIA DCGM](#nvidia-dcgm-exporter-dashboard-and-alerting). -> Deploying Loki will require you to create a service account! Please check the instructions [here](#temporary-block-to-make-loki-work-now)! +> To deploy Loki, you will need to create a service account. See the instructions [here](#temporary-block-to-make-loki-work-now). ### Storage configuration @@ -117,53 +117,60 @@ Check the details below for more information on [Grafana](#grafana), [Prometheus # Storage ## Filestore - recommended enable_filestore = true # Enable or disable Filestore integration with true or false -filestore_disk_size = 100 * (1024 * 1024 * 1024) #Set Filestore disk size in bytes. The multiplication makes it easier to set the size in GB. This would set the size as 100GB -filestore_block_size = 4096 # Set Filestore block size in bytes +filestore_disk_size = 100 * (1024 * 1024 * 1024) #Set the Filestore disk size in bytes. The multiplication makes it easier to set the size in GB, giving you a total of 100 GB +filestore_block_size = 4096 # Set the Filestore block size in bytes ## GlusterFS - legacy enable_glusterfs = false # Enable or disable GlusterFS integration with true or false -glusterfs_storage_nodes = 3 # Set amount of storage nodes in GlusterFS cluster -glusterfs_disk_count_per_vm = 2 # Set amount of disks per storage node in GlusterFS cluster -glusterfs_disk_size = 100 * (1024 * 1024 * 1024) #Set disk size in bytes. 
The multiplication makes it easier to set the size in GB. This would set the size as 100GB +glusterfs_storage_nodes = 3 # Set the number of storage nodes in the GlusterFS cluster +glusterfs_disk_count_per_vm = 2 # Set the number of disks per storage node in the GlusterFS cluster +glusterfs_disk_size = 100 * (1024 * 1024 * 1024) #Set the disk size in bytes. The multiplication makes it easier to set the size in GB, giving you a total of 100 GB. ``` -There are two options available for adding external storage to k8s clusters: +There are two ways to add external storage to K8s clusters: - Filestore (recommended, enabled by default) - GlusterFS (legacy) -Both would allow creating a Read-Write-Many HostPath PVCs in k8s cluster. Path for Filestore is `/mnt/filestore`, for -GlusterFS it is `/mnt/glusterfs`. +Both options allow you to create a Read-Write-Many HostPath PVCs in a K8s cluster. Use the following paths: `/mnt/filestore` for Filestore, `/mnt/glusterfs` for +GlusterFS. -Check [here](#accessing-storage) how to access storage in K8S. +For more information on how to access storage in K8s, refer [here](#accessing-storage). ## Connecting to the cluster -### Prepare your environment -* Install kubectl ([instructions](https://kubernetes.io/docs/tasks/tools/#kubectl)) -* Install Nebius AI CLI ([instructions](https://docs.nebius.ai/cli/install)) - also required for deploying the cluster -* Install JQ ([instructions](https://jqlang.github.io/jq/download/)) - also required for deploying the cluster +### Preparing the environment + +- Install kubectl ([instructions](https://kubernetes.io/docs/tasks/tools/#kubectl)) +- Install the Nebius AI CLI ([instructions](https://docs.nebius.ai/cli/install)) +- Install jq ([instructions](https://jqlang.github.io/jq/download/)) + +### Adding credentials to the kubectl configuration file + +1. 
Perform the following command from the terraform deployment folder: + +```bash +nebius mk8s v1 cluster get-credentials --id $(cat terraform.tfstate | jq -r '.resources[] | select(.type == "nebius_mk8s_v1_cluster") | .instances[].attributes.id') --external +``` -### Add credentials to the kubectl configuration file -1. Perform this command from the terraform deployment folder: - ```bash - nebius mk8s v1 cluster get-credentials --id $(cat terraform.tfstate | jq -r '.resources[] | select(.type == "nebius_mk8s_v1_cluster") | .instances[].attributes.id') --external - ``` 2. Verify the kubectl configuration after adding the credentials: - ```bash - kubectl config view - ``` - - The output should resemble:: - ```bash - apiVersion: v1 - clusters: - - cluster: - certificate-authority-data: DATA+OMITTED - ... - ``` - -### Connect to the cluster + +```bash +kubectl config view +``` + +The output should look like this: + +```bash +apiVersion: v1 +clusters: + - cluster: + certificate-authority-data: DATA+OMITTED +... +``` + +### Connecting to the cluster + Show cluster information: ```bash @@ -178,7 +185,7 @@ kubectl get pods -A ## Observability -Observability stack is enabled by default. It consist of the following: +Observability stack is enabled by default. It includes the following components: - Grafana - Prometheus @@ -186,83 +193,82 @@ Observability stack is enabled by default. It consist of the following: ### Grafana -Could be disabled by setting follwing in set `enable_grafana` variable to `false` in terraform.tfvars` file. +To disable it, set the `enable_grafana` variable to `false` in the `terraform.tfvars` file. To access Grafana: -1. **Port-Forward to the Grafana Service:** Run the following command to port-forward to the Grafana service: +1. **Port-forward to the Grafana service:** Run the following command to port-forward to the Grafana service: ```sh kubectl --namespace o11y port-forward service/grafana 8080:80 ``` -2. 
**Access Grafana Dashboard:** Open your browser and navigate to `http://localhost:8080`. +2. **Access the Grafana dashboard:** Open your browser and go to `http://localhost:8080`. -3. **Log In:** Use the default credentials to log in: - - **Username:** `admin` - - **Password:** `admin` +3. **Log in:** Use the default credentials to log in: + - **Username:** `admin` + - **Password:** `admin` -### Log Aggregation +### Log aggregation -#### Temporary block to make Loki work now +#### Create a temporary block to enable Loki -1. Create an SA - 2. `nebius iam service-account create --parent-id --name `. -2. Add SA to editors group. - 3. Get your tenant id with `nebius iam whoami`. - 4. Get the `editors` group id with: `nebius iam group list --parent-id | grep -n5 "name: editors"`. - 3. List all members of the `editors` group - with `nebius iam group-membership list-members --parent-id `. - 4. Add your SA to the `editors` group - with `nebius iam group-membership create --parent-id --member-id ` +1. Create a SA + 2. `nebius iam service-account create --parent-id --name `. +2. Add an SA to editors group. + 3. Get your tenant id using `nebius iam whoami`. + 4. Get the `editors` group id using `nebius iam group list --parent-id | grep -n5 "name: editors"`. + 3. List all members of the `editors` group + with `nebius iam group-membership list-members --parent-id `. + 4. Add your SA to the `editors` group + with `nebius iam group-membership create --parent-id --member-id ` 3. Create access key and get its credentials: - 4. `nebius iam access-key create --account-service-account-id --description 'AWS CLI' --format json` - 5. `nebius iam access-key get-by-aws-id --aws-access-key-id --view secret --format json` -4. Update `loki_access_key_id` and `loki_secret_key` in `terraform.tfvars` with info from the last command. + 4. `nebius iam access-key create --account-service-account-id --description 'AWS CLI' --format json` + 5. 
`nebius iam access-key get-by-aws-id --aws-access-key-id --view secret --format json` +4. Update `loki_access_key_id` and `loki_secret_key` in `terraform.tfvars` with the result of the previous command. -Log aggregation with the Loki is enabled by default. If you need to disable it, set `enable_loki` variable to `false` in +Log aggregation with Loki is enabled by default. If you want to disable it, set the `enable_loki` variable to `false` in the `terraform.tfvars` file. -To access logs navigate to Loki dashboard `http://localhost:8080/d/o6-BGgnnk/loki-kubernetes-logs` +To access logs, go to the Loki dashboard `http://localhost:8080/d/o6-BGgnnk/loki-kubernetes-logs`. -**NB!** You would have to manually clean loki bucket before doing `terraform destroy` +**NB!** You will have to manually clean the Loki bucket before performing the `terraform destroy` command. ### Prometheus -Prometheus server is enabled by default. If you need to disable it, set `enable_prometheus` variable to `false` in -terraform.tfvars` file. -Because `DCGM exporter` uses Prometheus as a datasource it will be disabled as well. +Prometheus server is enabled by default. If you want to disable it, set the `enable_prometheus` variable to `false` in the `terraform.tfvars` file. +Because `DCGM exporter` uses Prometheus as a data source it will also be disabled. -To access logs navigate to Node exporter folder `http://localhost:8080/f/e6acfbcb-6f13-4a58-8e02-f780811a2404/` +To access logs, go to the Node exporter folder `http://localhost:8080/f/e6acfbcb-6f13-4a58-8e02-f780811a2404/` ### NVIDIA DCGM Exporter Dashboard and Alerting -NVIDIA DCGM Exporter Dashboard and Alerting rules are enabled by default. If you need to disable it, set `enable_dcgm` -variable to `false` in terraform.tfvars` file. +NVIDIA DCGM Exporter Dashboard and Alerting rules are enabled by default. If you need to disable it, set the `enable_dcgm` variable to `false` in the `terraform.tfvars` file. 
-By default Alerting rules are created for node groups that has GPUs. +Alerting rules are created for node groups with GPUs by default. -To access NVIDIA DCGM Exporter Dashboard `http://localhost:8080/d/Oxed_c6Wz/nvidia-dcgm-exporter-dashboard` +To access the NVIDIA DCGM Exporter dashboard, go to `http://localhost:8080/d/Oxed_c6Wz/nvidia-dcgm-exporter-dashboard` ### Alerting -To enable alert messages for Slack please refer -this [article](https://grafana.com/docs/grafana/latest/alerting/configure-notifications/manage-contact-points/integrations/configure-slack/) +To enable alert messages for Slack, refer to this [article](https://grafana.com/docs/grafana/latest/alerting/configure-notifications/manage-contact-points/integrations/configure-slack/) -## Accessing Storage +## Accessing storage ### Prerequisites: -1. To use csi-driver, it's mandatory to set 'enable_filestore = true' in terraform.tfvars file. -2. Then, the helm release managing this csi-driver is deployed in helm.tf file by applying the module: "csi-mounted-fs-path". -3. Keep in mind that 'csi-mounted-fs-path' module is applying only while instances are in boot process, using the following /nebius-solution-library/modules/cloud-init/k8s-cloud-init.tftpl commands: + +1. To use csi-driver, you must set 'enable_filestore = true' in the `terraform.tfvars` file. +2. Deploy the helm release that manages this csi-driver in the `helm.tf` file by applying the "csi-mounted-fs-path" module. +3. Keep in mind that the 'csi-mounted-fs-path' module can only be applied while instances are booting, using the following /nebius-solution-library/modules/cloud-init/k8s-cloud-init.tftpl commands: ```shell - sudo mkdir -p /mnt/data - sudo mount -t virtiofs data /mnt/data - echo data /mnt/data \"virtiofs\" \"defaults\" \"0\" \"2\" | sudo tee -a /etc/fstab" ``` -### Using mounted storageclass -Using mounted storage requires manually creating Persistent Volumes. Bellow is a template for creating PV and PVC. 
-Replace `` and `` variables with actual values. +### Using mounted StorageClass + +To use mounted storage, you need to manually create Persistent Volumes (PVs). Use the template below to create a PV and PVC. +Replace `` and `` variables with your specific values. ```yaml kind: PersistentVolume @@ -293,15 +299,12 @@ spec: storage: "" ``` - CSI limitations: -limitations of CSI over mounted FS -FS should be mounted to all NodeGroups, because PV attachmend to pod runniing on Node without FS will fail -One PV may fill up to all common FS size -FS size will not be autoupdated if PV size exceed it spec size -FS size for now can't be updated through API, only through NEBOPS. (thread) +the limitations of CSI over mounted FS: +FS should be mounted to all NodeGroups because a PV attached to a pod running on a node without FS will fail +A single PV can storage up to all common FS sizes +If the PV size exceeds the specified size, the FS size will not be updated automatically +For now the FS size can only be updated via NEBOPS, not the API. (thread) volumeMode: Block - is not possible -Good to know: -read-write many mode PV will work -MSP started testing that solution to enable early integration with mk8s. Hope they will bring feedback soon. +Note: The read-write-many mode PV is already operational. MSP is testing this solution to enable early integration with mk8s. We hope they will give us feedback soon. From 0e47bb9e0cb56ad2d9539e5509b6f917a839a01d Mon Sep 17 00:00:00 2001 From: Irakliy Glunchadze <28791638+iglunchadze@users.noreply.github.com> Date: Mon, 28 Oct 2024 10:20:34 +0100 Subject: [PATCH 03/32] Remove paragraphs from end --- k8s-training/README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/k8s-training/README.md b/k8s-training/README.md index d4e994da..9e171431 100644 --- a/k8s-training/README.md +++ b/k8s-training/README.md @@ -3,7 +3,7 @@ ## Features - Creating a Kubernetes cluster with CPU and GPU nodes. 
-- Installing the required [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) +- Installing the required [Nvidia Gpu Operator](https://github.com/NVIDIA/gpu-operator) and [Network Operator](https://docs.nvidia.com/networking/display/cokan10/network+operator) for running GPU workloads.- Installing [Grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana). - Installing [Prometheus](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus). @@ -299,12 +299,3 @@ spec: storage: "" ``` -CSI limitations: -the limitations of CSI over mounted FS: -FS should be mounted to all NodeGroups because a PV attached to a pod running on a node without FS will fail -A single PV can storage up to all common FS sizes -If the PV size exceeds the specified size, the FS size will not be updated automatically -For now the FS size can only be updated via NEBOPS, not the API. (thread) -volumeMode: Block - is not possible - -Note: The read-write-many mode PV is already operational. MSP is testing this solution to enable early integration with mk8s. We hope they will give us feedback soon. From 08f669c671312a8bc02fe237f45c0100dc58cdbd Mon Sep 17 00:00:00 2001 From: Cyril Kondratenko Date: Wed, 6 Nov 2024 14:46:09 +0100 Subject: [PATCH 04/32] fix public ip allocation for gpu nodes --- k8s-inference/main.tf | 2 +- k8s-training/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s-inference/main.tf b/k8s-inference/main.tf index 0f9942ef..52041518 100644 --- a/k8s-inference/main.tf +++ b/k8s-inference/main.tf @@ -69,7 +69,7 @@ resource "nebius_mk8s_v1_node_group" "gpu" { network_interfaces = [ { subnet_id = var.subnet_id - public_ip = var.gpu_nodes_assign_public_ip ? {} : null + public_ip_address = var.gpu_nodes_assign_public_ip ? 
{} : null } ] resources = { diff --git a/k8s-training/main.tf b/k8s-training/main.tf index fa140d14..e877161f 100644 --- a/k8s-training/main.tf +++ b/k8s-training/main.tf @@ -69,7 +69,7 @@ resource "nebius_mk8s_v1_node_group" "gpu" { network_interfaces = [ { subnet_id = var.subnet_id - public_ip = var.gpu_nodes_assign_public_ip ? {} : null + public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null } ] resources = { From 857cb834995a0a903fbc3b80dca173d10449c1f6 Mon Sep 17 00:00:00 2001 From: Idan Belisha Date: Wed, 13 Nov 2024 19:00:59 +0200 Subject: [PATCH 05/32] Adding csi-driver-mounted-fs solution to support k8s-inference + updating README.md for k8s-training and k8s-inference for well known limitations and csi-driver how-to-use guide. --- k8s-inference/README.md | 17 ++++++++++++++--- k8s-inference/helm.tf | 5 +++++ k8s-training/README.md | 21 ++++++++++----------- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/k8s-inference/README.md b/k8s-inference/README.md index 020f9941..a57220b4 100644 --- a/k8s-inference/README.md +++ b/k8s-inference/README.md @@ -266,13 +266,13 @@ apiVersion: v1 metadata: name: external-storage-persistent-volume spec: - storageClassName: hostpath + storageClassName: csi-mounted-fs-path-sc capacity: storage: "" accessModes: - ReadWriteMany hostPath: - path: "" # "/mnt/filestore/" or "/mnt/glusterfs/" + path: "" # "/mnt/data/" or "/mnt/glusterfs/" --- @@ -281,10 +281,21 @@ apiVersion: v1 metadata: name: external-storage-persistent-volumeclaim spec: - storageClassName: hostpath + storageClassName: csi-mounted-fs-path-sc accessModes: - ReadWriteMany resources: requests: storage: "" ``` + +## CSI limitations: +- FS should be mounted to all NodeGroups, because PV attachment to pod running on Node without FS will fail +- One PV may fill up to all common FS size +- FS size will not be autoupdated if PV size exceeds its spec size +- FS size for now can't be updated through API, only through NEBOPS. 
(thread) +- volumeMode: Block - is not possible + +## Good to know: +- read-write many mode PV will work +- MSP started testing that solution to enable early integration with mk8s. diff --git a/k8s-inference/helm.tf b/k8s-inference/helm.tf index 283ea517..41ca13e2 100644 --- a/k8s-inference/helm.tf +++ b/k8s-inference/helm.tf @@ -39,3 +39,8 @@ module "o11y" { } test_mode = var.test_mode } + +module "csi-mounted-fs-path" { + source = "../modules/csi-mounted-fs-path" + count = var.enable_filestore ? 1 : 0 +} diff --git a/k8s-training/README.md b/k8s-training/README.md index 711f73dc..c359e6db 100644 --- a/k8s-training/README.md +++ b/k8s-training/README.md @@ -288,14 +288,13 @@ spec: ``` -CSI limitations: -limitations of CSI over mounted FS -FS should be mounted to all NodeGroups, because PV attachmend to pod runniing on Node without FS will fail -One PV may fill up to all common FS size -FS size will not be autoupdated if PV size exceed it spec size -FS size for now can't be updated through API, only through NEBOPS. (thread) -volumeMode: Block - is not possible - -Good to know: -read-write many mode PV will work -MSP started testing that solution to enable early integration with mk8s. Hope they will bring feedback soon. +## CSI limitations: +- FS should be mounted to all NodeGroups, because PV attachment to pod running on Node without FS will fail +- One PV may fill up to all common FS size +- FS size will not be autoupdated if PV size exceeds its spec size +- FS size for now can't be updated through API, only through NEBOPS. (thread) +- volumeMode: Block - is not possible + +## Good to know: +- read-write many mode PV will work +- MSP started testing that solution to enable early integration with mk8s. 
From bd44e6c1d0406aca9812569574a0bfc99e62de2e Mon Sep 17 00:00:00 2001 From: rdjjke Date: Thu, 14 Nov 2024 14:00:07 +0000 Subject: [PATCH 06/32] MSP-3313: Add NSYS profiling in GPT3 mlperf implementation --- .../mlperf/gpt3-impl-4.0-nvidia/Dockerfile | 15 +++++++++++ soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION | 2 +- soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub | 1 + .../mlperf/gpt3-impl-4.0-nvidia/start.sh | 25 ++++++++++++++++--- 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile b/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile index 9d1f22ab..11377040 100644 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile @@ -121,6 +121,21 @@ RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \ RUN pip install huggingface_hub==0.23.2 RUN pip install -v "transformers<=4.40.2" +## Reinstall NCCL to the latest version +#RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +#RUN dpkg -i cuda-keyring_1.1-1_all.deb +#RUN apt-get update +#RUN apt install libnccl2=2.23.4-1+cuda12.4 libnccl-dev=2.23.4-1+cuda12.4 + +## Install NCCL profiler plugin +#RUN git clone https://github.com/NVIDIA/nccl && \ +# cd nccl && \ +# git checkout v2.23.4-1 && \ +# cd ext-profiler/example && \ +# make && \ +# cp libnccl-profiler.so /usr/lib/x86_64-linux-gnu/ + + # Benchmark code WORKDIR /workspace/llm diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION b/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION index 9c38d380..e2c2ff71 100644 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION @@ -1 +1 @@ -4.0-16 +4.0-20 diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub b/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub index b577694f..43b59658 100755 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub @@ -235,6 +235,7 @@ 
cleanup_preload_shared() { if [ -n "${CONTAINER_PRELOAD_SHARED_PATH}" ]; then CONT_FILE="${CONTAINER_PRELOAD_SHARED_PATH}/containers/${SLURM_JOBID}_$(basename ${CONT}).squashfs" # Prepull container image to the shared filesystem + mkdir -p "${CONTAINER_PRELOAD_SHARED_PATH}/containers" srun --ntasks=1 enroot import --output ${CONT_FILE} docker://${CONT} else CONT_FILE=${CONT} diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh index 7b66f445..acd5bafc 100755 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh @@ -6,7 +6,7 @@ usage() { echo "usage: ${0} -N [-w ] [-c ]" >&2 echo " [-e ]" >&2 echo " [-i ] [-D ] [-C ] [-R ] [-S ]" >&2 - echo " [-q (quick_start)] [-r (remove_prev_logs)] [-d (debug)] [-h (help)]" >&2 + echo " [-q (quick_start)] [-r (remove_prev_logs)] [-d (debug)] [-p (nsys_profiling)] [-h (help)]" >&2 exit 1 } @@ -16,7 +16,7 @@ dataset_dir="/mlperf-data/gpt3-dataset-4.0" checkpoint_dir="/mlperf-data/gpt3-checkpoint-4.0" result_dir="./result" -while getopts N:w:c:e:i:D:C:R:S:qrdh flag +while getopts N:w:c:e:i:D:C:R:S:qrdph flag do case "${flag}" in N) nodes=${OPTARG};; @@ -31,6 +31,7 @@ do q) quick_start=1;; r) rmlogs=1;; d) debug=1;; + p) nsys_profiling=1;; h) usage;; *) usage;; esac @@ -116,6 +117,18 @@ if [[ $debug -eq 1 ]]; then export GDRCOPY_LOG_LEVEL=1 fi +if [[ $nsys_profiling -eq 1 ]]; then + # Configure NSYS profiler + export NVTX_FLAG=1 + export PROFILE=True + export PROFILE_START_STEP=10 + export PROFILE_END_STEP=11 + export PROFILE_RANKS="0,1,2,3,4,5,6,7" + + # Early stopping: + export TARGET_LOG_PPL=2.75 +fi + if [ -z "${experiment}" ]; then job_name="gpt3" job_output="gpt3-%j.out" @@ -124,14 +137,18 @@ else job_output="gpt3-%j-${experiment}.out" fi +node_allocation="--nodes=${nodes}" +if [ -n "${nodelist}" ]; then + node_allocation="--nodelist='${nodelist}'" +fi + echo "Submit Slurm job" sbatch \ -t $WALLTIME \ -J "${job_name}" \ 
--output="${job_output}" \ --export=ALL \ - --nodes="${nodes}" \ - --nodelist="${nodelist}" \ + ${node_allocation} \ --ntasks-per-node="${SBATCH_GPUS_PER_NODE}" \ ${EXCLUSIVE:+--exclusive} \ run.sub From 197a5f5aa6f305b9579bbfdad06297bd173064c3 Mon Sep 17 00:00:00 2001 From: rdjjke Date: Sun, 17 Nov 2024 17:27:07 +0000 Subject: [PATCH 07/32] Add configs for H200 nodes to GPT3 impl --- ...ODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh | 86 +++++++++++++++++++ .../config_H200x8_NODEx64_default.sh | 1 + ...NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh | 86 +++++++++++++++++++ ...ODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh | 86 +++++++++++++++++++ .../config_H200x8_NODEx8_default.sh | 1 + 5 files changed, 260 insertions(+) create mode 100644 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh create mode 120000 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh create mode 100644 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh create mode 100644 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh create mode 120000 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh new file mode 100644 index 00000000..fb48cf75 --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# DL params +export DGXNNODES="${DGXNNODES:=64}" # NODEx64 +export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=2}" # TPx2 (training.model.tensor_model_parallel_size) +export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size) +export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4 
+export MINIBS="${MINIBS:=128}" # MINBSx128 +export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}" # MICBSx2 + +# Check DL params +# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0 +# This simplifies to MINIBS % PP == 0 +if [[ $(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then + echo "MINIBS should be divisble by PP" + exit 1 +fi + + + +# Slurm resource allocation +export SBATCH_GPUS_PER_NODE="8" +export SBATCH_MEM_PER_NODE="1200G" +export SBATCH_TRES_PER_TASK="cpu=16" +export SBATCH_DISTRIBUTION="block:block:block" +export SLURM_CPU_BIND="verbose,none" +#export EXCLUSIVE=1 + +# Use bindpcie CPU pinning +export ENABLE_CPU_EXCLUSIVE=1 +export ENABLE_IB_BINDING=1 + + + + +# Job time limit +export WALLTIME_MINUTES=1200 +export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) )) + + + +# Use of userbuffer backend to overlap tensor-parallel communications with computes (training.model.ub_tp_comm_overlap). +export TP_COMM_OVERLAP=True + +# Execute of nvidia-smi boost-slider --vboost +export VBOOST_VALUE=1 + +# Set MaxQ and MinEDP clocks +export SET_MAXQ_CLK=0 +export MAXQ_CLK="" +export SET_MINEDP_CLK=0 +export MINEDP_CLK="" + +# Set power limit +export SET_POWER_CAP=0 +export POWER_CAP="" + +# Use CPU offloading (activations & weights). 
+export CPU_OFFLOADING=False + +# Load the minimal number of samples +export LOAD_MINIMAL_NUM_SAMPLES=0 + +# Load distributed checkpoint directly on GPU +export LOAD_DIRECTLY_ON_DEVICE=0 + + + +# Extract system name +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) + + + +# Configure mlperf SYSJSON logging +export MLPERF_SUBMITTER="Nebius" +export MLPERF_SYSTEM_NAME="${DGXSYSTEM}" +export MLPERF_STATUS="cloud" + + + +# Apply common settings +source $(dirname ${BASH_SOURCE[0]})/config_common.sh + +# Apply FP8 settings +source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh + diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh new file mode 120000 index 00000000..bbe6159f --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh @@ -0,0 +1 @@ +config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh \ No newline at end of file diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh new file mode 100644 index 00000000..b006bdd6 --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# DL params +export DGXNNODES="${DGXNNODES:=8}" # NODEx8 +export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=4}" # TPx4 (training.model.tensor_model_parallel_size) +export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size) +export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4 +export MINIBS="${MINIBS:=128}" # MINBSx128 +export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}" # MICBSx2 + +# Check DL params +# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0 +# This simplifies to MINIBS % PP == 0 +if [[ 
$(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then + echo "MINIBS should be divisble by PP" + exit 1 +fi + + + +# Slurm resource allocation +export SBATCH_GPUS_PER_NODE="8" +export SBATCH_MEM_PER_NODE="1200G" +export SBATCH_TRES_PER_TASK="cpu=16" +export SBATCH_DISTRIBUTION="block:block:block" +export SLURM_CPU_BIND="verbose,none" +#export EXCLUSIVE=1 + +# Use bindpcie CPU pinning +export ENABLE_CPU_EXCLUSIVE=1 +export ENABLE_IB_BINDING=1 + + + + +# Job time limit +export WALLTIME_MINUTES=1200 +export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) )) + + + +# Use of userbuffer backend to overlap tensor-parallel communications with computes (training.model.ub_tp_comm_overlap). +export TP_COMM_OVERLAP=True + +# Execute of nvidia-smi boost-slider --vboost +export VBOOST_VALUE=1 + +# Set MaxQ and MinEDP clocks +export SET_MAXQ_CLK=0 +export MAXQ_CLK="" +export SET_MINEDP_CLK=0 +export MINEDP_CLK="" + +# Set power limit +export SET_POWER_CAP=0 +export POWER_CAP="" + +# Use CPU offloading (activations & weights). 
+export CPU_OFFLOADING=False + +# Load the minimal number of samples +export LOAD_MINIMAL_NUM_SAMPLES=0 + +# Load distributed checkpoint directly on GPU +export LOAD_DIRECTLY_ON_DEVICE=0 + + + +# Extract system name +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) + + + +# Configure mlperf SYSJSON logging +export MLPERF_SUBMITTER="Nebius" +export MLPERF_SYSTEM_NAME="${DGXSYSTEM}" +export MLPERF_STATUS="cloud" + + + +# Apply common settings +source $(dirname ${BASH_SOURCE[0]})/config_common.sh + +# Apply FP8 settings +source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh + diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh new file mode 100644 index 00000000..c8109ba5 --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# DL params +export DGXNNODES="${DGXNNODES:=8}" # NODEx8 +export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=8}" # TPx8 (training.model.tensor_model_parallel_size) +export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size) +export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4 +export MINIBS="${MINIBS:=3072}" # MINBSx3072 +export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=1}" # MICBSx1 + +# Check DL params +# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0 +# This simplifies to MINIBS % PP == 0 +if [[ $(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then + echo "MINIBS should be divisble by PP" + exit 1 +fi + + + +# Slurm resource allocation +export SBATCH_GPUS_PER_NODE="8" +export SBATCH_MEM_PER_NODE="1200G" +export SBATCH_TRES_PER_TASK="cpu=16" +export SBATCH_DISTRIBUTION="block:block:block" +export SLURM_CPU_BIND="verbose,none" +#export EXCLUSIVE=1 + +# Use bindpcie CPU pinning +export 
ENABLE_CPU_EXCLUSIVE=1 +export ENABLE_IB_BINDING=1 + + + + +# Job time limit +export WALLTIME_MINUTES=1200 +export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) )) + + + +# Use of userbuffer backend to overlap tensor-parallel communications with computes (training.model.ub_tp_comm_overlap). +export TP_COMM_OVERLAP=True + +# Execute of nvidia-smi boost-slider --vboost +export VBOOST_VALUE=1 + +# Set MaxQ and MinEDP clocks +export SET_MAXQ_CLK=0 +export MAXQ_CLK="" +export SET_MINEDP_CLK=0 +export MINEDP_CLK="" + +# Set power limit +export SET_POWER_CAP=0 +export POWER_CAP="" + +# Use CPU offloading (activations & weights). +export CPU_OFFLOADING=False + +# Load the minimal number of samples +export LOAD_MINIMAL_NUM_SAMPLES=0 + +# Load distributed checkpoint directly on GPU +export LOAD_DIRECTLY_ON_DEVICE=0 + + + +# Extract system name +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) + + + +# Configure mlperf SYSJSON logging +export MLPERF_SUBMITTER="Nebius" +export MLPERF_SYSTEM_NAME="${DGXSYSTEM}" +export MLPERF_STATUS="cloud" + + + +# Apply common settings +source $(dirname ${BASH_SOURCE[0]})/config_common.sh + +# Apply FP8 settings +source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh + diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh new file mode 120000 index 00000000..4fef3305 --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh @@ -0,0 +1 @@ +config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh \ No newline at end of file From f83971bd7127a733c61192db5a614c7932b49005 Mon Sep 17 00:00:00 2001 From: Pavel Sofronii Date: Wed, 20 Nov 2024 15:51:03 +0100 Subject: [PATCH 08/32] nccl_use_infiniband true and nccl_benchmark_min_threshold 45 --- soperator/installations/example/main.tf | 1 + soperator/installations/example/terraform.tfvars | 4 ++-- 
soperator/installations/example/variables.tf | 10 ++++++++-- soperator/modules/slurm/main.tf | 7 ++++--- .../templates/helm_values/slurm_cluster.yaml.tftpl | 1 + soperator/modules/slurm/variables.tf | 8 +++++++- 6 files changed, 23 insertions(+), 8 deletions(-) diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf index a6d3993a..59c57207 100644 --- a/soperator/installations/example/main.tf +++ b/soperator/installations/example/main.tf @@ -255,6 +255,7 @@ module "slurm" { nccl_benchmark_enable = var.nccl_benchmark_enable nccl_benchmark_schedule = var.nccl_benchmark_schedule nccl_benchmark_min_threshold = var.nccl_benchmark_min_threshold + nccl_use_infiniband = var.nccl_use_infiniband telemetry_enabled = var.telemetry_enabled telemetry_grafana_admin_password = var.telemetry_grafana_admin_password diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index 408ac7b5..77b4ed8f 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -357,9 +357,9 @@ slurm_shared_memory_size_gibibytes = 256 # nccl_benchmark_enable = "0 */3 * * *" # Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable. -# By default, 420. +# By default, 45. # --- -# nccl_benchmark_min_threshold = 420 +# nccl_benchmark_min_threshold = 45 # endregion NCCL benchmark diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf index b6fa95f5..b1a9a480 100644 --- a/soperator/installations/example/variables.tf +++ b/soperator/installations/example/variables.tf @@ -504,10 +504,16 @@ variable "nccl_benchmark_schedule" { variable "nccl_benchmark_min_threshold" { description = "Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable." 
type = number - default = 420 + default = 45 } -# region NCCL benchmark +variable "nccl_use_infiniband" { + description = "Use infiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test." + type = bool + default = true +} + +# endregion NCCL benchmark # region Telemetry diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf index 02670d78..d52d99e6 100644 --- a/soperator/modules/slurm/main.tf +++ b/soperator/modules/slurm/main.tf @@ -163,9 +163,10 @@ resource "helm_release" "slurm_cluster" { nccl_topology_type = var.nccl_topology_type nccl_benchmark = { - enable = var.nccl_benchmark_enable - schedule = var.nccl_benchmark_schedule - min_threshold = var.nccl_benchmark_min_threshold + enable = var.nccl_benchmark_enable + schedule = var.nccl_benchmark_schedule + min_threshold = var.nccl_benchmark_min_threshold + use_infiniband = var.nccl_use_infiniband } nodes = { diff --git a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl index 66457052..c420d949 100644 --- a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl @@ -128,6 +128,7 @@ periodicChecks: schedule: "${nccl_benchmark.schedule}" ncclArguments: thresholdMoreThan: ${nccl_benchmark.min_threshold} + useInfiniband: ${nccl_benchmark.use_infiniband} slurmNodes: accounting: diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf index 2eef573b..c9db7d15 100644 --- a/soperator/modules/slurm/variables.tf +++ b/soperator/modules/slurm/variables.tf @@ -206,7 +206,13 @@ variable "nccl_benchmark_schedule" { variable "nccl_benchmark_min_threshold" { description = "Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable." 
type = number - default = 420 + default = 45 +} + +variable "nccl_use_infiniband" { + description = "Use infiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test." + type = bool + default = true } # endregion NCCL benchmark From 212e487dde5522c1b37e4f8e16d684e6c7e14a20 Mon Sep 17 00:00:00 2001 From: Pavel Sofronii Date: Wed, 20 Nov 2024 16:36:37 +0100 Subject: [PATCH 09/32] add nccl_use_infiniband to example --- soperator/installations/example/terraform.tfvars | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index 77b4ed8f..2355dd0a 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -361,6 +361,11 @@ slurm_shared_memory_size_gibibytes = 256 # --- # nccl_benchmark_min_threshold = 45 +# Use infiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test. 
+# By default, true +# --- +# nccl_use_infiniband = true + # endregion NCCL benchmark #----------------------------------------------------------------------------------------------------------------------# From 0ca10bf6e11c465ea764d7eb7f2ca9d05e806d55 Mon Sep 17 00:00:00 2001 From: Pavel Sofronii Date: Wed, 20 Nov 2024 17:02:04 +0100 Subject: [PATCH 10/32] bump soperator 1.15.3 --- soperator/VERSION | 2 +- soperator/installations/example/terraform.tfvars | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/soperator/VERSION b/soperator/VERSION index 42cf0675..f2380cc7 100644 --- a/soperator/VERSION +++ b/soperator/VERSION @@ -1 +1 @@ -1.15.2 +1.15.3 diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index 2355dd0a..a740d787 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -168,7 +168,7 @@ slurm_cluster_name = "my-amazing-slurm" # Version of soperator. # --- -slurm_operator_version = "1.15.2" +slurm_operator_version = "1.15.3" # Type of the Slurm partition config. Could be either `default` or `custom`. # By default, "default". 
From 8791f52ef466ce1b439490fb5abfe11304f0266b Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 12:51:21 +0200 Subject: [PATCH 11/32] Platform and preset moved to variables across the library; --- k8s-inference/terraform.tfvars | 2 ++ k8s-training/terraform.tfvars | 3 +++ slurm/slurm-master.tf | 4 ++-- slurm/slurm-worker.tf | 4 ++-- slurm/terraform.tfvars | 5 +++++ slurm/variables.tf | 23 ++++++++++++++++++++--- wireguard/main.tf | 4 ++-- wireguard/terraform.tfvars | 3 +++ wireguard/variables.tf | 16 +++++++++++++++- 9 files changed, 54 insertions(+), 10 deletions(-) diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 7ff9869d..00ac6317 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -10,7 +10,9 @@ # K8s modes cpu_nodes_count = 1 # Number of CPU nodes cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset +cpu_nodes_platform = "cpu-e2" # The CPU node platform gpu_nodes_count = 1 # Number of GPU nodes +gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs. # Observability diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index 5392c93f..63eb2d40 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -9,9 +9,12 @@ # K8s modes cpu_nodes_count = 1 # Number of CPU nodes +cpu_nodes_platform = "cpu-e2" # The CPU node platform cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset gpu_nodes_count = 1 # Number of GPU nodes +gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection +infiniband_fabric = "fabric-3" # Infiniband fabric name. 
# Observability diff --git a/slurm/slurm-master.tf b/slurm/slurm-master.tf index 26f3075b..cc34894f 100644 --- a/slurm/slurm-master.tf +++ b/slurm/slurm-master.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "master" { name = "slurm-master" parent_id = var.parent_id resources = { - platform = "cpu-e2" - preset = "4vcpu-16gb" + platform = var.master_platform + preset = var.master_preset } boot_disk = { attach_mode = "READ_WRITE" diff --git a/slurm/slurm-worker.tf b/slurm/slurm-worker.tf index c1825c4f..21317552 100644 --- a/slurm/slurm-worker.tf +++ b/slurm/slurm-worker.tf @@ -29,8 +29,8 @@ resource "nebius_compute_v1_instance" "worker" { name = each.key parent_id = var.parent_id resources = { - platform = "gpu-h100-sxm" - preset = "8gpu-128vcpu-1600gb" + platform = var.worker_platform + preset = var.worker_preset } gpu_cluster = nebius_compute_v1_gpu_cluster.gpu-cluster-slurm diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars index 1858faef..5db8a9f0 100644 --- a/slurm/terraform.tfvars +++ b/slurm/terraform.tfvars @@ -7,3 +7,8 @@ shared_fs_type = "filesystem" # "nfs" or "filesystem" # key = "put your public ssh key here" # path = "put path to ssh key here" # } + +master_platform = "cpu-e2" +master_preset = "4vcpu-16gb" +worker_platform = "gpu-h100-sxm" +worker_preset = "8gpu-128vcpu-1600gb" \ No newline at end of file diff --git a/slurm/variables.tf b/slurm/variables.tf index 565a04fc..8a957eb7 100644 --- a/slurm/variables.tf +++ b/slurm/variables.tf @@ -29,12 +29,29 @@ variable "ssh_public_key" { } } -variable "platform_id" { +variable "master_platform" { + description = "Platform for Slurm Master." type = string - description = "Platform for workers: gpu-h100-b for Inspur or gpu-h100 for Gigabyte" - default = "gpu-h100-b" + default = "cpu-e2" } +variable "master_preset" { + description = "Preset for Slurm Master." + type = string + default = "4vcpu-16gb" +} + +variable "worker_platform" { + description = "Platform for Slurm Worker." 
+ type = string + default = "gpu-h100-sxm" +} + +variable "worker_preset" { + description = "Preset for Slurm Worker." + type = string + default = "8gpu-128vcpu-1600gb" +} variable "mysql_jobs_backend" { type = bool diff --git a/wireguard/main.tf b/wireguard/main.tf index 4bc7ba4d..ebc8e482 100644 --- a/wireguard/main.tf +++ b/wireguard/main.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "wireguard_instance" { ] resources = { - platform = "cpu-e2" - preset = "4vcpu-16gb" + platform = var.platform + preset = var.preset } diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 38986d48..76deadb8 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -6,3 +6,6 @@ # path = "put path to ssh key here" # } # public_ip_allocation_id = "" + +platform = cpu-e2 +preset = "4vcpu-16gb" \ No newline at end of file diff --git a/wireguard/variables.tf b/wireguard/variables.tf index 8d14ea3b..e4d74af9 100644 --- a/wireguard/variables.tf +++ b/wireguard/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global parameters variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,20 @@ variable "subnet_id" { type = string } + +# Platform +variable "platform" { + description = "Platform for WireGuard host." + type = string + default = "cpu-e2" +} + +variable "preset" { + description = "Preset for WireGuard host." + type = string + default = "4vcpu-16gb" +} + # SSH access variable "ssh_user_name" { description = "SSH username." 
From 04ab264986e2c20b8dd4c6eab8679bd1bc4f58bc Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 14:31:43 +0200 Subject: [PATCH 12/32] Added "region" variable to control platform defaults (k8s-inference); --- k8s-inference/README.md | 1 + k8s-inference/helm.tf | 2 +- k8s-inference/locals.tf | 22 ++++++++++++++++++++++ k8s-inference/main.tf | 8 ++++---- k8s-inference/terraform.tfvars | 7 ++----- k8s-inference/variables.tf | 16 +++++++++++----- 6 files changed, 41 insertions(+), 15 deletions(-) diff --git a/k8s-inference/README.md b/k8s-inference/README.md index a57220b4..42bfb756 100644 --- a/k8s-inference/README.md +++ b/k8s-inference/README.md @@ -75,6 +75,7 @@ There are additional configurable variables in `variables.tf`. # Cloud environment and network parent_id = "" # The project-id in this context subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +region = "" # The project region. ssh_user_name = "" # Username you want to use to connect to the nodes ssh_public_key = { key = "put your public ssh key here" OR diff --git a/k8s-inference/helm.tf b/k8s-inference/helm.tf index 41ca13e2..1e09c23b 100644 --- a/k8s-inference/helm.tf +++ b/k8s-inference/helm.tf @@ -30,7 +30,7 @@ module "o11y" { enabled = var.enable_dcgm, node_groups = { node_group_name = { - gpus = tonumber(split("gpu-", var.gpu_nodes_preset)[0]) + gpus = tonumber(split("gpu-", local.gpu_nodes_preset)[0]) instance_group_id = nebius_mk8s_v1_node_group.gpu.id } } diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf index a4fb0a61..846e8972 100644 --- a/k8s-inference/locals.tf +++ b/k8s-inference/locals.tf @@ -2,6 +2,28 @@ locals { release-suffix = random_string.random.result ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + } + eu-north1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) + gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform) + gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset) } resource "random_string" "random" { diff --git a/k8s-inference/main.tf b/k8s-inference/main.tf index 52041518..c2595ef8 100644 --- a/k8s-inference/main.tf +++ b/k8s-inference/main.tf @@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" { } ] resources = { - platform = var.cpu_nodes_platform - preset = var.cpu_nodes_preset + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } filesystems = var.enable_filestore ? [ { @@ -73,8 +73,8 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } ] resources = { - platform = var.gpu_nodes_platform - preset = var.gpu_nodes_preset + platform = local.gpu_nodes_platform + preset = local.gpu_nodes_preset } filesystems = var.enable_filestore ? 
[ { diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 00ac6317..4250a903 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -1,19 +1,16 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR # path = "put path to ssh key here" # } -# K8s modes +# K8s nodes cpu_nodes_count = 1 # Number of CPU nodes -cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset -cpu_nodes_platform = "cpu-e2" # The CPU node platform gpu_nodes_count = 1 # Number of GPU nodes -gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform -gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs. # Observability enable_grafana = true # Enable or disable Grafana deployment with true or false diff --git a/k8s-inference/variables.tf b/k8s-inference/variables.tf index ec1a994d..1087a015 100644 --- a/k8s-inference/variables.tf +++ b/k8s-inference/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,12 @@ variable "subnet_id" { type = string } +variable "region" { + description = "The current region." + type = string +} + +# K8s cluster variable "k8s_version" { description = "Kubernetes version to be used in the cluster." type = string @@ -114,13 +120,13 @@ variable "cpu_nodes_count" { variable "cpu_nodes_platform" { description = "Platform for nodes in the CPU-only node group." type = string - default = "cpu-e2" + default = null } variable "cpu_nodes_preset" { description = "CPU and RAM configuration for nodes in the CPU-only node group." 
type = string - default = "16vcpu-64gb" + default = null } variable "cpu_disk_type" { @@ -145,13 +151,13 @@ variable "gpu_nodes_count" { variable "gpu_nodes_platform" { description = "Platform for nodes in the GPU node group." type = string - default = "gpu-h100-sxm" + default = null } variable "gpu_nodes_preset" { description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group." type = string - default = "1gpu-16vcpu-200gb" + default = null } variable "gpu_disk_type" { From 5eed389c407bc23f876fff18c530abc5e4f57d60 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:12:23 +0200 Subject: [PATCH 13/32] Added "region" variable to control platform defaults (k8s-inference (2)); --- k8s-inference/locals.tf | 6 +++--- k8s-inference/terraform.tfvars | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf index 846e8972..b1c34f2c 100644 --- a/k8s-inference/locals.tf +++ b/k8s-inference/locals.tf @@ -5,13 +5,13 @@ locals { regions_default = { eu-west1 = { - cpu_nodes_platform = "cpu-e2" + cpu_nodes_platform = "cpu-d3" cpu_nodes_preset = "16vcpu-64gb" - gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_platform = "gpu-h200-sxm" gpu_nodes_preset = "1gpu-16vcpu-200gb" } eu-north1 = { - cpu_nodes_platform = "cpu-d3" + cpu_nodes_platform = "cpu-e2" cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h100-sxm" gpu_nodes_preset = "1gpu-16vcpu-200gb" diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 4250a903..7df94830 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -11,6 +11,10 @@ # K8s nodes cpu_nodes_count = 1 # Number of CPU nodes gpu_nodes_count = 1 # Number of GPU nodes +# cpu_nodes_platform = # CPU nodes platofm +# cpu_nodes_preset = # CPU nodes preset +# gpu_nodes_platform = # GPU nodes platform +# gpu_nodes_preset = # GPU nodes preset # Observability enable_grafana = true # Enable or disable 
Grafana deployment with true or false From 1984342fa7346043f78fae0446555af50abc7377 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:25:03 +0200 Subject: [PATCH 14/32] Added "region" variable to control platform defaults (k8s-inference (3)); --- k8s-inference/terraform.tfvars | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index 7df94830..c38651d3 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -1,8 +1,8 @@ # Cloud environment and network -# parent_id = "" # The project-id in this context -# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id # region = "" -# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR # path = "put path to ssh key here" From 3c8423a4be97b475abbed8e766cc5a8488c0dd49 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:25:16 +0200 Subject: [PATCH 15/32] Added "region" variable to control platform defaults (k8s-training); --- k8s-training/README.md | 1 + k8s-training/gpu_cluster.tf | 4 ++-- k8s-training/helm.tf | 2 +- k8s-training/locals.tf | 25 +++++++++++++++++++++++++ k8s-training/main.tf | 8 ++++---- k8s-training/terraform.tfvars | 17 +++++++++-------- k8s-training/variables.tf | 18 ++++++++++++------ 7 files changed, 54 insertions(+), 21 deletions(-) diff --git a/k8s-training/README.md b/k8s-training/README.md index 7de0fd2a..1c62e18a 100644 --- a/k8s-training/README.md +++ b/k8s-training/README.md @@ -84,6 +84,7 @@ Additional configurable variables can be found in the `variables.tf` file. 
# Cloud environment and network parent_id = "" # The project-id in this context subnet_id = "" # Run the `nebius vpc v1alpha1 network list` command to see the subnet id +region = "" # The project region ssh_user_name = "" # Username you want to use to connect to the nodes ssh_public_key = { key = "Enter your public SSH key here" OR diff --git a/k8s-training/gpu_cluster.tf b/k8s-training/gpu_cluster.tf index 89cce8c5..472d950f 100644 --- a/k8s-training/gpu_cluster.tf +++ b/k8s-training/gpu_cluster.tf @@ -1,5 +1,5 @@ resource "nebius_compute_v1_gpu_cluster" "fabric_2" { - infiniband_fabric = var.infiniband_fabric + infiniband_fabric = local.infiniband_fabric parent_id = var.parent_id - name = join("-", [var.infiniband_fabric, local.release-suffix]) + name = join("-", [local.infiniband_fabric, local.release-suffix]) } diff --git a/k8s-training/helm.tf b/k8s-training/helm.tf index 1bf3755f..6bceeef5 100644 --- a/k8s-training/helm.tf +++ b/k8s-training/helm.tf @@ -39,7 +39,7 @@ module "o11y" { enabled = var.enable_dcgm, node_groups = { node_group_name = { - gpus = tonumber(split("gpu-", var.gpu_nodes_preset)[0]) + gpus = tonumber(split("gpu-", local.gpu_nodes_preset)[0]) instance_group_id = nebius_mk8s_v1_node_group.gpu.id } } diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf index a4fb0a61..32dd99c5 100644 --- a/k8s-training/locals.tf +++ b/k8s-training/locals.tf @@ -2,6 +2,31 @@ locals { release-suffix = random_string.random.result ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + infiniband_fabric = "fabric-5" + } + eu-north1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "1gpu-16vcpu-200gb" + infiniband_fabric = "fabric-3" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) + gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform) + gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset) + infiniband_fabric = coalesce(var.infiniband_fabric, local.current_region_defaults.infiniband_fabric) } resource "random_string" "random" { diff --git a/k8s-training/main.tf b/k8s-training/main.tf index e877161f..a5b27e05 100644 --- a/k8s-training/main.tf +++ b/k8s-training/main.tf @@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" { } ] resources = { - platform = var.cpu_nodes_platform - preset = var.cpu_nodes_preset + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } filesystems = var.enable_filestore ? [ { @@ -73,8 +73,8 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } ] resources = { - platform = var.gpu_nodes_platform - preset = var.gpu_nodes_preset + platform = local.gpu_nodes_platform + preset = local.gpu_nodes_preset } filesystems = var.enable_filestore ? 
[ { diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index 63eb2d40..8d650ae6 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -1,20 +1,21 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR # path = "put path to ssh key here" # } -# K8s modes -cpu_nodes_count = 1 # Number of CPU nodes -cpu_nodes_platform = "cpu-e2" # The CPU node platform -cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset -gpu_nodes_count = 1 # Number of GPU nodes -gpu_nodes_platform = "gpu-h100-sxm" # The GPU node platform -gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection -infiniband_fabric = "fabric-3" # Infiniband fabric name. +# K8s nodes +cpu_nodes_count = 1 # Number of CPU nodes +gpu_nodes_count = 1 # Number of GPU nodes +# cpu_nodes_platform = # CPU nodes platofm +# cpu_nodes_preset = # CPU nodes preset +# gpu_nodes_platform = # GPU nodes platform +# gpu_nodes_preset = # GPU nodes preset +# infiniband_fabric = # Infiniband fabric name. # Observability diff --git a/k8s-training/variables.tf b/k8s-training/variables.tf index b17ba14b..65f6b71c 100644 --- a/k8s-training/variables.tf +++ b/k8s-training/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,12 @@ variable "subnet_id" { type = string } +variable "region" { + description = "The current region." + type = string +} + +# K8s cluster variable "k8s_version" { description = "Kubernetes version to be used in the cluster." 
type = string @@ -114,13 +120,13 @@ variable "cpu_nodes_count" { variable "cpu_nodes_platform" { description = "Platform for nodes in the CPU-only node group." type = string - default = "cpu-e2" + default = null } variable "cpu_nodes_preset" { description = "CPU and RAM configuration for nodes in the CPU-only node group." type = string - default = "16vcpu-64gb" + default = null } variable "cpu_disk_type" { @@ -145,13 +151,13 @@ variable "gpu_nodes_count" { variable "gpu_nodes_platform" { description = "Platform for nodes in the GPU node group." type = string - default = "gpu-h100-sxm" + default = null } variable "gpu_nodes_preset" { description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group." type = string - default = "8gpu-128vcpu-1600gb" + default = null } variable "gpu_disk_type" { @@ -169,7 +175,7 @@ variable "gpu_disk_size" { variable "infiniband_fabric" { description = "Infiniband's fabric name." type = string - default = "fabric-3" + default = null } variable "gpu_nodes_assign_public_ip" { From ab7b2deef6784bc801f1210d2e3b9fc6f396316d Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:32:49 +0200 Subject: [PATCH 16/32] Added "region" variable to control platform defaults (GlusterFS module); --- k8s-inference/gluster-fs.tf | 2 ++ k8s-training/gluster-fs.tf | 2 ++ 2 files changed, 4 insertions(+) diff --git a/k8s-inference/gluster-fs.tf b/k8s-inference/gluster-fs.tf index 6a87b0e8..8d800f53 100644 --- a/k8s-inference/gluster-fs.tf +++ b/k8s-inference/gluster-fs.tf @@ -7,4 +7,6 @@ module "glusterfs" { disk_count_per_vm = var.glusterfs_disk_count_per_vm disk_size = var.glusterfs_disk_size ssh_public_key = local.ssh_public_key + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } diff --git a/k8s-training/gluster-fs.tf b/k8s-training/gluster-fs.tf index 6a87b0e8..8d800f53 100644 --- a/k8s-training/gluster-fs.tf +++ b/k8s-training/gluster-fs.tf @@ -7,4 +7,6 @@ module "glusterfs" { 
disk_count_per_vm = var.glusterfs_disk_count_per_vm disk_size = var.glusterfs_disk_size ssh_public_key = local.ssh_public_key + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } From 5216c7a2ab4ac5356095e0f35c0b8ef8177e9f6f Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:38:19 +0200 Subject: [PATCH 17/32] Added "region" variable to control platform defaults (NFS Server); --- nfs-server/locals.tf | 16 ++++++++++++++++ nfs-server/main.tf | 2 ++ nfs-server/{nfs.tfvars => terraform.tfvars} | 1 + nfs-server/variables.tf | 17 +++++++++++++++++ 4 files changed, 36 insertions(+) rename nfs-server/{nfs.tfvars => terraform.tfvars} (88%) diff --git a/nfs-server/locals.tf b/nfs-server/locals.tf index ed79b470..2036a642 100644 --- a/nfs-server/locals.tf +++ b/nfs-server/locals.tf @@ -1,4 +1,20 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + } + eu-north1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) } diff --git a/nfs-server/main.tf b/nfs-server/main.tf index 10d495ac..39085d14 100644 --- a/nfs-server/main.tf +++ b/nfs-server/main.tf @@ -9,4 +9,6 @@ module "nfs-module" { ssh_public_key = var.ssh_public_key.key nfs_ip_range = var.nfs_ip_range nfs_size = var.nfs_size + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } diff --git a/nfs-server/nfs.tfvars b/nfs-server/terraform.tfvars similarity index 88% rename from nfs-server/nfs.tfvars rename to nfs-server/terraform.tfvars index 275cf081..d1f874e1 
100644 --- a/nfs-server/nfs.tfvars +++ b/nfs-server/terraform.tfvars @@ -1,5 +1,6 @@ parent_id = "project-..." subnet_id = "vpcsubnet-..." +region = "eu-north1" ssh_user_name = "nfs" ssh_public_key = { key = "put your ssh key here" diff --git a/nfs-server/variables.tf b/nfs-server/variables.tf index 369cefba..d04c7ed3 100644 --- a/nfs-server/variables.tf +++ b/nfs-server/variables.tf @@ -8,6 +8,23 @@ variable "subnet_id" { description = "ID of the subnet." } +variable "region" { + type = string + description = "Project region." +} + +variable "cpu_nodes_platform" { + description = "Platform for instances." + type = string + default = null +} + +variable "cpu_nodes_preset" { + description = "CPU and RAM configuration for instances." + type = string + default = null +} + variable "nfs_size" { type = number default = 93 * 1024 * 1024 * 1024 # size should be a multiple of 99857989632 From e704aad4e9fb946b4e02d38c6163dbbe10a23705 Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 14:44:31 +0100 Subject: [PATCH 18/32] Tf fmt --- k8s-inference/locals.tf | 2 +- k8s-inference/main.tf | 2 +- k8s-inference/terraform.tfvars | 4 ++-- k8s-training/locals.tf | 4 ++-- k8s-training/main.tf | 2 +- k8s-training/terraform.tfvars | 10 +++++----- wireguard/terraform.tfvars | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf index b1c34f2c..4edf97ef 100644 --- a/k8s-inference/locals.tf +++ b/k8s-inference/locals.tf @@ -2,7 +2,7 @@ locals { release-suffix = random_string.random.result ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) - + regions_default = { eu-west1 = { cpu_nodes_platform = "cpu-d3" diff --git a/k8s-inference/main.tf b/k8s-inference/main.tf index c2595ef8..76f605c8 100644 --- a/k8s-inference/main.tf +++ b/k8s-inference/main.tf @@ -68,7 +68,7 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } network_interfaces = [ { - subnet_id = var.subnet_id + subnet_id = var.subnet_id public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null } ] diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index c38651d3..b9509541 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -9,8 +9,8 @@ # } # K8s nodes -cpu_nodes_count = 1 # Number of CPU nodes -gpu_nodes_count = 1 # Number of GPU nodes +cpu_nodes_count = 1 # Number of CPU nodes +gpu_nodes_count = 1 # Number of GPU nodes # cpu_nodes_platform = # CPU nodes platofm # cpu_nodes_preset = # CPU nodes preset # gpu_nodes_platform = # GPU nodes platform diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf index 32dd99c5..940172f7 100644 --- a/k8s-training/locals.tf +++ b/k8s-training/locals.tf @@ -9,14 +9,14 @@ locals { cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h200-sxm" gpu_nodes_preset = "1gpu-16vcpu-200gb" - infiniband_fabric = "fabric-5" + infiniband_fabric = "fabric-5" } eu-north1 = { cpu_nodes_platform = "cpu-e2" cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h100-sxm" gpu_nodes_preset = "1gpu-16vcpu-200gb" - infiniband_fabric = "fabric-3" + infiniband_fabric = "fabric-3" } } diff --git a/k8s-training/main.tf b/k8s-training/main.tf index a5b27e05..869a1b72 100644 --- a/k8s-training/main.tf +++ b/k8s-training/main.tf @@ -68,7 +68,7 @@ resource "nebius_mk8s_v1_node_group" "gpu" { } network_interfaces = [ { - subnet_id = var.subnet_id + subnet_id = var.subnet_id public_ip_address = var.gpu_nodes_assign_public_ip ? 
{} : null } ] diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index 8d650ae6..bf2a7f7f 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -9,8 +9,8 @@ # } # K8s nodes -cpu_nodes_count = 1 # Number of CPU nodes -gpu_nodes_count = 1 # Number of GPU nodes +cpu_nodes_count = 1 # Number of CPU nodes +gpu_nodes_count = 1 # Number of GPU nodes # cpu_nodes_platform = # CPU nodes platofm # cpu_nodes_preset = # CPU nodes preset # gpu_nodes_platform = # GPU nodes platform @@ -19,10 +19,10 @@ gpu_nodes_count = 1 # Number of GPU nodes # Observability -enable_grafana = true # Enable or disable Grafana deployment with true or false -enable_prometheus = true # Enable or disable Prometheus deployment with true or false +enable_grafana = true # Enable or disable Grafana deployment with true or false +enable_prometheus = true # Enable or disable Prometheus deployment with true or false enable_loki = false # Enable or disable Loki deployment with true or false -enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false +enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false ## Loki # loki_access_key_id = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki. 
diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 76deadb8..24c95c32 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -8,4 +8,4 @@ # public_ip_allocation_id = "" platform = cpu-e2 -preset = "4vcpu-16gb" \ No newline at end of file +preset = "4vcpu-16gb" \ No newline at end of file From 3f3ce80bfe85c327e80f6de9389e3535eb472ea7 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:50:58 +0200 Subject: [PATCH 19/32] Tests fixed; --- k8s-inference/tests/main.tftest.hcl | 3 +++ k8s-training/tests/k8s-training-kuberay.tftest.hcl | 3 +++ k8s-training/tests/main.tftest.hcl | 2 ++ 3 files changed, 8 insertions(+) diff --git a/k8s-inference/tests/main.tftest.hcl b/k8s-inference/tests/main.tftest.hcl index 040a2316..b9af681f 100644 --- a/k8s-inference/tests/main.tftest.hcl +++ b/k8s-inference/tests/main.tftest.hcl @@ -6,6 +6,7 @@ run "k8s_inference_apply" { ] } variables { + region = "eu-north1" etcd_cluster_size = 1 } } @@ -26,6 +27,7 @@ run "k8s_node_groups_inference_apply" { run "full_inference_apply" { command = apply variables { + region = "eu-north1" etcd_cluster_size = 1 } } @@ -34,6 +36,7 @@ run "test_mode_k8s_inference_apply" { command = apply variables { + region = "eu-north1" etcd_cluster_size = 1 test_mode = true } diff --git a/k8s-training/tests/k8s-training-kuberay.tftest.hcl b/k8s-training/tests/k8s-training-kuberay.tftest.hcl index af0d21d1..288be471 100644 --- a/k8s-training/tests/k8s-training-kuberay.tftest.hcl +++ b/k8s-training/tests/k8s-training-kuberay.tftest.hcl @@ -6,6 +6,7 @@ run "k8s_training_kuberay_apply" { ] } variables { + region = "eu-north1" etcd_cluster_size = 1 } } @@ -19,6 +20,7 @@ run "k8s_node_groups_training_kuberay_apply" { ] } variables { + region = "eu-north1" etcd_cluster_size = 1 } } @@ -27,6 +29,7 @@ run "full_training_kuberay_apply" { command = apply variables { + region = "eu-north1" etcd_cluster_size = 1 enable_loki = false # TODO: Disabling Loki since not possible 
to delete non-empty storage bucket enable_kuberay = true diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index 1f204bd3..c77d108c 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -21,6 +21,7 @@ run "full_training_apply" { command = apply variables { + region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket } } @@ -29,6 +30,7 @@ run "test_mode_k8s_training_apply" { command = apply variables { + region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket test_mode = true } From d5512e22c2c0aa434d935e484509d83890a4cbc8 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 15:53:12 +0200 Subject: [PATCH 20/32] Tf fmt; --- nfs-server/locals.tf | 2 +- nfs-server/variables.tf | 2 +- slurm/tests/main.tftest.hcl | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nfs-server/locals.tf b/nfs-server/locals.tf index 2036a642..cba34bbe 100644 --- a/nfs-server/locals.tf +++ b/nfs-server/locals.tf @@ -2,7 +2,7 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) - regions_default = { + regions_default = { eu-west1 = { cpu_nodes_platform = "cpu-d3" cpu_nodes_preset = "16vcpu-64gb" diff --git a/nfs-server/variables.tf b/nfs-server/variables.tf index d04c7ed3..a83f100f 100644 --- a/nfs-server/variables.tf +++ b/nfs-server/variables.tf @@ -9,7 +9,7 @@ variable "subnet_id" { } variable "region" { - type = string + type = string description = "Project region." 
} diff --git a/slurm/tests/main.tftest.hcl b/slurm/tests/main.tftest.hcl index 6847e79c..fac982af 100644 --- a/slurm/tests/main.tftest.hcl +++ b/slurm/tests/main.tftest.hcl @@ -2,6 +2,7 @@ run "slurm_master_apply" { command = apply variables { + region = "eu-north1" cluster_workers_count = 2 } @@ -16,6 +17,7 @@ run "slurm_full_apply" { command = apply variables { + region = "eu-north1" cluster_workers_count = 2 } } @@ -24,6 +26,7 @@ run "test_mode_slurm_apply" { command = apply variables { + region = "eu-north1" cluster_workers_count = 2 test_mode = true } From 2991b9ec950c4d41c0f5210a0d7977418690e87e Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:08:30 +0200 Subject: [PATCH 21/32] Tests fixed (2); --- k8s-inference/tests/main.tftest.hcl | 1 + 1 file changed, 1 insertion(+) diff --git a/k8s-inference/tests/main.tftest.hcl b/k8s-inference/tests/main.tftest.hcl index b9af681f..df217ad4 100644 --- a/k8s-inference/tests/main.tftest.hcl +++ b/k8s-inference/tests/main.tftest.hcl @@ -20,6 +20,7 @@ run "k8s_node_groups_inference_apply" { ] } variables { + region = "eu-north1" etcd_cluster_size = 1 } } From 1948cca5343c6d01cc0c70837df4c7cf5f7c9370 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:16:37 +0200 Subject: [PATCH 22/32] Added "region" variable to control platform defaults (WireGuard); --- wireguard/locals.tf | 17 +++++++++++++++++ wireguard/terraform.tfvars | 12 +++++------- wireguard/tests/main.tftest.hcl | 1 + wireguard/variables.tf | 5 +++++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/wireguard/locals.tf b/wireguard/locals.tf index ed79b470..8c7a63f2 100644 --- a/wireguard/locals.tf +++ b/wireguard/locals.tf @@ -1,4 +1,21 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + platform = "cpu-d3" + preset = "16vcpu-64gb" + } + eu-north1 = { + platform = "cpu-e2" + preset = "16vcpu-64gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + platform = coalesce(var.platform, local.current_region_defaults.platform) + preset = coalesce(var.preset, local.current_region_defaults.preset) + } diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 24c95c32..2ca36082 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -1,11 +1,9 @@ -# parent_id = "" -# subnet_id = "" -# ssh_user_name = "ubuntu" +# parent_id = "" +# subnet_id = "" +# region = "eu-west1" +ssh_user_name = "ubuntu" # ssh_public_key = { # key = "put your public ssh key here" # path = "put path to ssh key here" # } -# public_ip_allocation_id = "" - -platform = cpu-e2 -preset = "4vcpu-16gb" \ No newline at end of file +# public_ip_allocation_id = "" \ No newline at end of file diff --git a/wireguard/tests/main.tftest.hcl b/wireguard/tests/main.tftest.hcl index f8ebc7af..f9d99353 100644 --- a/wireguard/tests/main.tftest.hcl +++ b/wireguard/tests/main.tftest.hcl @@ -6,6 +6,7 @@ run "test_mode_wireguard_apply" { command = apply variables { + region = "eu-north1" test_mode = true } } diff --git a/wireguard/variables.tf b/wireguard/variables.tf index e4d74af9..f05a5a33 100644 --- a/wireguard/variables.tf +++ b/wireguard/variables.tf @@ -9,6 +9,11 @@ variable "subnet_id" { type = string } +variable "region" { + description = "Project region." 
+ type = string +} + # Platform variable "platform" { From 157adb92649c76f57ce7b0566669335a329f2747 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:16:42 +0200 Subject: [PATCH 23/32] Added "region" variable to control platform defaults (Slurm); --- slurm/locals.tf | 23 +++++++++++++++++++++++ slurm/nfs.tf | 2 ++ slurm/slurm-master.tf | 4 ++-- slurm/slurm-worker.tf | 4 ++-- slurm/terraform.tfvars | 8 ++------ slurm/variables.tf | 13 +++++++++---- 6 files changed, 40 insertions(+), 14 deletions(-) diff --git a/slurm/locals.tf b/slurm/locals.tf index ed79b470..1981d5b5 100644 --- a/slurm/locals.tf +++ b/slurm/locals.tf @@ -1,4 +1,27 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) + + + regions_default = { + eu-west1 = { + master_platform = "cpu-d3" + master_preset = "16vcpu-64gb" + worker_platform = "gpu-h200-sxm" + worker_preset = "1gpu-16vcpu-200gb" + } + eu-north1 = { + master_platform = "cpu-e2" + master_preset = "16vcpu-64gb" + worker_platform = "gpu-h100-sxm" + worker_preset = "1gpu-16vcpu-200gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + master_platform = coalesce(var.master_platform, local.current_region_defaults.master_platform) + master_preset = coalesce(var.master_preset, local.current_region_defaults.master_preset) + worker_platform = coalesce(var.worker_platform, local.current_region_defaults.worker_platform) + worker_preset = coalesce(var.worker_preset, local.current_region_defaults.worker_preset) } diff --git a/slurm/nfs.tf b/slurm/nfs.tf index 552ab197..d0fe16c8 100644 --- a/slurm/nfs.tf +++ b/slurm/nfs.tf @@ -10,4 +10,6 @@ module "nfs-module" { ssh_public_key = local.ssh_public_key nfs_ip_range = "192.168.0.0/16" nfs_size = var.fs_size + platform = local.master_platform + preset = local.master_preset } diff --git a/slurm/slurm-master.tf b/slurm/slurm-master.tf index cc34894f..dda58492 
100644 --- a/slurm/slurm-master.tf +++ b/slurm/slurm-master.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "master" { name = "slurm-master" parent_id = var.parent_id resources = { - platform = var.master_platform - preset = var.master_preset + platform = local.master_platform + preset = local.master_preset } boot_disk = { attach_mode = "READ_WRITE" diff --git a/slurm/slurm-worker.tf b/slurm/slurm-worker.tf index 21317552..2bbfc0a6 100644 --- a/slurm/slurm-worker.tf +++ b/slurm/slurm-worker.tf @@ -29,8 +29,8 @@ resource "nebius_compute_v1_instance" "worker" { name = each.key parent_id = var.parent_id resources = { - platform = var.worker_platform - preset = var.worker_preset + platform = local.worker_platform + preset = local.worker_preset } gpu_cluster = nebius_compute_v1_gpu_cluster.gpu-cluster-slurm diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars index 5db8a9f0..3d54369b 100644 --- a/slurm/terraform.tfvars +++ b/slurm/terraform.tfvars @@ -1,14 +1,10 @@ parent_id = "project-e00..." subnet_id = "vpcsubnet-e00..." +region = "" cluster_workers_count = 2 # amount of workers mysql_jobs_backend = false # Do you want to use mysql shared_fs_type = "filesystem" # "nfs" or "filesystem" # ssh_public_key = { # key = "put your public ssh key here" # path = "put path to ssh key here" -# } - -master_platform = "cpu-e2" -master_preset = "4vcpu-16gb" -worker_platform = "gpu-h100-sxm" -worker_preset = "8gpu-128vcpu-1600gb" \ No newline at end of file +# } \ No newline at end of file diff --git a/slurm/variables.tf b/slurm/variables.tf index 8a957eb7..f128c0d8 100644 --- a/slurm/variables.tf +++ b/slurm/variables.tf @@ -5,6 +5,11 @@ variable "subnet_id" { type = string } +variable "region" { + description = "Project region." + type = string +} + variable "ib_image_id" { type = string description = "ID of Infiniband image" @@ -32,25 +37,25 @@ variable "ssh_public_key" { variable "master_platform" { description = "Platform for Slurm Master." 
type = string - default = "cpu-e2" + default = null } variable "master_preset" { description = "Preset for Slurm Master." type = string - default = "4vcpu-16gb" + default = null } variable "worker_platform" { description = "Platform for Slurm Worker." type = string - default = "gpu-h100-sxm" + default = null } variable "worker_preset" { description = "Preset for Slurm Worker." type = string - default = "8gpu-128vcpu-1600gb" + default = null } variable "mysql_jobs_backend" { From 4a55aafd0916f2d95f2006433fdf7e70fbd85dd1 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:19:42 +0200 Subject: [PATCH 24/32] terraform.tfvars files refactored; --- k8s-inference/terraform.tfvars | 2 +- k8s-training/terraform.tfvars | 2 +- nfs-server/terraform.tfvars | 16 ++++++++-------- slurm/terraform.tfvars | 20 +++++++++++++------- wireguard/terraform.tfvars | 12 ++++++------ 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars index b9509541..35b76296 100644 --- a/k8s-inference/terraform.tfvars +++ b/k8s-inference/terraform.tfvars @@ -1,7 +1,7 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id -# region = "" +# region = "" # Project region # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { # key = "put your public ssh key here" OR diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars index bf2a7f7f..f62dfe05 100644 --- a/k8s-training/terraform.tfvars +++ b/k8s-training/terraform.tfvars @@ -1,7 +1,7 @@ # Cloud environment and network # parent_id = "" # The project-id in this context # subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id -# region = "" +# region = "" # Project region # ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { 
# key = "put your public ssh key here" OR diff --git a/nfs-server/terraform.tfvars b/nfs-server/terraform.tfvars index d1f874e1..b9b7adf1 100644 --- a/nfs-server/terraform.tfvars +++ b/nfs-server/terraform.tfvars @@ -1,9 +1,9 @@ -parent_id = "project-..." -subnet_id = "vpcsubnet-..." -region = "eu-north1" -ssh_user_name = "nfs" -ssh_public_key = { - key = "put your ssh key here" - # path = "or put path to ssh key here" -} +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" +# } nfs_ip_range = "192.168.0.0/16" diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars index 3d54369b..3c1ff8a4 100644 --- a/slurm/terraform.tfvars +++ b/slurm/terraform.tfvars @@ -1,10 +1,16 @@ -parent_id = "project-e00..." -subnet_id = "vpcsubnet-e00..." 
-region = "" +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" +# } cluster_workers_count = 2 # amount of workers mysql_jobs_backend = false # Do you want to use mysql shared_fs_type = "filesystem" # "nfs" or "filesystem" -# ssh_public_key = { -# key = "put your public ssh key here" -# path = "put path to ssh key here" -# } \ No newline at end of file + +# master_platform = +# master_preset = +# worker_platform = +# worker_preset = \ No newline at end of file diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 2ca36082..79c25e96 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -1,9 +1,9 @@ -# parent_id = "" -# subnet_id = "" -# region = "eu-west1" -ssh_user_name = "ubuntu" +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { -# key = "put your public ssh key here" -# path = "put path to ssh key here" +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" # } # public_ip_allocation_id = "" \ No newline at end of file From f4e08df7943912d3b9e19812e73ea252a5e0d5c1 Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 15:30:54 +0100 Subject: [PATCH 25/32] TF fmt --- k8s-training/tests/main.tftest.hcl | 4 ++-- slurm/tests/main.tftest.hcl | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index c77d108c..86026283 100644 --- a/k8s-training/tests/main.tftest.hcl +++ 
b/k8s-training/tests/main.tftest.hcl @@ -21,7 +21,7 @@ run "full_training_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket } } @@ -30,7 +30,7 @@ run "test_mode_k8s_training_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket test_mode = true } diff --git a/slurm/tests/main.tftest.hcl b/slurm/tests/main.tftest.hcl index fac982af..1f9367c9 100644 --- a/slurm/tests/main.tftest.hcl +++ b/slurm/tests/main.tftest.hcl @@ -2,7 +2,7 @@ run "slurm_master_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" cluster_workers_count = 2 } @@ -17,7 +17,7 @@ run "slurm_full_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" cluster_workers_count = 2 } } @@ -26,7 +26,7 @@ run "test_mode_slurm_apply" { command = apply variables { - region = "eu-north1" + region = "eu-north1" cluster_workers_count = 2 test_mode = true } From 5a18cda3f7842a0c4528a2ba4435a417c1bc14cd Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 15:35:57 +0100 Subject: [PATCH 26/32] Added region variables for tests --- k8s-training/tests/main.tftest.hcl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index 86026283..fb00360b 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -1,5 +1,11 @@ run "k8s_training_apply" { command = apply + + variables { + region = "eu-north1" + enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket + } + plan_options { target = [ nebius_mk8s_v1_cluster.k8s-cluster @@ -9,6 +15,12 @@ run "k8s_training_apply" { run "k8s_node_groups_training_apply" { command = apply + + variables { + region 
= "eu-north1" + enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket + } + plan_options { target = [ nebius_mk8s_v1_node_group.cpu-only, From 26de35561e4a770d1b40cf5ac20831d66326d72a Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 15:46:13 +0100 Subject: [PATCH 27/32] Clean region variable for tests --- .github/workflows/terraform.yml | 1 + k8s-inference/tests/main.tftest.hcl | 4 ---- k8s-training/tests/k8s-training-kuberay.tftest.hcl | 3 --- k8s-training/tests/main.tftest.hcl | 12 ------------ slurm/tests/main.tftest.hcl | 3 --- 5 files changed, 1 insertion(+), 22 deletions(-) diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index 25cf059d..24e8a75a 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -39,6 +39,7 @@ jobs: env: TF_VAR_subnet_id: vpcsubnet-e00dgdntmhgkeej1z3 + TF_VAR_region: eu-north1 TF_VAR_loki_access_key_id: ${{ secrets.SA_ACCESS_KEY_ID }} TF_VAR_loki_secret_key: ${{ secrets.SA_SECRET_KEY }} diff --git a/k8s-inference/tests/main.tftest.hcl b/k8s-inference/tests/main.tftest.hcl index df217ad4..040a2316 100644 --- a/k8s-inference/tests/main.tftest.hcl +++ b/k8s-inference/tests/main.tftest.hcl @@ -6,7 +6,6 @@ run "k8s_inference_apply" { ] } variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -20,7 +19,6 @@ run "k8s_node_groups_inference_apply" { ] } variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -28,7 +26,6 @@ run "k8s_node_groups_inference_apply" { run "full_inference_apply" { command = apply variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -37,7 +34,6 @@ run "test_mode_k8s_inference_apply" { command = apply variables { - region = "eu-north1" etcd_cluster_size = 1 test_mode = true } diff --git a/k8s-training/tests/k8s-training-kuberay.tftest.hcl b/k8s-training/tests/k8s-training-kuberay.tftest.hcl index 288be471..af0d21d1 100644 --- 
a/k8s-training/tests/k8s-training-kuberay.tftest.hcl +++ b/k8s-training/tests/k8s-training-kuberay.tftest.hcl @@ -6,7 +6,6 @@ run "k8s_training_kuberay_apply" { ] } variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -20,7 +19,6 @@ run "k8s_node_groups_training_kuberay_apply" { ] } variables { - region = "eu-north1" etcd_cluster_size = 1 } } @@ -29,7 +27,6 @@ run "full_training_kuberay_apply" { command = apply variables { - region = "eu-north1" etcd_cluster_size = 1 enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket enable_kuberay = true diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index fb00360b..14f99ed0 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -1,11 +1,6 @@ run "k8s_training_apply" { command = apply - variables { - region = "eu-north1" - enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket - } - plan_options { target = [ nebius_mk8s_v1_cluster.k8s-cluster @@ -16,11 +11,6 @@ run "k8s_training_apply" { run "k8s_node_groups_training_apply" { command = apply - variables { - region = "eu-north1" - enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket - } - plan_options { target = [ nebius_mk8s_v1_node_group.cpu-only, @@ -33,7 +23,6 @@ run "full_training_apply" { command = apply variables { - region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket } } @@ -42,7 +31,6 @@ run "test_mode_k8s_training_apply" { command = apply variables { - region = "eu-north1" enable_loki = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket test_mode = true } diff --git a/slurm/tests/main.tftest.hcl b/slurm/tests/main.tftest.hcl index 1f9367c9..6847e79c 100644 --- a/slurm/tests/main.tftest.hcl +++ b/slurm/tests/main.tftest.hcl @@ -2,7 +2,6 @@ run 
"slurm_master_apply" { command = apply variables { - region = "eu-north1" cluster_workers_count = 2 } @@ -17,7 +16,6 @@ run "slurm_full_apply" { command = apply variables { - region = "eu-north1" cluster_workers_count = 2 } } @@ -26,7 +24,6 @@ run "test_mode_slurm_apply" { command = apply variables { - region = "eu-north1" cluster_workers_count = 2 test_mode = true } From 1f266852fc4e1f3dfedbdf237d70f6d7e5702bf4 Mon Sep 17 00:00:00 2001 From: Ilia Kargapolov Date: Thu, 21 Nov 2024 15:48:05 +0100 Subject: [PATCH 28/32] Clean region variable for tests --- k8s-training/tests/main.tftest.hcl | 2 -- wireguard/tests/main.tftest.hcl | 1 - 2 files changed, 3 deletions(-) diff --git a/k8s-training/tests/main.tftest.hcl b/k8s-training/tests/main.tftest.hcl index 14f99ed0..1f204bd3 100644 --- a/k8s-training/tests/main.tftest.hcl +++ b/k8s-training/tests/main.tftest.hcl @@ -1,6 +1,5 @@ run "k8s_training_apply" { command = apply - plan_options { target = [ nebius_mk8s_v1_cluster.k8s-cluster @@ -10,7 +9,6 @@ run "k8s_training_apply" { run "k8s_node_groups_training_apply" { command = apply - plan_options { target = [ nebius_mk8s_v1_node_group.cpu-only, diff --git a/wireguard/tests/main.tftest.hcl b/wireguard/tests/main.tftest.hcl index f9d99353..f8ebc7af 100644 --- a/wireguard/tests/main.tftest.hcl +++ b/wireguard/tests/main.tftest.hcl @@ -6,7 +6,6 @@ run "test_mode_wireguard_apply" { command = apply variables { - region = "eu-north1" test_mode = true } } From 8d281b855ecba0c15d4fad5bdcc4e7985b690743 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 16:57:09 +0200 Subject: [PATCH 29/32] Added "region" variable to control platform defaults (GlusterFS (2)); --- modules/gluster-module/instances.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/gluster-module/instances.tf b/modules/gluster-module/instances.tf index 3c3450ea..ea057e77 100644 --- a/modules/gluster-module/instances.tf +++ b/modules/gluster-module/instances.tf @@ -14,8 +14,8 
@@ resource "nebius_compute_v1_instance" "gluster-fs-instance" { } ] resources = { - platform = "cpu-e2" - preset = "16vcpu-64gb" + platform = var.platform + preset = var.preset } boot_disk = { From b7f9b4117bd8e383b2f732dbefe0440b9c986077 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 17:21:37 +0200 Subject: [PATCH 30/32] Presets fixed; --- k8s-training/locals.tf | 4 ++-- slurm/locals.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf index 940172f7..165efd27 100644 --- a/k8s-training/locals.tf +++ b/k8s-training/locals.tf @@ -8,14 +8,14 @@ locals { cpu_nodes_platform = "cpu-d3" cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h200-sxm" - gpu_nodes_preset = "1gpu-16vcpu-200gb" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" infiniband_fabric = "fabric-5" } eu-north1 = { cpu_nodes_platform = "cpu-e2" cpu_nodes_preset = "16vcpu-64gb" gpu_nodes_platform = "gpu-h100-sxm" - gpu_nodes_preset = "1gpu-16vcpu-200gb" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" infiniband_fabric = "fabric-3" } } diff --git a/slurm/locals.tf b/slurm/locals.tf index 1981d5b5..90fe2b56 100644 --- a/slurm/locals.tf +++ b/slurm/locals.tf @@ -8,13 +8,13 @@ locals { master_platform = "cpu-d3" master_preset = "16vcpu-64gb" worker_platform = "gpu-h200-sxm" - worker_preset = "1gpu-16vcpu-200gb" + worker_preset = "8gpu-128vcpu-1600gb" } eu-north1 = { master_platform = "cpu-e2" master_preset = "16vcpu-64gb" worker_platform = "gpu-h100-sxm" - worker_preset = "1gpu-16vcpu-200gb" + worker_preset = "8gpu-128vcpu-1600gb" } } From 398051a88cb78ce5314eb5e8e876701f6bac5960 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 18:25:27 +0200 Subject: [PATCH 31/32] Added "region" variable to control platform defaults (WireGuard (2)); --- wireguard/main.tf | 4 ++-- wireguard/variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/wireguard/main.tf b/wireguard/main.tf index 
ebc8e482..9b3abc99 100644 --- a/wireguard/main.tf +++ b/wireguard/main.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "wireguard_instance" { ] resources = { - platform = var.platform - preset = var.preset + platform = local.platform + preset = local.preset } diff --git a/wireguard/variables.tf b/wireguard/variables.tf index f05a5a33..1b0d7c9b 100644 --- a/wireguard/variables.tf +++ b/wireguard/variables.tf @@ -19,13 +19,13 @@ variable "region" { variable "platform" { description = "Platform for WireGuard host." type = string - default = "cpu-e2" + default = null } variable "preset" { description = "Preset for WireGuard host." type = string - default = "4vcpu-16gb" + default = null } # SSH access From 3a0ab6b18b924f0e204b033aa78b3a89f50eebd1 Mon Sep 17 00:00:00 2001 From: Ilya Kelim Date: Thu, 21 Nov 2024 18:27:26 +0200 Subject: [PATCH 32/32] Tests fixed (3); --- k8s-training/applications.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s-training/applications.tf b/k8s-training/applications.tf index 3e84067a..d48326c5 100644 --- a/k8s-training/applications.tf +++ b/k8s-training/applications.tf @@ -12,8 +12,8 @@ module "kuberay" { parent_id = var.parent_id cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id - gpu_platform = var.gpu_nodes_platform - cpu_platform = var.cpu_nodes_platform + gpu_platform = local.gpu_nodes_platform + cpu_platform = local.cpu_nodes_platform min_gpu_replicas = var.kuberay_min_gpu_replicas max_gpu_replicas = var.kuberay_max_gpu_replicas }