diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml
index 25cf059d..24e8a75a 100644
--- a/.github/workflows/terraform.yml
+++ b/.github/workflows/terraform.yml
@@ -39,6 +39,7 @@ jobs:
     env:
       TF_VAR_subnet_id: vpcsubnet-e00dgdntmhgkeej1z3
+      TF_VAR_region: eu-north1
       TF_VAR_loki_access_key_id: ${{ secrets.SA_ACCESS_KEY_ID }}
       TF_VAR_loki_secret_key: ${{ secrets.SA_SECRET_KEY }}
diff --git a/k8s-inference/README.md b/k8s-inference/README.md
index 020f9941..42bfb756 100644
--- a/k8s-inference/README.md
+++ b/k8s-inference/README.md
@@ -75,6 +75,7 @@ There are additional configurable variables in `variables.tf`.
 # Cloud environment and network
 parent_id     = "" # The project-id in this context
 subnet_id     = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+region        = "" # The project region
 ssh_user_name = "" # Username you want to use to connect to the nodes
 ssh_public_key = {
   key = "put your public ssh key here" OR
@@ -266,13 +267,13 @@ apiVersion: v1
 metadata:
   name: external-storage-persistent-volume
 spec:
-  storageClassName: hostpath
+  storageClassName: csi-mounted-fs-path-sc
   capacity:
     storage: ""
   accessModes:
     - ReadWriteMany
   hostPath:
-    path: "" # "/mnt/filestore/" or "/mnt/glusterfs/"
+    path: "" # "/mnt/data/" or "/mnt/glusterfs/"
 ---
@@ -281,10 +282,21 @@ apiVersion: v1
 metadata:
   name: external-storage-persistent-volumeclaim
 spec:
-  storageClassName: hostpath
+  storageClassName: csi-mounted-fs-path-sc
   accessModes:
     - ReadWriteMany
   resources:
     requests:
       storage: ""
 ```
+
+## CSI limitations
+- The FS must be mounted to all node groups, because attaching a PV to a pod running on a node without the FS will fail
+- A single PV can consume the entire shared FS capacity
+- The FS size is not updated automatically if the PV size exceeds its spec size
+- For now, the FS size can't be updated through the API, only through NEBOPS. (thread)
+- `volumeMode: Block` is not possible
+
+## Good to know
+- PVs in read-write-many mode will work
+- MSP has started testing this solution to enable early integration with mk8s.
diff --git a/k8s-inference/gluster-fs.tf b/k8s-inference/gluster-fs.tf
index 6a87b0e8..8d800f53 100644
--- a/k8s-inference/gluster-fs.tf
+++ b/k8s-inference/gluster-fs.tf
@@ -7,4 +7,6 @@ module "glusterfs" {
   disk_count_per_vm = var.glusterfs_disk_count_per_vm
   disk_size         = var.glusterfs_disk_size
   ssh_public_key    = local.ssh_public_key
+  platform          = local.cpu_nodes_platform
+  preset            = local.cpu_nodes_preset
 }
diff --git a/k8s-inference/helm.tf b/k8s-inference/helm.tf
index 283ea517..1e09c23b 100644
--- a/k8s-inference/helm.tf
+++ b/k8s-inference/helm.tf
@@ -30,7 +30,7 @@ module "o11y" {
       enabled = var.enable_dcgm,
       node_groups = {
         node_group_name = {
-          gpus              = tonumber(split("gpu-", var.gpu_nodes_preset)[0])
+          gpus              = tonumber(split("gpu-", local.gpu_nodes_preset)[0])
           instance_group_id = nebius_mk8s_v1_node_group.gpu.id
         }
       }
@@ -39,3 +39,8 @@ module "o11y" {
   }
   test_mode = var.test_mode
 }
+
+module "csi-mounted-fs-path" {
+  source = "../modules/csi-mounted-fs-path"
+  count  = var.enable_filestore ? 1 : 0
+}
diff --git a/k8s-inference/locals.tf b/k8s-inference/locals.tf
index a4fb0a61..4edf97ef 100644
--- a/k8s-inference/locals.tf
+++ b/k8s-inference/locals.tf
@@ -2,6 +2,28 @@ locals {
   release-suffix = random_string.random.result
   ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : (
   fileexists(var.ssh_public_key.path) ?
file(var.ssh_public_key.path) : null)
+
+  regions_default = {
+    eu-west1 = {
+      cpu_nodes_platform = "cpu-d3"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h200-sxm"
+      gpu_nodes_preset   = "1gpu-16vcpu-200gb"
+    }
+    eu-north1 = {
+      cpu_nodes_platform = "cpu-e2"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h100-sxm"
+      gpu_nodes_preset   = "1gpu-16vcpu-200gb"
+    }
+  }
+
+  current_region_defaults = local.regions_default[var.region]
+
+  cpu_nodes_preset   = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset)
+  cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform)
+  gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform)
+  gpu_nodes_preset   = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset)
 }
 
 resource "random_string" "random" {
diff --git a/k8s-inference/main.tf b/k8s-inference/main.tf
index 0f9942ef..76f605c8 100644
--- a/k8s-inference/main.tf
+++ b/k8s-inference/main.tf
@@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" {
     }
   ]
   resources = {
-    platform = var.cpu_nodes_platform
-    preset   = var.cpu_nodes_preset
+    platform = local.cpu_nodes_platform
+    preset   = local.cpu_nodes_preset
   }
   filesystems = var.enable_filestore ? [
     {
@@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
   }
   network_interfaces = [
     {
-      subnet_id = var.subnet_id
-      public_ip = var.gpu_nodes_assign_public_ip ? {} : null
+      subnet_id         = var.subnet_id
+      public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null
     }
   ]
   resources = {
-    platform = var.gpu_nodes_platform
-    preset   = var.gpu_nodes_preset
+    platform = local.gpu_nodes_platform
+    preset   = local.gpu_nodes_preset
   }
   filesystems = var.enable_filestore ? [
     {
diff --git a/k8s-inference/terraform.tfvars b/k8s-inference/terraform.tfvars
index 7ff9869d..35b76296 100644
--- a/k8s-inference/terraform.tfvars
+++ b/k8s-inference/terraform.tfvars
@@ -1,17 +1,20 @@
 # Cloud environment and network
-# parent_id     = "" # The project-id in this context
-# subnet_id     = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
-# ssh_user_name = "" # Username you want to use to connect to the nodes
+# parent_id     = "" # The project-id in this context
+# subnet_id     = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+# region        = "" # Project region
+# ssh_user_name = "" # Username you want to use to connect to the nodes
 # ssh_public_key = {
 #   key  = "put your public ssh key here" OR
 #   path = "put path to ssh key here"
 # }
 
-# K8s modes
-cpu_nodes_count  = 1                   # Number of CPU nodes
-cpu_nodes_preset = "16vcpu-64gb"       # The CPU node preset
-gpu_nodes_count  = 1                   # Number of GPU nodes
-gpu_nodes_preset = "1gpu-16vcpu-200gb" # The GPU node preset. Set to "8gpu-128vcpu-1600gb", to deploy nodes with 8 GPUs.
+# K8s nodes
+cpu_nodes_count = 1 # Number of CPU nodes
+gpu_nodes_count = 1 # Number of GPU nodes
+# cpu_nodes_platform = # CPU nodes platform
+# cpu_nodes_preset   = # CPU nodes preset
+# gpu_nodes_platform = # GPU nodes platform
+# gpu_nodes_preset   = # GPU nodes preset
 
 # Observability
 enable_grafana = true # Enable or disable Grafana deployment with true or false
diff --git a/k8s-inference/variables.tf b/k8s-inference/variables.tf
index ec1a994d..1087a015 100644
--- a/k8s-inference/variables.tf
+++ b/k8s-inference/variables.tf
@@ -1,4 +1,4 @@
-# K8s cluster
+# Global
 variable "parent_id" {
   description = "Project ID."
  type = string
@@ -9,6 +9,12 @@ variable "subnet_id" {
   type = string
 }
 
+variable "region" {
+  description = "The current region."
+  type        = string
+}
+
+# K8s cluster
 variable "k8s_version" {
   description = "Kubernetes version to be used in the cluster."
   type        = string
@@ -114,13 +120,13 @@ variable "cpu_nodes_count" {
 variable "cpu_nodes_platform" {
   description = "Platform for nodes in the CPU-only node group."
   type        = string
-  default     = "cpu-e2"
+  default     = null
 }
 
 variable "cpu_nodes_preset" {
   description = "CPU and RAM configuration for nodes in the CPU-only node group."
   type        = string
-  default     = "16vcpu-64gb"
+  default     = null
 }
 
 variable "cpu_disk_type" {
@@ -145,13 +151,13 @@ variable "gpu_nodes_count" {
 variable "gpu_nodes_platform" {
   description = "Platform for nodes in the GPU node group."
   type        = string
-  default     = "gpu-h100-sxm"
+  default     = null
 }
 
 variable "gpu_nodes_preset" {
   description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group."
   type        = string
-  default     = "1gpu-16vcpu-200gb"
+  default     = null
 }
 
 variable "gpu_disk_type" {
diff --git a/k8s-training/README.md b/k8s-training/README.md
index 711f73dc..1c62e18a 100644
--- a/k8s-training/README.md
+++ b/k8s-training/README.md
@@ -3,8 +3,11 @@
 ## Features
 
 - Creating a Kubernetes cluster with CPU and GPU nodes.
-- Installing the necessary [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) and [Network Operator](https://docs.nvidia.com/networking/display/cokan10/network+operator) for running GPU workloads.
-- Installing [Grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana).
+
+- Installing the required [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator)
+  and [Network Operator](https://docs.nvidia.com/networking/display/cokan10/network+operator) for running GPU
+  workloads.
+- Installing [Grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana).
+
 - Installing [Prometheus](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus).
 - Installing [Loki](https://github.com/grafana/loki/tree/main/production/helm/loki).
 - Installing [Promtail](https://github.com/grafana/helm-charts/tree/main/charts/promtail).
@@ -28,6 +31,7 @@
    source ~/.bashrc
    ```
+
 3. [Configure Nebius CLI](https://docs.nebius.com/cli/configure/) (it is recommended to use [service account](https://docs.nebius.com/iam/service-accounts/manage/) for configuration)
 4. Install JQuery:
@@ -40,6 +44,7 @@
    sudo apt install jq -y
    ```
+
 ## Usage
 
 To deploy a Kubernetes cluster, follow these steps:
@@ -52,7 +57,11 @@ To deploy a Kubernetes cluster, follow these steps:
    ```bash
    terraform init
    ```
-3. Replace the placeholder content in `terraform.tfvars` with configuration values that meet your specific requirements. See the details [below](#configuration-variables).
+
+3. Replace the placeholder content
+   in `terraform.tfvars` with configuration values that meet your specific
+   requirements. See the details [below](#configuration-variables).
+
 4. Preview the deployment plan:
    ```bash
    terraform plan
@@ -65,86 +74,103 @@ To deploy a Kubernetes cluster, follow these steps:
 
 ## Configuration variables
 
-These are the basic configurations needed to deploy Kubernetes for Training in Nebius AI. Edit in the configurations that you need in the file `terraform.tfvars`.
+These are the basic configurations required to deploy Kubernetes for training in Nebius AI. Edit the configurations as necessary in the `terraform.tfvars` file.
 
-There are additional configurable variables in `variables.tf`.
+Additional configurable variables can be found in the `variables.tf` file.
 
 ### Environment and network variables
+
 ```hcl
 # Cloud environment and network
 parent_id      = "" # The project-id in this context
-subnet_id      = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+subnet_id      = "" # Run the `nebius vpc v1alpha1 network list` command to see the subnet id
+region         = "" # The project region
 ssh_user_name  = "" # Username you want to use to connect to the nodes
 ssh_public_key = {
-  key  = "put your public ssh key here" OR
-  path = "put path to ssh key here"
+  key  = "Enter your public SSH key here" OR
+  path = "Enter the path to your SSH key here"
 }
 ```
 
 ### Kubernetes nodes
+
 ```hcl
 # K8s modes
 cpu_nodes_count  = 1 # Number of CPU nodes
-cpu_nodes_preset = "16vcpu-64gb" # The CPU node preset
+cpu_nodes_preset = "16vcpu-64gb" # CPU node preset
 gpu_nodes_count  = 1 # Number of GPU nodes
+
 gpu_nodes_preset = "8gpu-128vcpu-1600gb" # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection.
+
 ```
 
 ### Observability options
+
 ```hcl
 # Observability
 enable_grafana    = true # Enable or disable Grafana deployment with true or false
 enable_prometheus = true # Enable or disable Prometheus deployment with true or false
 enable_loki       = true # Enable or disable Loki deployment with true or false
-enable_dcgm       = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false
+enable_dcgm       = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment using true or false
 
 ## Loki
-loki_access_key_id = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki.
-loki_secret_key    = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki.
+loki_access_key_id = "" # See README.md for instructions. Leave empty if you are not deploying Loki.
+loki_secret_key    = "" # See README.md for instructions. Leave empty if you are not deploying Loki.
 ```
 
-Check the details below for more information on [Grafana](#grafana), [Prometheus](#prometheus), [Loki](#temporary-block-to-make-loki-work-now) and [NVIDIA DCGM](#nvidia-dcgm-exporter-dashboard-and-alerting).
+See the details below for more information on [Grafana](#grafana), [Prometheus](#prometheus), [Loki](#temporary-block-to-make-loki-work-now) and [NVIDIA DCGM](#nvidia-dcgm-exporter-dashboard-and-alerting).
 
-> Deploying Loki will require you to create a service account! Please check the instructions [here](#temporary-block-to-make-loki-work-now)!
+> To deploy Loki, you will need to create a service account. See the instructions [here](#temporary-block-to-make-loki-work-now).
 
 ### Storage configuration
+
 ```hcl
 # Storage
 ## Filestore - recommended
 enable_filestore     = true # Enable or disable Filestore integration with true or false
-filestore_disk_size  = 100 * (1024 * 1024 * 1024) #Set Filestore disk size in bytes. The multiplication makes it easier to set the size in GB. This would set the size as 100GB
-filestore_block_size = 4096 # Set Filestore block size in bytes
+filestore_disk_size  = 100 * (1024 * 1024 * 1024) # Set the Filestore disk size in bytes.
The multiplication makes it easier to set the size in GB, giving you a total of 100 GB
+filestore_block_size = 4096 # Set the Filestore block size in bytes
 
 ## GlusterFS - legacy
 enable_glusterfs = false # Enable or disable GlusterFS integration with true or false
-glusterfs_storage_nodes = 3 # Set amount of storage nodes in GlusterFS cluster
-glusterfs_disk_count_per_vm = 2 # Set amount of disks per storage node in GlusterFS cluster
-glusterfs_disk_size = 100 * (1024 * 1024 * 1024) #Set disk size in bytes. The multiplication makes it easier to set the size in GB. This would set the size as 100GB
+glusterfs_storage_nodes = 3 # Set the number of storage nodes in the GlusterFS cluster
+glusterfs_disk_count_per_vm = 2 # Set the number of disks per storage node in the GlusterFS cluster
+glusterfs_disk_size = 100 * (1024 * 1024 * 1024) # Set the disk size in bytes. The multiplication makes it easier to set the size in GB, giving you a total of 100 GB.
 ```
 
-There are two options available for adding external storage to k8s clusters:
+There are two ways to add external storage to K8s clusters:
 
 - Filestore (recommended, enabled by default)
 - GlusterFS (legacy)
 
-Both would allow creating a Read-Write-Many HostPath PVCs in k8s cluster. Path for Filestore is `/mnt/filestore`, for
-GlusterFS it is `/mnt/glusterfs`.
+Both options allow you to create Read-Write-Many HostPath PVCs in a K8s cluster. Use the following paths: `/mnt/filestore` for Filestore, `/mnt/glusterfs` for
+GlusterFS.
 
-Check [here](#accessing-storage) how to access storage in K8S.
+For more information on how to access storage in K8s, refer to [this section](#accessing-storage).
 
 ## Connecting to the cluster
 
-### Prepare your environment
-* Install kubectl ([instructions](https://kubernetes.io/docs/tasks/tools/#kubectl))
-* Install Nebius AI CLI ([instructions](https://docs.nebius.ai/cli/install)) - also required for deploying the cluster
-* Install JQ ([instructions](https://jqlang.github.io/jq/download/)) - also required for deploying the cluster
+### Preparing the environment
+
+- Install kubectl ([instructions](https://kubernetes.io/docs/tasks/tools/#kubectl))
+- Install the Nebius AI CLI ([instructions](https://docs.nebius.ai/cli/install))
+- Install jq ([instructions](https://jqlang.github.io/jq/download/))
 
 ### Add credentials to the kubectl configuration file
 
 1. Run the following command from the terraform deployment folder:
 
    ```bash
   nebius mk8s v1 cluster get-credentials --id $(cat terraform.tfstate | jq -r '.resources[] | select(.type == "nebius_mk8s_v1_cluster") | .instances[].attributes.id') --external
   ```
-2. Add the credentials and verify the kubectl configuration:
+2. Verify the kubectl configuration after adding the credentials:
 
   ```bash
   kubectl config view
@@ -161,14 +187,16 @@ Check [here](#accessing-storage) how to access storage in K8S.
 
 ### Connect to the cluster
 
 Show cluster information:
-  ```bash
-  kubectl cluster-info
-  ```
+
+```bash
+kubectl cluster-info
+```
 
 Get pods:
-  ```bash
-  kubectl get pods -A
-  ```
+
+```bash
+kubectl get pods -A
+```
 
 ## Observability
 
@@ -180,7 +208,8 @@ Observability stack is enabled by default.
It includes the following components:
 
 ### Grafana
 
-Can be disabled by setting the `enable_grafana` variable to `false` in `the terraform.tfvars` file.
+To disable it, set the `enable_grafana` variable to `false` in the `terraform.tfvars` file.
+
 
 To access Grafana:
 
@@ -189,7 +218,9 @@ To access Grafana:
   kubectl --namespace o11y port-forward service/grafana 8080:80
   ```
-
-2. **Access Grafana dashboard:** Open your browser and go to `http://localhost:8080`.
+
+2. **Access the Grafana dashboard:** Open your browser and go to `http://localhost:8080`.
+
 3. **Log in:** Use the default credentials to log in:
    - **Username:** `admin`
@@ -197,66 +228,73 @@ To access Grafana:
 
 ### Log aggregation
 
-#### Temporary block to make Loki work now
-
-1. Create an SA
-   2. `nebius iam service-account create --parent-id <parent-id> --name <name>`.
-2. Add an SA to the editors group.
-   3. Get your tenant id with `nebius iam whoami`.
-   4. Get the `editors` group id with: `nebius iam group list --parent-id <tenant-id> | grep -n5 "name: editors"`.
-   3. List all members of the `editors` group
-      with `nebius iam group-membership list-members --parent-id <group-id>`.
-   4. Add your SA to the `editors` group
-      with `nebius iam group-membership create --parent-id <group-id> --member-id <sa-id>`
-3. Create access key and get its credentials:
-   4. `nebius iam access-key create --account-service-account-id <sa-id> --description 'AWS CLI' --format json`
-   5. `nebius iam access-key get-by-aws-id --aws-access-key-id <aws-key-id> --view secret --format json`
-4. Update `loki_access_key_id` and `loki_secret_key` in `terraform.tfvars` with info from the last command.
-
-Log aggregation with the Loki is enabled by default. To disable it, set the `enable_loki` variable to `false` in the
+#### Temporary block to make Loki work now
+
+1. Create a service account: \
+   `nebius iam service-account create --parent-id <parent-id> --name <name>`.
+2. Add the service account to the `editors` group: \
+   Get your tenant id using `nebius iam whoami`. \
+   Get the `editors` group id using `nebius iam group list --parent-id <tenant-id> | grep -n5 "name: editors"`. \
+   List all members of the `editors` group with `nebius iam group-membership list-members --parent-id <group-id>`. \
+   Add your service account to the `editors` group with `nebius iam group-membership create --parent-id <group-id> --member-id <sa-id>`.
+3. Create an access key and get its credentials: \
+   `nebius iam access-key create --account-service-account-id <sa-id> --description 'AWS CLI' --format json` \
+   `nebius iam access-key get-by-aws-id --aws-access-key-id <aws-key-id> --view secret --format json`
+
+4. Update `loki_access_key_id` and `loki_secret_key` in `terraform.tfvars` with the output of the previous command.
+
+Log aggregation with Loki is enabled by default. If you want to disable it, set the `enable_loki` variable to `false` in the
 `terraform.tfvars` file.
 
-To access logs navigate to Loki dashboard `http://localhost:8080/d/o6-BGgnnk/loki-kubernetes-logs`
+To access logs, go to the Loki dashboard at `http://localhost:8080/d/o6-BGgnnk/loki-kubernetes-logs`.
 
-**NB!** You would have to manually clean loki bucket before doing `terraform destroy`
+**NB!** You will have to manually clean the Loki bucket before running the `terraform destroy` command.
 
 ### Prometheus
 
-Prometheus server is enabled by default. To disable it, set the `enable_prometheus` variable to `false` in the
-`terraform.tfvars` file.
-Because `DCGM exporter` uses Prometheus as a datasource it will be disabled as well.
-To access logs navigate to Node exporter folder `http://localhost:8080/f/e6acfbcb-6f13-4a58-8e02-f780811a2404/`
+Prometheus server is enabled by default.
If you want to disable it, set the `enable_prometheus` variable to `false` in the `terraform.tfvars` file.
+Because `DCGM exporter` uses Prometheus as a data source, it will also be disabled.
+
+
+To access logs, go to the Node exporter folder `http://localhost:8080/f/e6acfbcb-6f13-4a58-8e02-f780811a2404/`
 
 ### NVIDIA DCGM Exporter Dashboard and Alerting
 
-NVIDIA DCGM Exporter Dashboard and Alerting rules are enabled by default. To disable it, set the `enable_dcgm`
-variable to `false` in the `terraform.tfvars` file.
-By default Alerting rules are created for node groups that has GPUs.
+NVIDIA DCGM Exporter Dashboard and Alerting rules are enabled by default. If you need to disable it, set the `enable_dcgm` variable to `false` in the `terraform.tfvars` file.
+
+
+Alerting rules are created for node groups with GPUs by default.
+
-To access NVIDIA DCGM Exporter Dashboard `http://localhost:8080/d/Oxed_c6Wz/nvidia-dcgm-exporter-dashboard`
+To access the NVIDIA DCGM Exporter dashboard, go to `http://localhost:8080/d/Oxed_c6Wz/nvidia-dcgm-exporter-dashboard`
 
 ### Alerting
 
-To enable alert messages for Slack please refer
-this [article](https://grafana.com/docs/grafana/latest/alerting/configure-notifications/manage-contact-points/integrations/configure-slack/)
+To enable alert messages for Slack, refer to this [article](https://grafana.com/docs/grafana/latest/alerting/configure-notifications/manage-contact-points/integrations/configure-slack/)
 
 ## Accessing storage
 
 ### Prerequisites:
-1. To use csi-driver, it's mandatory to set 'enable_filestore = true' in terraform.tfvars file.
-2. Then, the helm release managing this csi-driver is deployed in helm.tf file by applying the module: "csi-mounted-fs-path".
-3. Keep in mind that 'csi-mounted-fs-path' module is applying only while instances are in boot process, using the following /nebius-solution-library/modules/cloud-init/k8s-cloud-init.tftpl commands:
+
+1. To use csi-driver, you must set `enable_filestore = true` in the `terraform.tfvars` file.
+2. Deploy the helm release that manages this csi-driver in the `helm.tf` file by applying the "csi-mounted-fs-path" module.
+3. Keep in mind that the 'csi-mounted-fs-path' module can only be applied while instances are booting, using the following /nebius-solution-library/modules/cloud-init/k8s-cloud-init.tftpl commands:
 ```shell
-  sudo mkdir -p /mnt/data
-  sudo mount -t virtiofs data /mnt/data
-  echo data /mnt/data \"virtiofs\" \"defaults\" \"0\" \"2\" | sudo tee -a /etc/fstab"
 ```
 
-### Using mounted storageclass
-Using mounted storage requires manually creating Persistent Volumes. Bellow is a template for creating PV and PVC.
-Replace `` and `` variables with actual values.
+### Using mounted StorageClass
+
+To use mounted storage, you need to manually create Persistent Volumes (PVs). Use the template below to create a PV and PVC.
+Replace the empty `storage` size and host `path` values with your specific values.
 
 ```yaml
 kind: PersistentVolume
@@ -288,14 +326,14 @@ spec:
 ```
 
-CSI limitations:
-limitations of CSI over mounted FS
-FS should be mounted to all NodeGroups, because PV attachmend to pod runniing on Node without FS will fail
-One PV may fill up to all common FS size
-FS size will not be autoupdated if PV size exceed it spec size
-FS size for now can't be updated through API, only through NEBOPS.
(thread)
-volumeMode: Block - is not possible
+## CSI limitations
+- The FS must be mounted to all node groups, because attaching a PV to a pod running on a node without the FS will fail
+- A single PV can consume the entire shared FS capacity
+- The FS size is not updated automatically if the PV size exceeds its spec size
+- For now, the FS size can't be updated through the API, only through NEBOPS. (thread)
+- `volumeMode: Block` is not possible
 
-Good to know:
-read-write many mode PV will work
-MSP started testing that solution to enable early integration with mk8s. Hope they will bring feedback soon.
+## Good to know
+- PVs in read-write-many mode will work
+- MSP has started testing this solution to enable early integration with mk8s.
diff --git a/k8s-training/applications.tf b/k8s-training/applications.tf
index 3e84067a..d48326c5 100644
--- a/k8s-training/applications.tf
+++ b/k8s-training/applications.tf
@@ -12,8 +12,8 @@ module "kuberay" {
   parent_id  = var.parent_id
   cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id
 
-  gpu_platform = var.gpu_nodes_platform
-  cpu_platform = var.cpu_nodes_platform
+  gpu_platform = local.gpu_nodes_platform
+  cpu_platform = local.cpu_nodes_platform
   min_gpu_replicas = var.kuberay_min_gpu_replicas
   max_gpu_replicas = var.kuberay_max_gpu_replicas
 }
diff --git a/k8s-training/gluster-fs.tf b/k8s-training/gluster-fs.tf
index 6a87b0e8..8d800f53 100644
--- a/k8s-training/gluster-fs.tf
+++ b/k8s-training/gluster-fs.tf
@@ -7,4 +7,6 @@ module "glusterfs" {
   disk_count_per_vm = var.glusterfs_disk_count_per_vm
   disk_size         = var.glusterfs_disk_size
   ssh_public_key    = local.ssh_public_key
+  platform          = local.cpu_nodes_platform
+  preset            = local.cpu_nodes_preset
 }
diff --git a/k8s-training/gpu_cluster.tf b/k8s-training/gpu_cluster.tf
index 89cce8c5..472d950f 100644
--- a/k8s-training/gpu_cluster.tf
+++ b/k8s-training/gpu_cluster.tf
@@ -1,5 +1,5 @@
 resource "nebius_compute_v1_gpu_cluster" "fabric_2" {
-  infiniband_fabric = var.infiniband_fabric
+  infiniband_fabric = local.infiniband_fabric
   parent_id         = var.parent_id
-  name              = join("-", [var.infiniband_fabric, local.release-suffix])
+  name              = join("-", [local.infiniband_fabric, local.release-suffix])
 }
diff --git a/k8s-training/helm.tf b/k8s-training/helm.tf
index 1bf3755f..6bceeef5 100644
--- a/k8s-training/helm.tf
+++ b/k8s-training/helm.tf
@@ -39,7 +39,7 @@ module "o11y" {
       enabled = var.enable_dcgm,
       node_groups = {
         node_group_name = {
-          gpus              = tonumber(split("gpu-", var.gpu_nodes_preset)[0])
+          gpus              = tonumber(split("gpu-", local.gpu_nodes_preset)[0])
           instance_group_id = nebius_mk8s_v1_node_group.gpu.id
         }
       }
diff --git a/k8s-training/locals.tf b/k8s-training/locals.tf
index a4fb0a61..165efd27 100644
--- a/k8s-training/locals.tf
+++ b/k8s-training/locals.tf
@@ -2,6 +2,31 @@ locals {
   release-suffix = random_string.random.result
   ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : (
   fileexists(var.ssh_public_key.path) ?
file(var.ssh_public_key.path) : null)
+
+  regions_default = {
+    eu-west1 = {
+      cpu_nodes_platform = "cpu-d3"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h200-sxm"
+      gpu_nodes_preset   = "8gpu-128vcpu-1600gb"
+      infiniband_fabric  = "fabric-5"
+    }
+    eu-north1 = {
+      cpu_nodes_platform = "cpu-e2"
+      cpu_nodes_preset   = "16vcpu-64gb"
+      gpu_nodes_platform = "gpu-h100-sxm"
+      gpu_nodes_preset   = "8gpu-128vcpu-1600gb"
+      infiniband_fabric  = "fabric-3"
+    }
+  }
+
+  current_region_defaults = local.regions_default[var.region]
+
+  cpu_nodes_preset   = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset)
+  cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform)
+  gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region_defaults.gpu_nodes_platform)
+  gpu_nodes_preset   = coalesce(var.gpu_nodes_preset, local.current_region_defaults.gpu_nodes_preset)
+  infiniband_fabric  = coalesce(var.infiniband_fabric, local.current_region_defaults.infiniband_fabric)
 }
 
 resource "random_string" "random" {
diff --git a/k8s-training/main.tf b/k8s-training/main.tf
index fa140d14..869a1b72 100644
--- a/k8s-training/main.tf
+++ b/k8s-training/main.tf
@@ -31,8 +31,8 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" {
     }
   ]
   resources = {
-    platform = var.cpu_nodes_platform
-    preset   = var.cpu_nodes_preset
+    platform = local.cpu_nodes_platform
+    preset   = local.cpu_nodes_preset
   }
   filesystems = var.enable_filestore ? [
     {
@@ -68,13 +68,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
   }
   network_interfaces = [
     {
-      subnet_id = var.subnet_id
-      public_ip = var.gpu_nodes_assign_public_ip ? {} : null
+      subnet_id         = var.subnet_id
+      public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null
     }
   ]
   resources = {
-    platform = var.gpu_nodes_platform
-    preset   = var.gpu_nodes_preset
+    platform = local.gpu_nodes_platform
+    preset   = local.gpu_nodes_preset
   }
   filesystems = var.enable_filestore ? [
     {
diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars
index 5392c93f..f62dfe05 100644
--- a/k8s-training/terraform.tfvars
+++ b/k8s-training/terraform.tfvars
@@ -1,24 +1,28 @@
 # Cloud environment and network
 # parent_id     = "" # The project-id in this context
 # subnet_id     = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
+# region        = "" # Project region
 # ssh_user_name = "" # Username you want to use to connect to the nodes
 # ssh_public_key = {
 #   key  = "put your public ssh key here" OR
 #   path = "put path to ssh key here"
 # }
 
-# K8s modes
-cpu_nodes_count  = 1                      # Number of CPU nodes
-cpu_nodes_preset = "16vcpu-64gb"          # The CPU node preset
-gpu_nodes_count  = 1                      # Number of GPU nodes
-gpu_nodes_preset = "8gpu-128vcpu-1600gb"  # The GPU node preset. Only nodes with 8 GPU can be added to gpu cluster with infiniband connection
+# K8s nodes
+cpu_nodes_count = 1 # Number of CPU nodes
+gpu_nodes_count = 1 # Number of GPU nodes
+# cpu_nodes_platform = # CPU nodes platform
+# cpu_nodes_preset   = # CPU nodes preset
+# gpu_nodes_platform = # GPU nodes platform
+# gpu_nodes_preset   = # GPU nodes preset
+# infiniband_fabric  = # Infiniband fabric name.
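+
+# Illustrative example only: to pin specific hardware instead of relying on the
+# per-region defaults resolved in locals.tf, uncomment the overrides and set them
+# explicitly. The sample values below are the eu-north1 defaults from this change;
+# check which platforms, presets and fabrics are available in your region first.
+# cpu_nodes_platform = "cpu-e2"
+# cpu_nodes_preset   = "16vcpu-64gb"
+# gpu_nodes_platform = "gpu-h100-sxm"
+# gpu_nodes_preset   = "8gpu-128vcpu-1600gb"
+# infiniband_fabric  = "fabric-3"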
# Observability -enable_grafana = true # Enable or disable Grafana deployment with true or false -enable_prometheus = true # Enable or disable Prometheus deployment with true or false +enable_grafana = true # Enable or disable Grafana deployment with true or false +enable_prometheus = true # Enable or disable Prometheus deployment with true or false enable_loki = false # Enable or disable Loki deployment with true or false -enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false +enable_dcgm = true # Enable or disable NVIDIA DCGM Exporter Dashboard and Alerting deployment with true or false ## Loki # loki_access_key_id = "" # See the instruction in README.md on how to create this. Leave empty if you are not deploying Loki. diff --git a/k8s-training/variables.tf b/k8s-training/variables.tf index b17ba14b..65f6b71c 100644 --- a/k8s-training/variables.tf +++ b/k8s-training/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,12 @@ variable "subnet_id" { type = string } +variable "region" { + description = "The current region." + type = string +} + +# K8s cluster variable "k8s_version" { description = "Kubernetes version to be used in the cluster." type = string @@ -114,13 +120,13 @@ variable "cpu_nodes_count" { variable "cpu_nodes_platform" { description = "Platform for nodes in the CPU-only node group." type = string - default = "cpu-e2" + default = null } variable "cpu_nodes_preset" { description = "CPU and RAM configuration for nodes in the CPU-only node group." type = string - default = "16vcpu-64gb" + default = null } variable "cpu_disk_type" { @@ -145,13 +151,13 @@ variable "gpu_nodes_count" { variable "gpu_nodes_platform" { description = "Platform for nodes in the GPU node group." type = string - default = "gpu-h100-sxm" + default = null } variable "gpu_nodes_preset" { description = "Configuration for GPU amount, CPU, and RAM for nodes in the GPU node group." type = string - default = "8gpu-128vcpu-1600gb" + default = null } variable "gpu_disk_type" { @@ -169,7 +175,7 @@ variable "gpu_disk_size" { variable "infiniband_fabric" { description = "Infiniband's fabric name." type = string - default = "fabric-3" + default = null } variable "gpu_nodes_assign_public_ip" { diff --git a/modules/gluster-module/instances.tf b/modules/gluster-module/instances.tf index 3c3450ea..ea057e77 100644 --- a/modules/gluster-module/instances.tf +++ b/modules/gluster-module/instances.tf @@ -14,8 +14,8 @@ resource "nebius_compute_v1_instance" "gluster-fs-instance" { } ] resources = { - platform = "cpu-e2" - preset = "16vcpu-64gb" + platform = var.platform + preset = var.preset } boot_disk = { diff --git a/nfs-server/locals.tf b/nfs-server/locals.tf index ed79b470..cba34bbe 100644 --- a/nfs-server/locals.tf +++ b/nfs-server/locals.tf @@ -1,4 +1,20 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + cpu_nodes_platform = "cpu-d3" + cpu_nodes_preset = "16vcpu-64gb" + } + eu-north1 = { + cpu_nodes_platform = "cpu-e2" + cpu_nodes_preset = "16vcpu-64gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset) + cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform) } diff --git a/nfs-server/main.tf b/nfs-server/main.tf index 10d495ac..39085d14 100644 --- a/nfs-server/main.tf +++ b/nfs-server/main.tf @@ -9,4 +9,6 @@ module "nfs-module" { ssh_public_key = var.ssh_public_key.key nfs_ip_range = var.nfs_ip_range nfs_size = var.nfs_size + platform = local.cpu_nodes_platform + preset = local.cpu_nodes_preset } diff --git a/nfs-server/nfs.tfvars b/nfs-server/nfs.tfvars deleted file mode 100644 index 275cf081..00000000 --- a/nfs-server/nfs.tfvars +++ /dev/null @@ -1,8 +0,0 @@ -parent_id = "project-..." -subnet_id = "vpcsubnet-..." -ssh_user_name = "nfs" -ssh_public_key = { - key = "put your ssh key here" - # path = "or put path to ssh key here" -} -nfs_ip_range = "192.168.0.0/16" diff --git a/nfs-server/terraform.tfvars b/nfs-server/terraform.tfvars new file mode 100644 index 00000000..b9b7adf1 --- /dev/null +++ b/nfs-server/terraform.tfvars @@ -0,0 +1,9 @@ +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" +# } +nfs_ip_range = "192.168.0.0/16" diff --git a/nfs-server/variables.tf b/nfs-server/variables.tf index 369cefba..a83f100f 100644 --- a/nfs-server/variables.tf +++ b/nfs-server/variables.tf @@ -8,6 +8,23 @@ variable "subnet_id" { description = "ID of the subnet." } +variable "region" { + type = string + description = "Project region." +} + +variable "cpu_nodes_platform" { + description = "Platform for instances." + type = string + default = null +} + +variable "cpu_nodes_preset" { + description = "CPU and RAM configuration for instances." + type = string + default = null +} + variable "nfs_size" { type = number default = 93 * 1024 * 1024 * 1024 # size should be a multiple of 99857989632 diff --git a/slurm/locals.tf b/slurm/locals.tf index ed79b470..90fe2b56 100644 --- a/slurm/locals.tf +++ b/slurm/locals.tf @@ -1,4 +1,27 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) + + + regions_default = { + eu-west1 = { + master_platform = "cpu-d3" + master_preset = "16vcpu-64gb" + worker_platform = "gpu-h200-sxm" + worker_preset = "8gpu-128vcpu-1600gb" + } + eu-north1 = { + master_platform = "cpu-e2" + master_preset = "16vcpu-64gb" + worker_platform = "gpu-h100-sxm" + worker_preset = "8gpu-128vcpu-1600gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + master_platform = coalesce(var.master_platform, local.current_region_defaults.master_platform) + master_preset = coalesce(var.master_preset, local.current_region_defaults.master_preset) + worker_platform = coalesce(var.worker_platform, local.current_region_defaults.worker_platform) + worker_preset = coalesce(var.worker_preset, local.current_region_defaults.worker_preset) } diff --git a/slurm/nfs.tf b/slurm/nfs.tf index 552ab197..d0fe16c8 100644 --- a/slurm/nfs.tf +++ b/slurm/nfs.tf @@ -10,4 +10,6 @@ module "nfs-module" { ssh_public_key = local.ssh_public_key nfs_ip_range = "192.168.0.0/16" nfs_size = var.fs_size + platform = local.master_platform + preset = local.master_preset } diff --git a/slurm/slurm-master.tf b/slurm/slurm-master.tf index 26f3075b..dda58492 100644 --- a/slurm/slurm-master.tf +++ b/slurm/slurm-master.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "master" { name = "slurm-master" parent_id = var.parent_id resources = { - platform = "cpu-e2" - preset = "4vcpu-16gb" + platform = local.master_platform + preset = local.master_preset } boot_disk = { attach_mode = "READ_WRITE" diff --git a/slurm/slurm-worker.tf b/slurm/slurm-worker.tf index c1825c4f..2bbfc0a6 100644 --- a/slurm/slurm-worker.tf +++ b/slurm/slurm-worker.tf @@ -29,8 +29,8 @@ resource "nebius_compute_v1_instance" "worker" { name = each.key parent_id = var.parent_id resources = { - platform = "gpu-h100-sxm" - preset = "8gpu-128vcpu-1600gb" + platform = local.worker_platform + preset = local.worker_preset } gpu_cluster = nebius_compute_v1_gpu_cluster.gpu-cluster-slurm diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars index 1858faef..3c1ff8a4 100644 --- a/slurm/terraform.tfvars +++ b/slurm/terraform.tfvars @@ -1,9 +1,16 @@ -parent_id = "project-e00..." -subnet_id = "vpcsubnet-e00..." +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" +# } cluster_workers_count = 2 # amount of workers mysql_jobs_backend = false # Do you want to use mysql shared_fs_type = "filesystem" # "nfs" or "filesystem" -# ssh_public_key = { -# key = "put your public ssh key here" -# path = "put path to ssh key here" -# } + +# master_platform = +# master_preset = +# worker_platform = +# worker_preset = \ No newline at end of file diff --git a/slurm/variables.tf b/slurm/variables.tf index 565a04fc..f128c0d8 100644 --- a/slurm/variables.tf +++ b/slurm/variables.tf @@ -5,6 +5,11 @@ variable "subnet_id" { type = string } +variable "region" { + description = "Project region." + type = string +} + variable "ib_image_id" { type = string description = "ID of Infiniband image" @@ -29,12 +34,29 @@ variable "ssh_public_key" { } } -variable "platform_id" { +variable "master_platform" { + description = "Platform for Slurm Master." 
+ type = string + default = null +} + +variable "master_preset" { + description = "Preset for Slurm Master." type = string - description = "Platform for workers: gpu-h100-b for Inspur or gpu-h100 for Gigabyte" - default = "gpu-h100-b" + default = null } +variable "worker_platform" { + description = "Platform for Slurm Worker." + type = string + default = null +} + +variable "worker_preset" { + description = "Preset for Slurm Worker." + type = string + default = null +} variable "mysql_jobs_backend" { type = bool diff --git a/soperator/VERSION b/soperator/VERSION index 42cf0675..f2380cc7 100644 --- a/soperator/VERSION +++ b/soperator/VERSION @@ -1 +1 @@ -1.15.2 +1.15.3 diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf index a6d3993a..59c57207 100644 --- a/soperator/installations/example/main.tf +++ b/soperator/installations/example/main.tf @@ -255,6 +255,7 @@ module "slurm" { nccl_benchmark_enable = var.nccl_benchmark_enable nccl_benchmark_schedule = var.nccl_benchmark_schedule nccl_benchmark_min_threshold = var.nccl_benchmark_min_threshold + nccl_use_infiniband = var.nccl_use_infiniband telemetry_enabled = var.telemetry_enabled telemetry_grafana_admin_password = var.telemetry_grafana_admin_password diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index 408ac7b5..a740d787 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -168,7 +168,7 @@ slurm_cluster_name = "my-amazing-slurm" # Version of soperator. # --- -slurm_operator_version = "1.15.2" +slurm_operator_version = "1.15.3" # Type of the Slurm partition config. Could be either `default` or `custom`. # By default, "default". @@ -357,9 +357,14 @@ slurm_shared_memory_size_gibibytes = 256 # nccl_benchmark_enable = "0 */3 * * *" # Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable. -# By default, 420. +# By default, 45. # --- -# nccl_benchmark_min_threshold = 420 +# nccl_benchmark_min_threshold = 45 + +# Use infiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test. +# By default, true +# --- +# nccl_use_infiniband = true # endregion NCCL benchmark diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf index b6fa95f5..b1a9a480 100644 --- a/soperator/installations/example/variables.tf +++ b/soperator/installations/example/variables.tf @@ -504,10 +504,16 @@ variable "nccl_benchmark_schedule" { variable "nccl_benchmark_min_threshold" { description = "Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable." type = number - default = 420 + default = 45 } -# region NCCL benchmark +variable "nccl_use_infiniband" { + description = "Use infiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test." 
+ type = bool + default = true +} + +# endregion NCCL benchmark # region Telemetry diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile b/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile index 9d1f22ab..11377040 100644 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile @@ -121,6 +121,21 @@ RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \ RUN pip install huggingface_hub==0.23.2 RUN pip install -v "transformers<=4.40.2" +## Reinstall NCCL to the latest version +#RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +#RUN dpkg -i cuda-keyring_1.1-1_all.deb +#RUN apt-get update +#RUN apt install libnccl2=2.23.4-1+cuda12.4 libnccl-dev=2.23.4-1+cuda12.4 + +## Install NCCL profiler plugin +#RUN git clone https://github.com/NVIDIA/nccl && \ +# cd nccl && \ +# git checkout v2.23.4-1 && \ +# cd ext-profiler/example && \ +# make && \ +# cp libnccl-profiler.so /usr/lib/x86_64-linux-gnu/ + + # Benchmark code WORKDIR /workspace/llm diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION b/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION index 9c38d380..e2c2ff71 100644 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION @@ -1 +1 @@ -4.0-16 +4.0-20 diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh new file mode 100644 index 00000000..fb48cf75 --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# DL params +export DGXNNODES="${DGXNNODES:=64}" # NODEx64 +export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=2}" # TPx2 (training.model.tensor_model_parallel_size) +export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size) +export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4 +export MINIBS="${MINIBS:=128}" # MINBSx128 +export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}" # MICBSx2 + +# Check DL params +# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0 +# This simplifies to MINIBS % PP == 0 +if [[ $(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then + echo "MINIBS should be divisble by PP" + exit 1 +fi + + + +# Slurm resource allocation +export SBATCH_GPUS_PER_NODE="8" +export SBATCH_MEM_PER_NODE="1200G" +export SBATCH_TRES_PER_TASK="cpu=16" +export SBATCH_DISTRIBUTION="block:block:block" +export SLURM_CPU_BIND="verbose,none" +#export EXCLUSIVE=1 + +# Use bindpcie CPU pinning +export ENABLE_CPU_EXCLUSIVE=1 +export ENABLE_IB_BINDING=1 + + + + +# Job time limit +export WALLTIME_MINUTES=1200 +export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) )) + + + +# Use of userbuffer backend to overlap tensor-parallel communications with computes (training.model.ub_tp_comm_overlap). +export TP_COMM_OVERLAP=True + +# Execute of nvidia-smi boost-slider --vboost +export VBOOST_VALUE=1 + +# Set MaxQ and MinEDP clocks +export SET_MAXQ_CLK=0 +export MAXQ_CLK="" +export SET_MINEDP_CLK=0 +export MINEDP_CLK="" + +# Set power limit +export SET_POWER_CAP=0 +export POWER_CAP="" + +# Use CPU offloading (activations & weights). 
+export CPU_OFFLOADING=False + +# Load the minimal number of samples +export LOAD_MINIMAL_NUM_SAMPLES=0 + +# Load distributed checkpoint directly on GPU +export LOAD_DIRECTLY_ON_DEVICE=0 + + + +# Extract system name +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) + + + +# Configure mlperf SYSJSON logging +export MLPERF_SUBMITTER="Nebius" +export MLPERF_SYSTEM_NAME="${DGXSYSTEM}" +export MLPERF_STATUS="cloud" + + + +# Apply common settings +source $(dirname ${BASH_SOURCE[0]})/config_common.sh + +# Apply FP8 settings +source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh + diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh new file mode 120000 index 00000000..bbe6159f --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh @@ -0,0 +1 @@ +config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh \ No newline at end of file diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh new file mode 100644 index 00000000..b006bdd6 --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# DL params +export DGXNNODES="${DGXNNODES:=8}" # NODEx8 +export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=4}" # TPx4 (training.model.tensor_model_parallel_size) +export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size) +export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4 +export MINIBS="${MINIBS:=128}" # MINBSx128 +export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}" # MICBSx2 + +# Check DL params +# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0 +# This simplifies to MINIBS % PP == 0 +if [[ $(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then + echo "MINIBS should be divisble by PP" + exit 1 +fi + + + +# Slurm resource allocation +export SBATCH_GPUS_PER_NODE="8" +export SBATCH_MEM_PER_NODE="1200G" +export SBATCH_TRES_PER_TASK="cpu=16" +export SBATCH_DISTRIBUTION="block:block:block" +export SLURM_CPU_BIND="verbose,none" +#export EXCLUSIVE=1 + +# Use bindpcie CPU pinning +export ENABLE_CPU_EXCLUSIVE=1 +export ENABLE_IB_BINDING=1 + + + + +# Job time limit +export WALLTIME_MINUTES=1200 +export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) )) + + + +# Use of userbuffer backend to overlap tensor-parallel communications with computes (training.model.ub_tp_comm_overlap). +export TP_COMM_OVERLAP=True + +# Execute of nvidia-smi boost-slider --vboost +export VBOOST_VALUE=1 + +# Set MaxQ and MinEDP clocks +export SET_MAXQ_CLK=0 +export MAXQ_CLK="" +export SET_MINEDP_CLK=0 +export MINEDP_CLK="" + +# Set power limit +export SET_POWER_CAP=0 +export POWER_CAP="" + +# Use CPU offloading (activations & weights). 
+export CPU_OFFLOADING=False + +# Load the minimal number of samples +export LOAD_MINIMAL_NUM_SAMPLES=0 + +# Load distributed checkpoint directly on GPU +export LOAD_DIRECTLY_ON_DEVICE=0 + + + +# Extract system name +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) + + + +# Configure mlperf SYSJSON logging +export MLPERF_SUBMITTER="Nebius" +export MLPERF_SYSTEM_NAME="${DGXSYSTEM}" +export MLPERF_STATUS="cloud" + + + +# Apply common settings +source $(dirname ${BASH_SOURCE[0]})/config_common.sh + +# Apply FP8 settings +source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh + diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh new file mode 100644 index 00000000..c8109ba5 --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# DL params +export DGXNNODES="${DGXNNODES:=8}" # NODEx8 +export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=8}" # TPx8 (training.model.tensor_model_parallel_size) +export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size) +export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4 +export MINIBS="${MINIBS:=3072}" # MINBSx3072 +export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=1}" # MICBSx1 + +# Check DL params +# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0 +# This simplifies to MINIBS % PP == 0 +if [[ $(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then + echo "MINIBS should be divisble by PP" + exit 1 +fi + + + +# Slurm resource allocation +export SBATCH_GPUS_PER_NODE="8" +export SBATCH_MEM_PER_NODE="1200G" +export SBATCH_TRES_PER_TASK="cpu=16" +export SBATCH_DISTRIBUTION="block:block:block" +export SLURM_CPU_BIND="verbose,none" +#export EXCLUSIVE=1 + +# Use bindpcie CPU pinning +export ENABLE_CPU_EXCLUSIVE=1 +export ENABLE_IB_BINDING=1 + + + + +# Job time limit +export WALLTIME_MINUTES=1200 +export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) )) + + + +# Use of userbuffer backend to overlap tensor-parallel communications with computes (training.model.ub_tp_comm_overlap). +export TP_COMM_OVERLAP=True + +# Execute of nvidia-smi boost-slider --vboost +export VBOOST_VALUE=1 + +# Set MaxQ and MinEDP clocks +export SET_MAXQ_CLK=0 +export MAXQ_CLK="" +export SET_MINEDP_CLK=0 +export MINEDP_CLK="" + +# Set power limit +export SET_POWER_CAP=0 +export POWER_CAP="" + +# Use CPU offloading (activations & weights). 
+export CPU_OFFLOADING=False + +# Load the minimal number of samples +export LOAD_MINIMAL_NUM_SAMPLES=0 + +# Load distributed checkpoint directly on GPU +export LOAD_DIRECTLY_ON_DEVICE=0 + + + +# Extract system name +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) + + + +# Configure mlperf SYSJSON logging +export MLPERF_SUBMITTER="Nebius" +export MLPERF_SYSTEM_NAME="${DGXSYSTEM}" +export MLPERF_STATUS="cloud" + + + +# Apply common settings +source $(dirname ${BASH_SOURCE[0]})/config_common.sh + +# Apply FP8 settings +source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh + diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh new file mode 120000 index 00000000..4fef3305 --- /dev/null +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh @@ -0,0 +1 @@ +config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh \ No newline at end of file diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub b/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub index 1c3645ef..3c0b80e1 100755 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub @@ -236,6 +236,7 @@ if [ -n "${CONTAINER_PRELOAD_SHARED_PATH}" ]; then CONT_FILE="${CONTAINER_PRELOAD_SHARED_PATH}/containers/${SLURM_JOBID}_$(basename ${CONT}).squashfs" mkdir -p "${CONTAINER_PRELOAD_SHARED_PATH}/containers" # Prepull container image to the shared filesystem + mkdir -p "${CONTAINER_PRELOAD_SHARED_PATH}/containers" srun --ntasks=1 enroot import --output ${CONT_FILE} docker://${CONT} else CONT_FILE=${CONT} diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh index ddf4bc19..550c5762 100755 --- a/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh +++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh @@ -6,7 +6,7 @@ usage() { echo "usage: ${0} -N [-w ] [-c ]" >&2 echo " [-e ]" >&2 echo " [-i ] [-D ] [-C ] [-R ] [-S ]" >&2 - echo " [-q (quick_start)] [-r (remove_prev_logs)] [-d (debug)] [-h (help)]" >&2 + echo " [-q (quick_start)] [-r (remove_prev_logs)] [-d (debug)] [-p (nsys_profiling)] [-h (help)]" >&2 exit 1 } @@ -16,7 +16,7 @@ dataset_dir="/mlperf-data/gpt3-dataset-4.0" checkpoint_dir="/mlperf-data/gpt3-checkpoint-4.0" result_dir="./result" -while getopts N:w:c:e:i:D:C:R:S:qrdh flag +while getopts N:w:c:e:i:D:C:R:S:qrdph flag do case "${flag}" in N) nodes=${OPTARG};; @@ -31,6 +31,7 @@ do q) quick_start=1;; r) rmlogs=1;; d) debug=1;; + p) nsys_profiling=1;; h) usage;; *) usage;; esac @@ -116,6 +117,18 @@ if [[ $debug -eq 1 ]]; then export GDRCOPY_LOG_LEVEL=1 fi +if [[ $nsys_profiling -eq 1 ]]; then + # Configure NSYS profiler + export NVTX_FLAG=1 + export PROFILE=True + export PROFILE_START_STEP=10 + export PROFILE_END_STEP=11 + export PROFILE_RANKS="0,1,2,3,4,5,6,7" + + # Early stopping: + export TARGET_LOG_PPL=2.75 +fi + if [ -z "${experiment}" ]; then job_name="gpt3" job_output="gpt3-%j.out" @@ -124,14 +137,18 @@ else job_output="gpt3-%j-${experiment}.out" fi +node_allocation="--nodes=${nodes}" +if [ -n "${nodelist}" ]; then + node_allocation="--nodelist='${nodelist}'" +fi + echo "Submit Slurm job" sbatch \ -t $WALLTIME \ -J "${job_name}" \ --output="${job_output}" \ --export=ALL \ - --nodes="${nodes}" \ - --nodelist="${nodelist}" \ + ${node_allocation} \ --ntasks-per-node="${SBATCH_GPUS_PER_NODE}" \ ${EXCLUSIVE:+--exclusive} \ run.sub diff --git a/soperator/modules/slurm/main.tf 
b/soperator/modules/slurm/main.tf index 02670d78..d52d99e6 100644 --- a/soperator/modules/slurm/main.tf +++ b/soperator/modules/slurm/main.tf @@ -163,9 +163,10 @@ resource "helm_release" "slurm_cluster" { nccl_topology_type = var.nccl_topology_type nccl_benchmark = { - enable = var.nccl_benchmark_enable - schedule = var.nccl_benchmark_schedule - min_threshold = var.nccl_benchmark_min_threshold + enable = var.nccl_benchmark_enable + schedule = var.nccl_benchmark_schedule + min_threshold = var.nccl_benchmark_min_threshold + use_infiniband = var.nccl_use_infiniband } nodes = { diff --git a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl index 66457052..c420d949 100644 --- a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl @@ -128,6 +128,7 @@ periodicChecks: schedule: "${nccl_benchmark.schedule}" ncclArguments: thresholdMoreThan: ${nccl_benchmark.min_threshold} + useInfiniband: ${nccl_benchmark.use_infiniband} slurmNodes: accounting: diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf index 2eef573b..c9db7d15 100644 --- a/soperator/modules/slurm/variables.tf +++ b/soperator/modules/slurm/variables.tf @@ -206,7 +206,13 @@ variable "nccl_benchmark_schedule" { variable "nccl_benchmark_min_threshold" { description = "Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable." type = number - default = 420 + default = 45 +} + +variable "nccl_use_infiniband" { + description = "Use infiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test." + type = bool + default = true } # endregion NCCL benchmark diff --git a/wireguard/locals.tf b/wireguard/locals.tf index ed79b470..8c7a63f2 100644 --- a/wireguard/locals.tf +++ b/wireguard/locals.tf @@ -1,4 +1,21 @@ locals { ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) + + regions_default = { + eu-west1 = { + platform = "cpu-d3" + preset = "16vcpu-64gb" + } + eu-north1 = { + platform = "cpu-e2" + preset = "16vcpu-64gb" + } + } + + current_region_defaults = local.regions_default[var.region] + + platform = coalesce(var.platform, local.current_region_defaults.platform) + preset = coalesce(var.preset, local.current_region_defaults.preset) + } diff --git a/wireguard/main.tf b/wireguard/main.tf index 4bc7ba4d..9b3abc99 100644 --- a/wireguard/main.tf +++ b/wireguard/main.tf @@ -19,8 +19,8 @@ resource "nebius_compute_v1_instance" "wireguard_instance" { ] resources = { - platform = "cpu-e2" - preset = "4vcpu-16gb" + platform = local.platform + preset = local.preset } diff --git a/wireguard/terraform.tfvars b/wireguard/terraform.tfvars index 38986d48..79c25e96 100644 --- a/wireguard/terraform.tfvars +++ b/wireguard/terraform.tfvars @@ -1,8 +1,9 @@ -# parent_id = "" -# subnet_id = "" -# ssh_user_name = "ubuntu" +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "" # Project region +# ssh_user_name = "" # Username you want to use to connect to the nodes # ssh_public_key = { -# key = "put your public ssh key here" -# path = "put path to ssh key here" +# key = "put your public ssh key here" OR +# path = "put path to ssh key here" # } -# public_ip_allocation_id = "" +# public_ip_allocation_id = "" \ No newline at end of file diff --git a/wireguard/variables.tf b/wireguard/variables.tf index 8d14ea3b..1b0d7c9b 100644 --- a/wireguard/variables.tf +++ b/wireguard/variables.tf @@ -1,4 +1,4 @@ -# K8s cluster +# Global parameters variable "parent_id" { description = "Project ID." type = string @@ -9,6 +9,25 @@ variable "subnet_id" { type = string } +variable "region" { + description = "Project region." + type = string +} + + +# Platform +variable "platform" { + description = "Platform for WireGuard host." + type = string + default = null +} + +variable "preset" { + description = "Preset for WireGuard host." + type = string + default = null +} + # SSH access variable "ssh_user_name" { description = "SSH username."